diff --git a/tutorials/multimodal/omni-fuse-data-curation/.env.example b/tutorials/multimodal/omni-fuse-data-curation/.env.example
new file mode 100644
index 0000000000..f63f05150f
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/.env.example
@@ -0,0 +1,14 @@
+# Copy this file to .env and fill in real values. Do not commit .env.
+
+# Required for the API-first hybrid tutorial path.
+NV_BUILD_API_KEY=
+
+# Defaults to NVIDIA Build's OpenAI-compatible API base URL.
+NVIDIA_API_BASE_URL=https://integrate.api.nvidia.com/v1
+
+# Optional if you use Hugging Face gated/local model paths outside the API backend.
+HF_TOKEN=
+
+# Required for the LanguageBind fusion expert unless LanguageBind is cloned into
+# ./third_party/LanguageBind.
+LANGUAGEBIND_ROOT=
diff --git a/tutorials/multimodal/omni-fuse-data-curation/.python-version b/tutorials/multimodal/omni-fuse-data-curation/.python-version
new file mode 100644
index 0000000000..7eebfafa04
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/.python-version
@@ -0,0 +1 @@
+3.12.11
diff --git a/tutorials/multimodal/omni-fuse-data-curation/.rayignore b/tutorials/multimodal/omni-fuse-data-curation/.rayignore
new file mode 100644
index 0000000000..df18d9aaf1
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/.rayignore
@@ -0,0 +1,12 @@
+.git/
+.venv/
+__pycache__/
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+datasets/
+model_files/
+outputs/
+tutorials/multimodal/omni-fuse-data-curation/outputs/
+tutorials/multimodal/omni-fuse-data-curation/tmp/
+*.egg-info/
diff --git a/tutorials/multimodal/omni-fuse-data-curation/0_validate_inputs.py b/tutorials/multimodal/omni-fuse-data-curation/0_validate_inputs.py
new file mode 100644
index 0000000000..4532c4878c
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/0_validate_inputs.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Validate config, data manifests, API keys, and local model assets."""
+
+from __future__ import annotations
+
+from utils import config_parser, load_tutorial_config, print_outputs, validate_inputs
+
+
+def main() -> int:  # CLI entry point; the result is handed to SystemExit below
+    parser = config_parser(__doc__ or "")  # falls back to "" when the module docstring is None
+    args = parser.parse_args()
+    config = load_tutorial_config(args.config)
+    print_outputs(validate_inputs(config))  # report whatever validate_inputs returns
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/tutorials/multimodal/omni-fuse-data-curation/1_sns.py b/tutorials/multimodal/omni-fuse-data-curation/1_sns.py
new file mode 100644
index 0000000000..75694ad5a0
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/1_sns.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run Symmetric Nucleus Subsampling over paired multimodal records."""
+
+from __future__ import annotations
+
+from utils import config_parser, load_tutorial_config, print_outputs, run_sns
+
+
+def main() -> int:  # CLI entry point; the result is handed to SystemExit below
+    parser = config_parser(__doc__ or "")  # falls back to "" when the module docstring is None
+    args = parser.parse_args()
+    config = load_tutorial_config(args.config)
+    task = run_sns(config)
+    metadata = dict(getattr(task, "_metadata", {}) or {})  # copy; tolerates a missing or None _metadata
+    print_outputs(
+        {
+            "run_dir": str(config.run_dir),
+            "sns_manifest_path": metadata.get("sns_manifest_path"),  # None when the key is absent
+            "sns_records_path": metadata.get("sns_records_path"),
+        }
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/tutorials/multimodal/omni-fuse-data-curation/2_embed.py b/tutorials/multimodal/omni-fuse-data-curation/2_embed.py
new file mode 100644
index 0000000000..ab6c0e632f
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/2_embed.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run the Expert Embedding Engine for SNS-refined records."""
+
+from __future__ import annotations
+
+from utils import config_parser, load_tutorial_config, print_outputs, run_eee
+
+
+def main() -> int:  # CLI entry point; the result is handed to SystemExit below
+    parser = config_parser(__doc__ or "")  # falls back to "" when the module docstring is None
+    args = parser.parse_args()
+    config = load_tutorial_config(args.config)
+    task = run_eee(config)
+    metadata = dict(getattr(task, "_metadata", {}) or {})  # copy; tolerates a missing or None _metadata
+    print_outputs(
+        {
+            "run_dir": str(config.run_dir),
+            "embedding_metadata_path": metadata.get("embedding_metadata_path"),  # None when the key is absent
+            "embedding_records_path": metadata.get("embedding_records_path"),
+        }
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/tutorials/multimodal/omni-fuse-data-curation/3_project.py b/tutorials/multimodal/omni-fuse-data-curation/3_project.py
new file mode 100644
index 0000000000..da34b1c3df
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/3_project.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Train the Omni-Fuse projection network and project raw embeddings."""
+
+from __future__ import annotations
+
+from utils import config_parser, load_tutorial_config, print_outputs, run_projection
+
+
+def main() -> int:  # CLI entry point; the result is handed to SystemExit below
+    parser = config_parser(__doc__ or "")  # falls back to "" when the module docstring is None
+    args = parser.parse_args()
+    config = load_tutorial_config(args.config)
+    task = run_projection(config)
+    metadata = dict(getattr(task, "_metadata", {}) or {})  # copy; tolerates a missing or None _metadata
+    print_outputs(
+        {
+            "run_dir": str(config.run_dir),
+            "projection_model_path": metadata.get("projection_model_path"),  # None when the key is absent
+            "projection_metrics_path": metadata.get("projection_metrics_path"),
+            "projected_embeddings_path": metadata.get("projected_embeddings_path"),
+            "projection_records_path": metadata.get("projection_records_path"),
+        }
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/tutorials/multimodal/omni-fuse-data-curation/4_datablend.py b/tutorials/multimodal/omni-fuse-data-curation/4_datablend.py
new file mode 100644
index 0000000000..ffff81e053
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/4_datablend.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Rank SNS-refined records into a query-conditioned datablend."""
+
+from __future__ import annotations
+
+from utils import config_parser, load_tutorial_config, print_outputs, run_datablend
+
+
+def main() -> int:  # CLI entry point; the result is handed to SystemExit below
+    parser = config_parser(__doc__ or "")  # falls back to "" when the module docstring is None
+    args = parser.parse_args()
+    config = load_tutorial_config(args.config)
+    task = run_datablend(config)
+    metadata = dict(getattr(task, "_metadata", {}) or {})  # copy; tolerates a missing or None _metadata
+    print_outputs(
+        {
+            "run_dir": str(config.run_dir),
+            "datablend_ranked_path": metadata.get("datablend_ranked_path"),  # None when the key is absent
+            "datablend_topk_path": metadata.get("datablend_topk_path"),
+            "datablend_size": metadata.get("datablend_size"),
+        }
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())  # main()'s return value becomes the exit status
diff --git a/tutorials/multimodal/omni-fuse-data-curation/README.md b/tutorials/multimodal/omni-fuse-data-curation/README.md
new file mode 100644
index 0000000000..44e575629a
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/README.md
@@ -0,0 +1,214 @@
+# Omni-Fuse Data Curation
+
+Omni-Fuse ([paper](https://arxiv.org/pdf/2605.01163v1)) curates paired multimodal datasets by improving pair alignment and
+then ranking the resulting records for a target data blend. This tutorial uses
+NeMo Curator task/stage abstractions to implement the curation pipeline:
+
+1. Validate paired data manifests and model assets.
+2. Apply Symmetric Nucleus Subsampling (SNS).
+3. Run the Expert Embedding Engine (EEE).
+4. Train/apply the projection network.
+5. Export a query-ranked datablend.
+
+The tutorial is API-first hybrid. It uses NVIDIA API models where hosted
+endpoints preserve the intended Omni-Fuse role, and local models where the
+current implementation needs local model execution.
+
+## Setup
+
+Install the tutorial dependencies from the tutorial directory:
+
+```bash
+cd tutorials/multimodal/omni-fuse-data-curation/
+uv sync --extra dev
+```
+
+You will need to do the following before you're able to run the tutorial:
+- Ensure `ffmpeg` is installed and added to `PATH`.
+- Log in to Hugging Face using `hf auth login`.
+- Copy `.env.example` to `.env` in this tutorial directory.
+- Create an API key at `build.nvidia.com` and set the `NV_BUILD_API_KEY` variable in the `.env` file.
+- Clone the [LanguageBind](https://github.com/pku-yuangroup/languagebind) repository. Either clone it to `third_party/` or set the `LANGUAGEBIND_ROOT` variable in the `.env` file.
+- Download the pre-trained weights for the CG-DETR model from [Lighthouse](https://github.com/line/lighthouse#pre-trained-weights) and save them to `model_files/best.ckpt`. We use `cg_detr/qvhighlight/clip/best.ckpt`.
+- We recommend using GPUs, because the tutorial runs several models locally.
+- Set the paths to the datasets you want to use in `configs/omni_fuse_hybrid.yaml` and change other settings as you see fit.
+
+### Data Layout
+
+This tutorial is bring-your-own data. Each pool contains raw files, text
+annotations, and a `pair_mapping.jsonl` file:
+
+```text
+my_pool/
+  raw/
+  annotations/
+  pair_mapping.jsonl
+```
+
+Each mapping row must contain a raw path and either an annotation path or inline
+annotation text:
+
+```json
+{"id": "sample-1", "data_path": "raw/sample.jpg", "annotation_path": "annotations/sample.txt"}
+{"id": "sample-2", "data_path": "raw/sample.wav", "annotation": "A person speaks over background music."}
+```
+
+Supported raw modalities are `text`, `image`, `audio`, and `video`. Configure
+each pool in `configs/omni_fuse_hybrid.yaml`:
+
+```yaml
+data_pools:
+  - name: "image_caption_pool"
+    modality: "image"
+    root_dir: "/path/to/image_pool"
+    mapping_file: "pair_mapping.jsonl"
+    n_samples: 1
+```
+
+Use small `n_samples` values while validating the tutorial.
+
+### Model Backends
+
+The default config uses `sns.backend: hybrid` and `eee.backend: hybrid`.
+If you wish to use strictly API-based or strictly local models, you can change these to `api` or `local`. However, this won't work out of the box; you will have to modify the code to fit your requirements.
+
+API-backed components:
+
+- Modality descriptions for backward SNS and the text-based EEE expert:
+  - `nvidia/nemotron-nano-12b-v2-vl` for text, image, and video.
+  - `google/gemma-3n-e4b-it` for audio.
+- Text embeddings:
+  - `nvidia/llama-nemotron-embed-1b-v2`.
+
+Local components:
+
+- SNS multimodal similarity and MI gating:
+  - `nvidia/omni-embed-nemotron-3b`.
+- SNS image forward extraction:
+  - `IDEA-Research/grounding-dino-tiny`.
+- SNS audio forward extraction:
+  - `lighthouse-emnlp2024/AM-DETR`.
+- SNS video forward extraction:
+  - CG-DETR from Lighthouse with `model_files/best.ckpt`.
+- EEE fusion expert:
+  - LanguageBind.
+- EEE end-to-end expert:
+  - `nvidia/omni-embed-nemotron-3b`.
+
+
+
+## Step 0: Validate Inputs
+
+```bash
+python 0_validate_inputs.py --config configs/omni_fuse_hybrid.yaml
+```
+
+This checks the data manifests, API key availability, LanguageBind checkout,
+and CG-DETR checkpoint path.
+
+## Step 1: Symmetric Nucleus Subsampling
+
+```bash
+python 1_sns.py --config configs/omni_fuse_hybrid.yaml
+```
+
+SNS writes:
+
+```text
+outputs/<experiment_id>/sns/manifest.jsonl
+outputs/<experiment_id>/sns/records.jsonl
+```
+
+In hybrid mode, backward extraction uses API descriptions and API text
+embeddings. Forward extraction for image/audio/video uses local
+Grounding-DINO/AM-DETR/CG-DETR and local Omni-Embed MI gating.
+
+## Step 2: Expert Embeddings
+
+```bash
+python 2_embed.py --config configs/omni_fuse_hybrid.yaml
+```
+
+EEE writes interleaved, raw, and annotation embeddings for each expert:
+
+```text
+outputs/<experiment_id>/embeddings/text_based_*.npy
+outputs/<experiment_id>/embeddings/fusion_*.npy
+outputs/<experiment_id>/embeddings/e2e_*.npy
+outputs/<experiment_id>/embeddings/metadata.json
+outputs/<experiment_id>/embeddings/records.jsonl
+```
+
+The text-based expert uses NVIDIA API descriptions and text embeddings. The
+fusion and e2e experts use LanguageBind and Omni-Embed locally.
+
+## Step 3: Projection
+
+```bash
+python 3_project.py --config configs/omni_fuse_hybrid.yaml
+```
+
+The projection stage trains a small MLP over concatenated expert embeddings
+using contrastive, cluster-bias, and scale-bias losses. It writes:
+
+```text
+outputs/<experiment_id>/projection/model.json
+outputs/<experiment_id>/projection/loss_history.json
+outputs/<experiment_id>/projection/metrics.json
+outputs/<experiment_id>/projection/projected_embeddings.npy
+outputs/<experiment_id>/projection/annotation_embeddings.npy
+```
+
+## Step 4: Datablend Ranking
+
+```bash
+python 4_datablend.py --config configs/omni_fuse_hybrid.yaml
+```
+
+The datablend stage embeds the query through the text-based expert and ranks
+projected records by cosine similarity:
+
+```text
+outputs/<experiment_id>/datablend/datablend_ranked.jsonl
+outputs/<experiment_id>/datablend/datablend_topk.jsonl
+```
+
+## End-to-End Script
+
+Run every step in order:
+
+```bash
+CONFIG=configs/omni_fuse_hybrid.yaml bash e2e.sh
+```
+
+Set `PYTHON_BIN` if you want to use a specific interpreter:
+
+```bash
+PYTHON_BIN="uv run python" CONFIG=configs/omni_fuse_hybrid.yaml bash e2e.sh
+```
+
+## Output Layout
+
+```text
+outputs/<experiment_id>/
+  config.resolved.json
+  sns/
+    manifest.jsonl
+    records.jsonl
+    media/
+  embeddings/
+    metadata.json
+    records.jsonl
+    *_interleaved.npy
+    *_raw.npy
+    *_annotation.npy
+  projection/
+    model.json
+    loss_history.json
+    metrics.json
+    projected_embeddings.npy
+    annotation_embeddings.npy
+  datablend/
+    datablend_ranked.jsonl
+    datablend_topk.jsonl
+```
diff --git a/tutorials/multimodal/omni-fuse-data-curation/configs/omni_fuse_hybrid.yaml b/tutorials/multimodal/omni-fuse-data-curation/configs/omni_fuse_hybrid.yaml
new file mode 100644
index 0000000000..7cbf9d59c9
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/configs/omni_fuse_hybrid.yaml
@@ -0,0 +1,76 @@
+experiment_id: "omni-fuse-hybrid-tutorial"
+description: "API-first hybrid Omni-Fuse multimodal data curation tutorial"
+output_dir: "../outputs"  # a relative path is resolved against this file's directory by config/loader.py
+
+data_pools:
+  - name: "image_caption_pool"
+    modality: "image"
+    root_dir: "/path/to/image_pool"
+    mapping_file: "pair_mapping.jsonl"
+    n_samples: 1
+  - name: "audio_caption_pool"
+    modality: "audio"
+    root_dir: "/path/to/audio_pool"
+    mapping_file: "pair_mapping.jsonl"
+    n_samples: 1
+  - name: "video_instruction_pool"
+    modality: "video"
+    root_dir: "/path/to/video_pool"
+    mapping_file: "pair_mapping.jsonl"
+    n_samples: 1
+  - name: "text_instruction_pool"
+    modality: "text"
+    root_dir: "/path/to/text_pool"
+    mapping_file: "pair_mapping.jsonl"
+    n_samples: 1
+
+sns:
+  enabled: true
+  backend: "hybrid"
+  direction: "bidirectional"
+  mi_ratio: 0.75
+  mi_eps: 0.05
+  tau_forward_text: 0.05
+  tau_forward_image: 0.10
+  tau_forward_audio: 0.10
+  tau_forward_video: 0.10
+  tau_backward: 0.10
+  nvidia_model: "nvidia/omni-embed-nemotron-3b"
+  grounding_dino_model_id: "IDEA-Research/grounding-dino-tiny"
+  amdetr_repo_id: "lighthouse-emnlp2024/AM-DETR"
+  cg_detr_checkpoint: "../model_files/best.ckpt"  # a relative path is resolved against this file's directory
+  require_forward_models: true
+  use_ann_components: true
+
+eee:
+  backend: "hybrid"
+  experts: ["text-based", "fusion", "e2e"]
+  embedding_dim: 2048
+  batch_size: 1
+  nvidia_api_base_url: "https://integrate.api.nvidia.com/v1"
+  nvidia_text_describer_model: "nvidia/nemotron-nano-12b-v2-vl"
+  nvidia_image_describer_model: "nvidia/nemotron-nano-12b-v2-vl"
+  nvidia_video_describer_model: "nvidia/nemotron-nano-12b-v2-vl"
+  nvidia_audio_describer_model: "google/gemma-3n-e4b-it"
+  nvidia_embedding_model: "nvidia/llama-nemotron-embed-1b-v2"
+  nvidia_multimodal_model: "nvidia/omni-embed-nemotron-3b"
+
+projection:
+  enabled: true
+  backend: "torch"
+  num_epochs: 1
+  batch_size: 4
+  hidden_layer_size: 256
+  contrastive_loss_weight: 0.99
+  bias_loss_weight: 0.01
+  scale_loss_weight: 0.0001
+
+datablend:
+  query: "high quality multimodal instruction tuning data"
+  top_k: 4
+  include_metadata: true
+
+runtime:
+  device: "auto"
+  dtype: "float32"
+  offline_mode: false
diff --git a/tutorials/multimodal/omni-fuse-data-curation/e2e.sh b/tutorials/multimodal/omni-fuse-data-curation/e2e.sh
new file mode 100755
index 0000000000..5fab4f7e19
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/e2e.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -euo pipefail
+
+TUTORIAL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CONFIG="${CONFIG:-${TUTORIAL_DIR}/configs/omni_fuse_hybrid.yaml}"
+PYTHON_BIN="${PYTHON_BIN:-python}"
+read -r -a PYTHON_CMD <<< "${PYTHON_BIN}"
+
+"${PYTHON_CMD[@]}" "${TUTORIAL_DIR}/0_validate_inputs.py" --config "${CONFIG}"
+"${PYTHON_CMD[@]}" "${TUTORIAL_DIR}/1_sns.py" --config "${CONFIG}"
+"${PYTHON_CMD[@]}" "${TUTORIAL_DIR}/2_embed.py" --config "${CONFIG}"
+"${PYTHON_CMD[@]}" "${TUTORIAL_DIR}/3_project.py" --config "${CONFIG}"
+"${PYTHON_CMD[@]}" "${TUTORIAL_DIR}/4_datablend.py" --config "${CONFIG}"
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/__init__.py
new file mode 100644
index 0000000000..0c320a276a
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""NeMo Curator rewrite of the Omni-Fuse data curation pipeline."""
+
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.pipeline import build_pipeline
+
+__all__ = ["ExperimentConfig", "build_pipeline"]
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/compat/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/compat/__init__.py
new file mode 100644
index 0000000000..32460e672a
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/compat/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Compatibility helpers for optional NeMo Curator imports."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/compat/curator.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/compat/curator.py
new file mode 100644
index 0000000000..af48309ff0
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/compat/curator.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Small compatibility layer around NeMo Curator APIs."""
+
+from __future__ import annotations
+
+import copy
+import inspect
+from typing import Any
+
+import pandas as pd
+from nemo_curator.pipeline import Pipeline as CuratorPipeline
+from nemo_curator.tasks import DocumentBatch, EmptyTask
+
+
+def records_from_task(task: Any) -> list[dict[str, Any]]:
+    """Return ``task.data`` as a list of dicts; raise TypeError for unsupported data types."""
+
+    data = task.data
+    if hasattr(data, "to_dict"):  # e.g. the pandas DataFrame built by make_document_batch
+        try:
+            return [dict(row) for row in data.to_dict(orient="records")]
+        except TypeError:  # retry positionally if the `orient` keyword is rejected
+            return [dict(row) for row in data.to_dict("records")]
+    if isinstance(data, list):
+        return [dict(row) for row in data]  # build a new dict per row
+    raise TypeError(f"Unsupported task data type: {type(data)!r}")
+
+
+def make_document_batch(
+    task_id: str,
+    dataset_name: str,
+    records: list[dict[str, Any]],
+    metadata: dict[str, Any] | None = None,
+    stage_perf: list[Any] | None = None,
+) -> DocumentBatch:
+    """Construct a NeMo Curator DocumentBatch whose data is a DataFrame built from ``records``."""
+
+    kwargs = {
+        "dataset_name": dataset_name,
+        "data": pd.DataFrame.from_records(records),
+        "_metadata": metadata or {},  # None (or an empty dict) becomes a fresh empty dict
+        "_stage_perf": stage_perf or [],  # None (or an empty list) becomes a fresh empty list
+    }
+    if "task_id" in inspect.signature(DocumentBatch).parameters:  # pass task_id only if the constructor accepts it
+        kwargs["task_id"] = task_id
+
+    return DocumentBatch(**kwargs)
+
+
+def make_empty_task() -> EmptyTask:  # builds or copies an empty task, depending on what EmptyTask is
+    if callable(EmptyTask):  # EmptyTask can be called like a class/factory
+        try:
+            return EmptyTask(task_id="empty", dataset_name="omnifuse", data=None)
+        except TypeError:  # these keyword arguments were rejected; retry with no arguments
+            return EmptyTask()
+    return copy.deepcopy(EmptyTask)  # not callable: presumably a ready-made instance, so hand back a copy
+
+
+def make_curator_pipeline(name: str, stages: list[Any], description: str | None = None) -> CuratorPipeline:
+    return CuratorPipeline(name=name, description=description, stages=stages)  # forwards all three as keywords
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/__init__.py
new file mode 100644
index 0000000000..f2f6beb44c
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration models and loading helpers."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/loader.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/loader.py
new file mode 100644
index 0000000000..0cae7af71a
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/loader.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration loading helpers."""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from omnifuse_tutorial.config.models import ExperimentConfig
+
+
+def load_config(path: str | Path) -> ExperimentConfig:  # load .env files, parse the config, resolve its paths
+    config_path = Path(path)
+    _load_dotenv(Path.cwd() / ".env")  # loaded first, so it wins: _load_dotenv never overwrites an existing key
+    _load_dotenv(config_path.parent / ".env")
+    data = _load_mapping(config_path)
+    config = ExperimentConfig.from_dict(data)
+    return _resolve_relative_paths(config, config_path.parent)  # relative paths anchor at the config's directory
+
+
+def _load_mapping(path: Path) -> dict[str, Any]:
+    suffix = path.suffix.lower()
+    text = path.read_text(encoding="utf-8")
+    if suffix == ".json":
+        return json.loads(text)
+    if suffix in {".yaml", ".yml"}:
+        loaded = yaml.safe_load(text)
+        if not isinstance(loaded, dict):
+            raise ValueError(f"Config must be a mapping: {path}")
+        return loaded
+    raise ValueError(f"Unsupported config suffix: {path.suffix}")
+
+
+def _resolve_relative_paths(config: ExperimentConfig, base_dir: Path) -> ExperimentConfig:  # mutates and returns config
+    if not config.output_dir.is_absolute():
+        config.output_dir = (base_dir / config.output_dir).resolve()
+    if config.runtime.cache_dir and not config.runtime.cache_dir.is_absolute():  # falsy values are left untouched
+        config.runtime.cache_dir = (base_dir / config.runtime.cache_dir).resolve()
+    if config.sns.sns_output_dir and not config.sns.sns_output_dir.is_absolute():
+        config.sns.sns_output_dir = (base_dir / config.sns.sns_output_dir).resolve()
+    if config.sns.cg_detr_checkpoint and not config.sns.cg_detr_checkpoint.is_absolute():
+        config.sns.cg_detr_checkpoint = (base_dir / config.sns.cg_detr_checkpoint).resolve()
+    if config.projection.save_weights_path and not config.projection.save_weights_path.is_absolute():
+        config.projection.save_weights_path = (base_dir / config.projection.save_weights_path).resolve()
+    for pool in config.data_pools:  # every pool's root_dir is anchored at base_dir as well
+        if not pool.root_dir.is_absolute():
+            pool.root_dir = (base_dir / pool.root_dir).resolve()
+    return config
+
+
+def _load_dotenv(path: Path) -> None:  # copy KEY=VALUE lines from `path` into os.environ
+    if not path.exists():  # a missing file is silently ignored
+        return
+    for raw_line in path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#") or "=" not in line:  # skip blanks, comments, and lines without "="
+            continue
+        key, value = line.split("=", 1)  # split on the first "=" only, so values may contain "="
+        key = key.strip()
+        value = value.strip().strip('"').strip("'")  # drop surrounding whitespace and quote characters
+        if key and key not in os.environ:  # variables already set in the environment take precedence
+            os.environ[key] = value
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/models.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/models.py
new file mode 100644
index 0000000000..011666d701
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/config/models.py
@@ -0,0 +1,320 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration dataclasses for the standalone Omni-Fuse Curator pipeline."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Literal
+
+# Closed sets of string values used as annotations by the config dataclasses in this module.
+# They are type hints only: nothing here enforces them at runtime.
+Modality = Literal["text", "image", "video", "audio", "point-cloud"]
+SNSDirection = Literal["forward", "backward", "bidirectional"]
+ExpertName = Literal["text-based", "fusion", "e2e"]
+
+
+@dataclass
+class DataPoolConfig:
+    """Settings for a single data pool: its name, modality, location, and optional limits."""
+
+    name: str
+    modality: Modality
+    root_dir: Path
+    mapping_file: str = "pair_mapping.jsonl"
+    n_samples: int | None = None
+    shuffle: bool = False
+    max_file_size_mb: int | None = None
+    max_video_frames: int | None = None
+    max_audio_duration_seconds: int | None = None
+
+    @classmethod
+    def from_dict(cls, value: dict[str, Any]) -> DataPoolConfig:
+        """Build a pool config from a mapping, accepting alternate key names.
+
+        ``title``, ``data_modality`` and ``data_root_dir`` are used as
+        fallbacks when ``name``, ``modality`` and ``root_dir`` are absent.
+
+        Raises:
+            ValueError: If the name, modality, or root directory is missing.
+        """
+        name = value.get("name", value.get("title"))
+        if name is None:
+            raise ValueError(f"Data pool missing name/title: {value}")
+        modality = value.get("modality", value.get("data_modality"))
+        if modality is None:
+            raise ValueError(f"Data pool missing modality/data_modality: {value}")
+        root_dir = value.get("root_dir", value.get("data_root_dir"))
+        if root_dir is None:
+            raise ValueError(f"Data pool missing root_dir/data_root_dir: {value}")
+
+        settings: dict[str, Any] = {
+            "name": str(name),
+            "modality": modality,
+            "root_dir": Path(root_dir),
+            "mapping_file": str(value.get("mapping_file", "pair_mapping.jsonl")),
+            "shuffle": bool(value.get("shuffle", False)),
+        }
+        # These optional limits are copied through verbatim (None when absent).
+        for optional_key in ("n_samples", "max_file_size_mb", "max_video_frames", "max_audio_duration_seconds"):
+            settings[optional_key] = value.get(optional_key)
+        return cls(**settings)
+
+
+@dataclass
+class SNSConfig:
+    """Settings for the SNS stage: backend, direction, thresholds, limits, and model locations."""
+
+    enabled: bool = True
+    backend: Literal["auto", "hybrid", "local", "api"] = "auto"
+    direction: SNSDirection = "bidirectional"
+    mi_ratio: float = 0.95
+    mi_eps: float = 0.05
+    tau_forward_text: float = 0.30
+    tau_forward_image: float = 0.30
+    tau_forward_video: float = 0.20
+    tau_forward_audio: float = 0.25
+    tau_backward: float = 0.35
+    grid_size: int = 5
+    max_patches: int = 4
+    max_video_segments: int = 5
+    max_audio_segments: int = 5
+    min_segment_duration: float = 2.0
+    bbox_padding_px: int = 0
+    reinject: bool = False
+    sns_output_dir: Path | None = None
+    grounding_dino_model_id: str = "IDEA-Research/grounding-dino-tiny"
+    cg_detr_checkpoint: Path = Path("model_files/best.ckpt")
+    amdetr_repo_id: str = "lighthouse-emnlp2024/AM-DETR"
+    require_forward_models: bool = True
+    use_ann_components: bool = True
+    nvidia_model: str = "nvidia/omni-embed-nemotron-3b"
+
+    @classmethod
+    def from_dict(cls, value: dict[str, Any] | None) -> SNSConfig:
+        """Build the config from a mapping; ``None`` or an empty mapping yields the defaults.
+
+        The two path-valued keys are converted to ``Path`` when set; every
+        other key is forwarded to the constructor unchanged, so an unknown
+        key raises ``TypeError``.
+        """
+        options = dict(value or {})
+        # Pull the path-like entries out so they can be wrapped in Path afterwards.
+        output_dir = options.pop("sns_output_dir", None)
+        checkpoint = options.pop("cg_detr_checkpoint", None)
+        cfg = cls(**options)
+        if output_dir:
+            cfg.sns_output_dir = Path(output_dir)
+        if checkpoint:
+            cfg.cg_detr_checkpoint = Path(checkpoint)
+        return cfg
+
+
+@dataclass
+class EEEConfig:
+    """Settings for the Expert Embedding Engine: experts, backend, prompts, and model names."""
+
+    experts: list[ExpertName] = field(default_factory=lambda: ["text-based", "fusion", "e2e"])
+    backend: Literal["hybrid", "local", "api"] = "hybrid"
+    embedding_dim: int = 2048
+    batch_size: int = 32
+    text_prompt_base: str = "Describe this in detail."
+    text_prompt_prefix: str = "Focus specifically on the aspects highlighted in this annotation."
+    nvidia_api_key: str | None = None
+    nvidia_api_base_url: str = "https://integrate.api.nvidia.com/v1"
+    nvidia_text_describer_model: str = "nvidia/nemotron-nano-12b-v2-vl"
+    nvidia_image_describer_model: str = "nvidia/nemotron-nano-12b-v2-vl"
+    nvidia_video_describer_model: str = "nvidia/nemotron-nano-12b-v2-vl"
+    nvidia_audio_describer_model: str = "google/gemma-3n-e4b-it"
+    nvidia_embedding_model: str = "nvidia/llama-nemotron-embed-1b-v2"
+    nvidia_multimodal_model: str = "nvidia/omni-embed-nemotron-3b"
+
+    @classmethod
+    def from_dict(cls, value: dict[str, Any] | None) -> EEEConfig:
+        """Build the config from a mapping; ``None`` or an empty mapping yields the defaults.
+
+        ``text_expert_backend`` is accepted as an alias for ``backend``. An
+        explicit ``backend`` takes precedence when both are supplied. Any
+        other unknown key raises ``TypeError`` from the constructor.
+
+        Raises:
+            ValueError: If ``experts`` is empty or ``backend`` is not one of
+                ``hybrid``, ``local``, ``api``.
+        """
+        value = dict(value or {})
+        if "text_expert_backend" in value:
+            # Always remove the alias: it is not a constructor keyword, so leaving it
+            # in place alongside ``backend`` would make ``cls(**value)`` fail.
+            value.setdefault("backend", value.pop("text_expert_backend"))
+        cfg = cls(**value)
+        if not cfg.experts:
+            raise ValueError("eee.experts cannot be empty")
+        if cfg.backend not in {"hybrid", "local", "api"}:
+            raise ValueError(f"Unsupported eee.backend: {cfg.backend}")
+        return cfg
+
+
+@dataclass
+class ProjectionConfig:
+    """Settings for the projection stage: training hyperparameters, network size, and outputs."""
+
+    enabled: bool = True
+    backend: Literal["auto", "linear", "torch"] = "auto"
+    num_epochs: int = 100
+    batch_size: int = 128
+    learning_rate: float = 1e-3
+    contrastive_loss_weight: float = 0.99
+    scale_loss_weight: float = 0.0001
+    bias_loss_weight: float = 0.01
+    contrastive_temperature: float = 0.07
+    num_layers: int = 3
+    hidden_layer_size: int = 512
+    dropout: float = 0.1
+    output_embeddings: bool = True
+    save_weights_path: Path | None = None
+    eval_recall_k: int = 10
+    verbose: bool = False
+
+    @classmethod
+    def from_dict(cls, value: dict[str, Any] | None) -> ProjectionConfig:
+        """Build the config from a mapping; ``None`` or an empty mapping yields the defaults.
+
+        ``save_weights_path`` is converted to ``Path`` when set; all other
+        keys go straight to the constructor, so unknown keys raise ``TypeError``.
+        """
+        options = dict(value or {})
+        weights_target = options.pop("save_weights_path", None)
+        cfg = cls(**options)
+        if weights_target:
+            cfg.save_weights_path = Path(weights_target)
+        return cfg
+
+
+@dataclass
+class DatablendConfig:
+    """Settings for the datablend export: the ranking query and how many results to keep."""
+
+    query: str
+    top_k: int | None = None
+    blend_fraction: float | None = None
+    include_metadata: bool = True
+
+    @classmethod
+    def from_dict(cls, value: dict[str, Any] | None) -> DatablendConfig:
+        """Build the config from a mapping.
+
+        Raises:
+            ValueError: If the mapping is missing/empty or has no non-empty ``query``.
+        """
+        query = value.get("query") if value else None
+        if not query:
+            raise ValueError("datablend.query is required")
+        metadata_flag = bool(value.get("include_metadata", True))
+        return cls(
+            query=str(query),
+            top_k=value.get("top_k"),
+            blend_fraction=value.get("blend_fraction"),
+            include_metadata=metadata_flag,
+        )
+
+
+@dataclass
+class RuntimeConfig:
+    """Runtime settings: device, dtype, offline flag, and an optional cache directory."""
+
+    device: str = "auto"
+    dtype: str = "float32"
+    offline_mode: bool = False
+    cache_dir: Path | None = None
+
+    @classmethod
+    def from_dict(cls, value: dict[str, Any] | None) -> RuntimeConfig:
+        """Build the config from a mapping; ``None`` or an empty mapping yields the defaults.
+
+        Only the four known keys are read; anything else in the mapping is ignored.
+        """
+        options = value or {}
+        cfg = cls(
+            device=options.get("device", "auto"),
+            dtype=options.get("dtype", "float32"),
+            offline_mode=bool(options.get("offline_mode", False)),
+        )
+        raw_cache_dir = options.get("cache_dir")
+        if raw_cache_dir:
+            cfg.cache_dir = Path(raw_cache_dir)
+        return cfg
+
+
+@dataclass
+class ExperimentConfig:
+    """Top-level experiment settings that aggregate every stage's configuration."""
+
+    experiment_id: str
+    output_dir: Path
+    data_pools: list[DataPoolConfig]
+    sns: SNSConfig
+    eee: EEEConfig
+    projection: ProjectionConfig
+    datablend: DatablendConfig
+    runtime: RuntimeConfig = field(default_factory=RuntimeConfig)
+    description: str = ""
+    embedsim_config_name: str = ""
+    reranking_enabled: bool = True
+    random_shuffle: bool = False
+    strict_data_validation: bool = False
+    downstream_eval: dict[str, Any] = field(default_factory=dict)
+    log_wandb: bool = False
+    log_local: bool = True
+
+    @classmethod
+    def from_dict(cls, value: dict[str, Any]) -> ExperimentConfig:
+        """Build the full experiment config from a plain mapping.
+
+        Alternate keys are honoured as fallbacks: ``data_pools_config`` for
+        ``data_pools``, ``awn`` for ``projection`` and ``experiment_dir`` for
+        ``output_dir``. When no ``datablend`` section is given, one is derived
+        from ``downstream_eval`` if possible, otherwise a generic default
+        query is used. A truthy ``random_shuffle`` switches ``shuffle`` on
+        for every pool.
+
+        Raises:
+            ValueError: If there are no data pools or ``experiment_id`` is
+                blank. Errors raised by the per-section ``from_dict``
+                constructors propagate unchanged.
+        """
+        raw = dict(value)
+        pool_entries = raw.get("data_pools", raw.get("data_pools_config", []))
+        pools = [DataPoolConfig.from_dict(entry) for entry in pool_entries]
+        if not pools:
+            raise ValueError("data_pools cannot be empty")
+
+        shuffle_all = bool(raw.get("random_shuffle", False))
+        if shuffle_all:
+            for pool in pools:
+                pool.shuffle = True
+
+        experiment_id = str(raw.get("experiment_id", "")).strip()
+        if not experiment_id:
+            raise ValueError("experiment_id is required")
+
+        eee_section = _eee_value_from_experiment(raw)
+        projection_section = raw.get("projection", raw.get("awn"))
+        # Preference order: explicit section, then one derived from downstream_eval, then a default.
+        datablend_section = raw.get("datablend") or _datablend_from_downstream(raw.get("downstream_eval"))
+        if not datablend_section:
+            datablend_section = {"query": "Describe the media content in detail"}
+        output_dir = Path(raw.get("output_dir", raw.get("experiment_dir", "outputs")))
+
+        return cls(
+            experiment_id=experiment_id,
+            description=str(raw.get("description", "")),
+            output_dir=output_dir,
+            data_pools=pools,
+            sns=SNSConfig.from_dict(raw.get("sns")),
+            eee=EEEConfig.from_dict(eee_section),
+            projection=ProjectionConfig.from_dict(projection_section),
+            datablend=DatablendConfig.from_dict(datablend_section),
+            runtime=RuntimeConfig.from_dict(raw.get("runtime")),
+            embedsim_config_name=str(raw.get("embedsim_config_name", "")),
+            reranking_enabled=bool(raw.get("reranking_enabled", True)),
+            random_shuffle=shuffle_all,
+            strict_data_validation=bool(raw.get("strict_data_validation", False)),
+            downstream_eval=dict(raw.get("downstream_eval") or {}),
+            log_wandb=bool(raw.get("log_wandb", False)),
+            log_local=bool(raw.get("log_local", True)),
+        )
+
+    @property
+    def run_dir(self) -> Path:
+        """Directory for this run: ``output_dir`` joined with ``experiment_id``."""
+        return self.output_dir / self.experiment_id
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain-dict copy of the config, passed through ``_stringify_paths``."""
+        return _stringify_paths(asdict(self))
+
+
+def _stringify_paths(value: Any) -> Any:
+    """Recursively copy lists and dicts, turning every ``Path`` into a string.
+
+    Dict values are routed through ``_redact_or_stringify`` so the key can be
+    inspected; any other value is returned unchanged.
+    """
+    if isinstance(value, dict):
+        return {key: _redact_or_stringify(key, item) for key, item in value.items()}
+    if isinstance(value, list):
+        return [_stringify_paths(element) for element in value]
+    return str(value) if isinstance(value, Path) else value
+
+
+def _redact_or_stringify(key: str, value: Any) -> Any:
+    """Mask a non-empty value whose key looks sensitive; otherwise stringify paths in it.
+
+    A key counts as sensitive when its lowercase form contains ``api_key``,
+    ``token``, ``secret`` or ``password``. Empty/falsy values are never masked.
+    """
+    key_lower = key.lower()
+    looks_sensitive = False
+    for marker in ("api_key", "token", "secret", "password"):
+        if marker in key_lower:
+            looks_sensitive = True
+            break
+    if looks_sensitive and value:
+        return "***REDACTED***"
+    return _stringify_paths(value)
+
+
+def _eee_value_from_experiment(value: dict[str, Any]) -> dict[str, Any] | None:
+    """Assemble the ``eee`` section from an experiment mapping.
+
+    Starts from a copy of ``value["eee"]`` (if any) and fills in settings that
+    were given at the top level of the experiment mapping instead. Keys already
+    present inside ``eee`` always win. ``experts`` entries and
+    ``text_expert_backend`` may be enum-like objects: their ``.value`` is used
+    when present, otherwise ``str()``.
+
+    Returns:
+        The merged mapping, or ``None`` when nothing was specified at all.
+    """
+    eee = dict(value.get("eee") or {})
+    if "experts" in value and "experts" not in eee:
+        eee["experts"] = [item.value if hasattr(item, "value") else str(item) for item in value["experts"]]
+    if "text_expert_backend" in value and "backend" not in eee:
+        backend = value["text_expert_backend"]
+        eee["backend"] = backend.value if hasattr(backend, "value") else str(backend)
+    # Top-level keys promoted into the eee section. ``nvidia_multimodal_model`` is
+    # listed with its sibling model fields so a top-level value is not dropped.
+    for key in (
+        "text_prompt_base",
+        "text_prompt_prefix",
+        "nvidia_api_key",
+        "nvidia_api_base_url",
+        "nvidia_text_describer_model",
+        "nvidia_image_describer_model",
+        "nvidia_video_describer_model",
+        "nvidia_audio_describer_model",
+        "nvidia_embedding_model",
+        "nvidia_multimodal_model",
+    ):
+        if key in value and key not in eee:
+            eee[key] = value[key]
+    return eee or None
+
+
+def _datablend_from_downstream(value: Any) -> dict[str, Any] | None:
+    """Derive a datablend mapping from a ``downstream_eval`` section.
+
+    Returns ``None`` unless *value* is a dict with a non-empty ``query``.
+    ``train_count`` is carried over as ``top_k`` and ``blend_fraction`` under
+    its own name, each only when it is not ``None``.
+    """
+    query = value.get("query") if isinstance(value, dict) else None
+    if not query:
+        return None
+    derived: dict[str, Any] = {"query": query}
+    for source_key, target_key in (("train_count", "top_k"), ("blend_fraction", "blend_fraction")):
+        if value.get(source_key) is not None:
+            derived[target_key] = value[source_key]
+    return derived
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/__init__.py
new file mode 100644
index 0000000000..17b16d0dc1
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Data loading and artifact helpers."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/io.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/io.py
new file mode 100644
index 0000000000..4867fe8030
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/io.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Small stdlib I/O helpers used by pipeline stages."""
+
+from __future__ import annotations
+
+import json
+import math
+import os
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+
+def ensure_dir(path: str | Path) -> Path:
+    """Create *path* as a directory (with parents) if needed and return it as a ``Path``."""
+    directory = Path(path)
+    directory.mkdir(parents=True, exist_ok=True)
+    return directory
+
+
+def write_json(path: str | Path, payload: object) -> Path:
+    """Serialize *payload* as indented, key-sorted JSON with a trailing newline.
+
+    The parent directory is created when missing. Returns the written path.
+    """
+    target = Path(path)
+    ensure_dir(target.parent)
+    # Serialize before touching the file so a failure cannot leave a truncated one behind.
+    serialized = json.dumps(payload, indent=2, sort_keys=True) + "\n"
+    target.write_text(serialized, encoding="utf-8")
+    return target
+
+
+def write_jsonl(path: str | Path, rows: Iterable[dict]) -> Path:
+    """Write *rows* as JSON Lines: one key-sorted JSON object per line.
+
+    The parent directory is created when missing. Returns the written path.
+    """
+    target = Path(path)
+    ensure_dir(target.parent)
+    with target.open("w", encoding="utf-8") as handle:
+        handle.writelines(json.dumps(row, sort_keys=True) + "\n" for row in rows)
+    return target
+
+
+def read_jsonl(path: str | Path) -> list[dict]:
+    """Parse a JSON Lines file into a list, skipping blank or whitespace-only lines."""
+    with Path(path).open("r", encoding="utf-8") as handle:
+        return [json.loads(stripped) for stripped in map(str.strip, handle) if stripped]
+
+
+def write_npy(path: str | Path, rows: list[list[float]]) -> Path:
+    """Write *rows* to *path* as a 2D float64 NumPy ``.npy`` file.
+
+    An empty *rows* is stored as an array of shape ``(0, 0)``. The parent
+    directory is created when missing. Returns the written path.
+
+    Raises:
+        ValueError: If the rows do not all have the same length.
+    """
+
+    import numpy as np
+
+    target = Path(path)
+    ensure_dir(target.parent)
+    if rows:
+        width = len(rows[0])
+        if any(len(row) != width for row in rows):
+            raise ValueError("All rows must have the same length")
+        matrix = np.asarray(rows, dtype=np.float64)
+    else:
+        matrix = np.empty((0, 0), dtype=np.float64)
+    with target.open("wb") as handle:
+        np.save(handle, matrix, allow_pickle=False)
+    return target
+
+
+def cosine_similarity(left: list[float], right: list[float]) -> float:
+    """Return the cosine of the angle between two equal-length vectors.
+
+    A zero-norm vector on either side gives ``0.0`` instead of dividing by zero.
+
+    Raises:
+        ValueError: If the vectors differ in length.
+    """
+    if len(left) != len(right):
+        raise ValueError("Vectors must have matching dimensions")
+    magnitude_left = math.sqrt(sum(x * x for x in left))
+    magnitude_right = math.sqrt(sum(y * y for y in right))
+    if magnitude_left == 0 or magnitude_right == 0:
+        return 0.0
+    inner_product = sum(x * y for x, y in zip(left, right))
+    return inner_product / (magnitude_left * magnitude_right)
+
+
+def stable_relpath(path: str | Path, start: str | Path | None = None) -> str:
+    """Return *path* relative to *start* (default: the current working directory).
+
+    If ``os.path.relpath`` raises ``ValueError``, the path is returned
+    unchanged as a string.
+    """
+    target = Path(path)
+    try:
+        anchor = start or Path.cwd()
+        relative = os.path.relpath(target, start=anchor)
+    except ValueError:
+        return str(target)
+    return relative
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/loader.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/loader.py
new file mode 100644
index 0000000000..f043296933
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/data/loader.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Paired data-pool loader."""
+
+from __future__ import annotations
+
+import csv
+import hashlib
+import json
+import random
+from pathlib import Path
+from typing import Any
+
+from omnifuse_tutorial.config.models import DataPoolConfig
+
+
+def load_pool_records(pool: DataPoolConfig) -> list[dict[str, Any]]:
+    """Load and normalize every row of a pool's mapping file.
+
+    A ``.csv`` mapping file (suffix compared case-insensitively) is read as
+    CSV; anything else is read as JSON Lines. Rows that normalize to ``None``
+    are dropped. When ``pool.shuffle`` is set the records are shuffled with a
+    fixed seed, and ``pool.n_samples`` (if set) truncates the result.
+
+    Raises:
+        FileNotFoundError: If the mapping file does not exist.
+    """
+    mapping_path = pool.root_dir / pool.mapping_file
+    if not mapping_path.exists():
+        raise FileNotFoundError(f"Mapping file not found: {mapping_path}")
+
+    reader = _read_csv if mapping_path.suffix.lower() == ".csv" else _read_jsonl
+    records = []
+    for index, row in enumerate(reader(mapping_path)):
+        normalized = _normalize_mapping_row(pool, row, index)
+        if normalized is not None:
+            records.append(normalized)
+
+    if pool.shuffle:
+        random.Random(0).shuffle(records)  # noqa: S311 - deterministic tutorial sampling, not security.
+    return records if pool.n_samples is None else records[: pool.n_samples]
+
+
+def load_all_pools(pools: list[DataPoolConfig]) -> list[dict[str, Any]]:
+    """Load the records of every pool and concatenate them in pool order."""
+    return [record for pool in pools for record in load_pool_records(pool)]
+
+
+def _read_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Parse a JSON Lines mapping file, skipping blank or whitespace-only lines."""
+    with path.open("r", encoding="utf-8") as handle:
+        return [json.loads(stripped) for stripped in map(str.strip, handle) if stripped]
+
+
+def _read_csv(path: Path) -> list[dict[str, Any]]:
+    """Read a CSV mapping file with a header row into a list of plain dicts."""
+    with path.open("r", encoding="utf-8", newline="") as handle:
+        reader = csv.DictReader(handle)
+        return list(map(dict, reader))
+
+
+def _normalize_mapping_row(pool: DataPoolConfig, row: dict[str, Any], index: int) -> dict[str, Any] | None:
+    """Turn one raw mapping row into a normalized record dict.
+
+    Several key spellings are accepted, tried in order:
+    ``data_path``/``raw_path``/``path`` for the raw file,
+    ``annotation_path``/``caption_path``/``label_path`` for an annotation file,
+    and ``annotation``/``caption``/``text`` for inline annotation text. Relative
+    paths are resolved against ``pool.root_dir``. Every other key in the row is
+    carried over under ``metadata``.
+
+    Inline annotation text that is missing *or empty* (for example a blank CSV
+    cell) falls back to the contents of the annotation file when one is given.
+
+    Args:
+        pool: Pool the row belongs to.
+        row: One parsed row of the pool's mapping file.
+        index: Position of the row in the mapping file, stored as ``pool_index``.
+
+    Returns:
+        The normalized record, or ``None`` when the raw file is larger than
+        ``pool.max_file_size_mb``.
+
+    Raises:
+        ValueError: If the row names no raw data path, or has neither inline
+            annotation text nor an annotation path.
+        FileNotFoundError: If the raw data file does not exist.
+    """
+    raw_rel = row.get("data_path") or row.get("raw_path") or row.get("path")
+    ann_rel = row.get("annotation_path") or row.get("caption_path") or row.get("label_path")
+    annotation_text = row.get("annotation") or row.get("caption") or row.get("text")
+    if not raw_rel:
+        raise ValueError(f"Mapping row missing data_path/raw_path/path in pool {pool.name}: {row}")
+    raw_path = _resolve(pool.root_dir, str(raw_rel))
+    if not raw_path.exists():
+        raise FileNotFoundError(f"Raw data file not found for pool {pool.name}: {raw_path}")
+
+    # raw_path is known to exist from here on, so no further existence checks are needed.
+    if pool.max_file_size_mb is not None:
+        max_bytes = pool.max_file_size_mb * 1024 * 1024
+        if raw_path.stat().st_size > max_bytes:
+            return None
+
+    annotation_path = _resolve(pool.root_dir, str(ann_rel)) if ann_rel else None
+    # The ``or`` chain above can end on an empty string (a blank CSV ``text`` cell),
+    # so treat any falsy inline text as absent and prefer the annotation file.
+    if not annotation_text and annotation_path:
+        annotation_text = annotation_path.read_text(encoding="utf-8").strip()
+    if annotation_text is None:
+        raise ValueError(f"Mapping row has no annotation text/path in pool {pool.name}: {row}")
+
+    raw_text = None
+    if pool.modality == "text":
+        raw_text = raw_path.read_text(encoding="utf-8").strip()
+
+    record_id = row.get("id") or _stable_id(pool.name, str(raw_rel), str(ann_rel or annotation_text))
+    # Keys consumed above are structural; everything else is free-form metadata.
+    consumed_keys = {
+        "id",
+        "data_path",
+        "raw_path",
+        "path",
+        "annotation_path",
+        "caption_path",
+        "label_path",
+        "annotation",
+        "caption",
+        "text",
+    }
+    metadata = {key: value for key, value in row.items() if key not in consumed_keys}
+    return {
+        "pair_id": str(record_id),
+        "pool": pool.name,
+        "pool_index": index,
+        "modality": pool.modality,
+        "raw_path": str(raw_path),
+        "annotation_path": str(annotation_path) if annotation_path else None,
+        "annotation": str(annotation_text).strip(),
+        "raw_text": raw_text,
+        "metadata": metadata,
+    }
+
+
+def _resolve(root: Path, value: str) -> Path:
+    """Return *value* as a path: unchanged if absolute, else resolved beneath *root*."""
+    candidate = Path(value)
+    return candidate if candidate.is_absolute() else (root / candidate).resolve()
+
+
+def _stable_id(*parts: str) -> str:
+    """Return a 16-hex-character SHA-256 prefix of *parts* joined by the ``\\x1f`` separator."""
+    joined = "\x1f".join(parts)
+    return hashlib.sha256(joined.encode("utf-8")).hexdigest()[:16]
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/datablend/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/datablend/__init__.py
new file mode 100644
index 0000000000..f03e9c88c6
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/datablend/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Datablend export package."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/datablend/ranker.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/datablend/ranker.py
new file mode 100644
index 0000000000..2f934d0976
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/datablend/ranker.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Query-based datablend ranking."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from omnifuse_tutorial.config.models import DatablendConfig
+from omnifuse_tutorial.data.io import cosine_similarity
+from omnifuse_tutorial.eee.backends import EEEBackend
+from omnifuse_tutorial.projection.trainer import ProjectionResult
+
+
+@dataclass
+class DatablendRanker:
+    """Rank records against the configured query and pick the top of the ranking."""
+
+    config: DatablendConfig
+    backend: EEEBackend
+
+    def rank(self, records: list[dict[str, Any]], projection: ProjectionResult) -> list[dict[str, Any]]:
+        """Score each record against the query embedding and return rows best-first.
+
+        Records are paired positionally with ``projection.projected_raw``; the
+        score is the cosine similarity between that vector and the query
+        embedding from the ``text-based`` expert. Ranks are 1-based.
+        """
+        query_vector = self.backend.embed_query(self.config.query, expert="text-based")
+        with_metadata = self.config.include_metadata
+        scored: list[dict[str, Any]] = []
+        for position, (record, vector) in enumerate(zip(records, projection.projected_raw)):
+            entry = {
+                "rank": 0,  # placeholder, filled in once the rows are sorted
+                "score": cosine_similarity(vector, query_vector),
+                "pair_id": record["pair_id"],
+                "pool": record["pool"],
+                "modality": record["modality"],
+                "raw_path": record["raw_path"],
+                "annotation": record.get("sns_annotation") or record.get("annotation"),
+                "original_annotation": record.get("annotation"),
+                "source_index": position,
+            }
+            if with_metadata:
+                entry["metadata"] = record.get("metadata", {})
+            scored.append(entry)
+
+        ordered = sorted(scored, key=lambda entry: entry["score"], reverse=True)
+        for place, entry in enumerate(ordered, start=1):
+            entry["rank"] = place
+        return ordered
+
+    def select_top(self, ranked: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Trim a ranked list: ``top_k`` takes precedence, then ``blend_fraction``, else keep all.
+
+        With ``blend_fraction`` the cut-off is never below one row.
+        """
+        top_k = self.config.top_k
+        if top_k is not None:
+            return ranked[:top_k]
+        fraction = self.config.blend_fraction
+        if fraction is None:
+            return ranked
+        keep = max(1, int(len(ranked) * fraction))
+        return ranked[:keep]
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/__init__.py
new file mode 100644
index 0000000000..1a60e555cb
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Expert embedding engine package."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/backends.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/backends.py
new file mode 100644
index 0000000000..aaf09d3a6b
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/backends.py
@@ -0,0 +1,715 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Embedding backends for the Expert Embedding Engine."""
+
+from __future__ import annotations
+
+import base64
+import hashlib
+import io
+import math
+import os
+import re
+import time
+import wave
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any, Literal, Protocol
+
+
+class EEEBackend(Protocol):  # structural interface shared by the backends in this module
+    def embed_raw(self, record: dict[str, Any], expert: str) -> list[float]: ...  # embed the raw sample
+
+    def embed_annotation(self, record: dict[str, Any], expert: str) -> list[float]: ...  # embed its annotation
+
+    def embed_query(self, query: str, expert: str = "text-based") -> list[float]: ...  # embed a text query
+
+
+BackendName = Literal["hybrid", "local", "api"]  # names handled by backend_factory
+BackendFactory = Callable[[Any, Any], EEEBackend]
+
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"}
+VIDEO_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v"}
+AUDIO_EXTENSIONS = {".wav", ".mp3"}
+TEXT_EXTENSIONS = {".txt", ".md", ".json", ".csv"}
+SUPPORTED_EXPERTS = {"text-based", "fusion", "e2e"}  # enforced by _validate_expert
+PHI4_MULTIMODAL_MODEL = "microsoft/phi-4-multimodal-instruct"
+GEMMA_3N_E4B_MODEL = "google/gemma-3n-e4b-it"
+AUDIO_URL_CHAT_MODELS = {PHI4_MULTIMODAL_MODEL, GEMMA_3N_E4B_MODEL}  # audio models called with "audio_url" content
+NVCF_ASSET_UPLOAD_THRESHOLD_BYTES = 180 * 1024  # media above this size goes through an NVCF asset upload
+AUDIO_INLINE_PREVIEW_BYTES = 160 * 1024  # size budget of the truncated inline audio preview
+NVCF_ASSET_BASE_URL = "https://api.nvcf.nvidia.com/v2/nvcf"  # default; NVIDIA_NVCF_ASSET_BASE_URL overrides it
+MIME_TYPES = {  # lower-cased file suffix -> MIME type
+    ".jpg": "image/jpeg",
+    ".jpeg": "image/jpeg",
+    ".png": "image/png",
+    ".webp": "image/webp",
+    ".bmp": "image/bmp",
+    ".gif": "image/gif",
+    ".wav": "audio/wav",
+    ".mp3": "audio/mpeg",
+    ".mp4": "video/mp4",
+    ".mov": "video/quicktime",
+    ".avi": "video/x-msvideo",
+    ".mkv": "video/x-matroska",
+    ".webm": "video/webm",
+    ".m4v": "video/x-m4v",
+}
+
+
+class LocalEEEBackend:
+    """Full local EEE backend: text expert, LanguageBind fusion, Omni-Embed e2e."""
+
+    def __init__(self, config: Any | None = None, runtime: Any | None = None, embedding_dim: int = 2048) -> None:
+        from omnifuse_tutorial.eee.local_models import FullLocalEEEBackend  # deferred until a backend is built
+
+        self._backend = FullLocalEEEBackend(config=config, runtime=runtime, embedding_dim=embedding_dim)
+
+    def embed_raw(self, record: dict[str, Any], expert: str) -> list[float]:
+        return self._backend.embed_raw(record, expert)  # plain delegation to the wrapped backend
+
+    def embed_annotation(self, record: dict[str, Any], expert: str) -> list[float]:
+        return self._backend.embed_annotation(record, expert)  # plain delegation to the wrapped backend
+
+    def embed_query(self, query: str, expert: str = "text-based") -> list[float]:
+        return self._backend.embed_query(query, expert)  # plain delegation to the wrapped backend
+
+    def unload(self) -> None:
+        self._backend.unload()  # forwards to the wrapped backend's unload()
+
+
+class NvidiaApiEEEBackend:
+    """NVIDIA API backend aligned with EmbedSim's API text expert path."""
+
+    def __init__(
+        self,
+        embedding_dim: int = 2048,
+        api_key: str | None = None,
+        api_base_url: str = "https://integrate.api.nvidia.com/v1",
+        text_model: str = "nvidia/nemotron-nano-12b-v2-vl",
+        image_model: str = "nvidia/nemotron-nano-12b-v2-vl",
+        video_model: str = "nvidia/nemotron-nano-12b-v2-vl",
+        audio_model: str = GEMMA_3N_E4B_MODEL,
+        embedding_model: str = "nvidia/llama-nemotron-embed-1b-v2",
+        timeout: int = 120,
+        batch_size: int = 4,
+    ):
+        """Store model names and resolve credentials; raise ``ValueError`` when no API key is available."""
+        self.embedding_dim = embedding_dim
+        self.api_key = api_key or os.environ.get("NV_BUILD_API_KEY") or os.environ.get("NVIDIA_API_KEY")
+        if not self.api_key:
+            raise ValueError("NVIDIA API key required. Set eee.nvidia_api_key or NV_BUILD_API_KEY.")
+        # NVIDIA_API_BASE_URL wins when set. A blank value (for example an emptied .env entry) falls
+        # back to ``api_base_url`` instead of yielding host-less request URLs such as "/embeddings".
+        self.api_base_url = (os.environ.get("NVIDIA_API_BASE_URL") or api_base_url).rstrip("/")
+        self.text_model = text_model
+        self.image_model = image_model
+        self.video_model = video_model
+        self.audio_model = audio_model
+        self.embedding_model = embedding_model
+        self.timeout = timeout
+        self.batch_size = max(1, min(int(batch_size), 16))  # clamp to 1..16
+
+    def embed_raw(self, record: dict[str, Any], expert: str) -> list[float]:
+        """Embed a record's raw content: an API description for "text-based", feature text otherwise."""
+        _validate_expert(expert)
+        # The sibling API toggle is primarily for the text expert. For the
+        # other experts, preserve distinct expert spaces by embedding a
+        # modality-aware textual representation through the API encoder.
+        text = self._describe_raw(record) if expert == "text-based" else _raw_feature_text(record)
+        return self._embed_text(text, expert)
+
+    def embed_annotation(self, record: dict[str, Any], expert: str) -> list[float]:
+        """Embed the record's ``sns_annotation`` (or ``annotation``) text; "" when neither is a string."""
+        _validate_expert(expert)
+        annotation = _text_or_empty(record.get("sns_annotation")) or _text_or_empty(record.get("annotation"))
+        return self._embed_text(annotation, expert)
+
+    def embed_query(self, query: str, expert: str = "text-based") -> list[float]:
+        """Embed a free-text query with the API embedding model."""
+        _validate_expert(expert)
+        return self._embed_text(query, expert)
+
+    def _describe_raw(self, record: dict[str, Any]) -> str:
+        """Return text standing in for the raw sample: inline text, a text file, or an API description."""
+        modality = str(record.get("modality") or "text")
+        inline_text = _text_or_empty(record.get("sns_raw_text")) or _text_or_empty(record.get("raw_text"))
+        if inline_text:
+            return inline_text
+        if modality == "text":
+            return _read_text_path(record.get("raw_path")) or _raw_feature_text(record)
+
+        media_path = _path_or_none(record.get("raw_path"))
+        if media_path is None or not media_path.exists():
+            return _raw_feature_text(record)  # nothing on disk to send to a describer model
+
+        describers = {  # modality -> (describer model, chat content type)
+            "image": (self.image_model, "image_url"),
+            "audio": (self.audio_model, "input_audio"),
+            "video": (self.video_model, "video_url"),
+        }
+        if modality not in describers:
+            return _raw_feature_text(record)
+        model, content_type = describers[modality]
+        return self._describe_file(media_path, model, content_type, _prompt_for_modality(modality))
+
+    def _describe_file(self, path: Path, model: str, content_type: str, prompt: str) -> str:
+        """Ask ``model`` to describe the media file at ``path`` using this backend's URL, headers and timeout."""
+        connection = {"api_base_url": self.api_base_url, "headers": self._headers(), "timeout": self.timeout}
+        return describe_file_with_nvidia_api(
+            path=path, model=model, content_type=content_type, prompt=prompt, **connection
+        )
+
+    def _embed_text(self, text: str, expert: str) -> list[float]:
+        """Embed ``text`` via ``/embeddings`` and return it resized to ``embedding_dim`` and L2-normalized."""
+        import requests
+
+        body = {
+            "model": self.embedding_model,
+            "input": [text],
+            "input_type": "query" if expert == "text-based" else "passage",
+            "encoding_format": "float",
+            "truncate": "END",
+        }
+        endpoint = f"{self.api_base_url}/embeddings"
+        reply = requests.post(endpoint, headers=self._headers(), json=body, timeout=self.timeout)
+        reply.raise_for_status()
+        values = reply.json()["data"][0]["embedding"]
+        return _resize_and_normalize([float(value) for value in values], self.embedding_dim)
+
+    def _headers(self) -> dict[str, str]:
+        """Return the JSON request headers carrying the bearer API key."""
+        return {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+
+
+class HybridEEEBackend:
+    """API-first EEE backend with local fusion and true multimodal experts.
+
+    The text-based expert uses NVIDIA API descriptions and text embeddings.
+    The fusion and e2e experts stay local because the current tutorial code
+    uses LanguageBind and Omni-Embed-Nemotron for those actual expert roles.
+    """
+
+    def __init__(self, config: Any, runtime: Any | None = None):
+        """Build both delegates from ``config``; attributes it lacks fall back to the defaults below."""
+        dim = int(getattr(config, "embedding_dim", 2048))
+        # Remote side: describer models plus the text embedding model.
+        self.api = NvidiaApiEEEBackend(
+            embedding_dim=dim,
+            api_key=getattr(config, "nvidia_api_key", None),
+            api_base_url=getattr(config, "nvidia_api_base_url", "https://integrate.api.nvidia.com/v1"),
+            text_model=getattr(config, "nvidia_text_describer_model", "nvidia/nemotron-nano-12b-v2-vl"),
+            image_model=getattr(config, "nvidia_image_describer_model", "nvidia/nemotron-nano-12b-v2-vl"),
+            video_model=getattr(config, "nvidia_video_describer_model", "nvidia/nemotron-nano-12b-v2-vl"),
+            audio_model=getattr(config, "nvidia_audio_describer_model", GEMMA_3N_E4B_MODEL),
+            embedding_model=getattr(config, "nvidia_embedding_model", "nvidia/llama-nemotron-embed-1b-v2"),
+            batch_size=int(getattr(config, "batch_size", 4)),
+        )
+        # Local side: serves every expert other than "text-based".
+        self.local = LocalEEEBackend(config=config, runtime=runtime, embedding_dim=dim)
+
+    def _backend_for(self, expert: str) -> Any:
+        """Validate ``expert``; return the API delegate for "text-based" and the local one otherwise."""
+        return self.api if _validate_expert(expert) == "text-based" else self.local
+
+    def embed_raw(self, record: dict[str, Any], expert: str) -> list[float]:
+        """Embed the raw sample with the delegate that owns ``expert``."""
+        return self._backend_for(expert).embed_raw(record, expert)
+
+    def embed_annotation(self, record: dict[str, Any], expert: str) -> list[float]:
+        """Embed the record's annotation with the delegate that owns ``expert``."""
+        return self._backend_for(expert).embed_annotation(record, expert)
+
+    def embed_query(self, query: str, expert: str = "text-based") -> list[float]:
+        """Embed a text query with the delegate that owns ``expert``."""
+        return self._backend_for(expert).embed_query(query, expert)
+
+    def describe_record(self, record: dict[str, Any]) -> str:
+        """Return the API backend's textual description of ``record``."""
+        return self.api._describe_raw(record)
+
+    def unload(self) -> None:
+        """Delegate to the local backend's ``unload``."""
+        self.local.unload()
+
+
+def describe_file_with_nvidia_api(
+    *, path: Path, model: str, content_type: str, prompt: str, api_base_url: str, headers: dict[str, str], timeout: int
+) -> str:
+    """Describe the media file at ``path`` with ``model`` and return the reply text.
+
+    ``input_audio`` content for a model in ``AUDIO_URL_CHAT_MODELS`` goes through the ``audio_url``
+    request builder; every other combination uses the generic chat-completions builder.
+    """
+    # Both builders take the same trailing arguments.
+    shared = (prompt, api_base_url, headers, timeout)
+    if content_type != "input_audio" or model not in AUDIO_URL_CHAT_MODELS:
+        return _describe_chat_completion_file(path, model, content_type, *shared)
+    return _describe_audio_url_chat_file(path, model, *shared)
+
+
+def _describe_chat_completion_file(
+    path: Path, model: str, content_type: str, prompt: str, api_base_url: str, headers: dict[str, str], timeout: int
+) -> str:
+    """Send ``prompt`` plus the media file at ``path`` to ``/chat/completions`` and return the reply text.
+
+    Audio is inlined as base64 ``input_audio``. Images and videos larger than
+    ``NVCF_ASSET_UPLOAD_THRESHOLD_BYTES`` are uploaded as an NVCF asset and referenced from a
+    markup tag appended to the prompt; smaller ones are inlined as a base64 data URL.
+    """
+    mime = MIME_TYPES.get(path.suffix.lower(), "application/octet-stream")
+    request_headers = headers
+    message: str | list[dict[str, Any]]
+    if content_type == "input_audio":
+        audio_b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
+        audio_part = {"type": "input_audio", "input_audio": {"data": audio_b64, "format": _audio_format(path)}}
+        message = [{"type": "text", "text": prompt}, audio_part]
+    elif content_type in {"image_url", "video_url"} and path.stat().st_size > NVCF_ASSET_UPLOAD_THRESHOLD_BYTES:
+        asset_id = _upload_nvcf_asset(path, mime, headers, timeout)
+        request_headers = _headers_with_nvcf_asset(headers, asset_id)
+        tag = "img" if content_type == "image_url" else "video"
+        message = f'{prompt}\n<{tag} src="data:{mime};asset_id,{asset_id}" />'
+    else:
+        media_b64 = base64.b64encode(path.read_bytes()).decode("utf-8")
+        media_part = {content_type: {"url": f"data:{mime};base64,{media_b64}"}, "type": content_type}
+        message = [{"type": "text", "text": prompt}, media_part]
+    url = f"{api_base_url}/chat/completions"
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": message}],
+        "max_tokens": 512,
+        "temperature": 0.2,
+        "stream": False,
+    }
+    first_reply = _post_nvidia_json_with_retries(url, request_headers, payload, timeout)
+    final_reply = _resolve_nvidia_response(first_reply, api_base_url, request_headers, timeout, model, url)
+    return _response_text(final_reply.json(), model, url)
+
+
+def _describe_audio_url_chat_file(
+    path: Path, model: str, prompt: str, api_base_url: str, headers: dict[str, str], timeout: int
+) -> str:
+    """Describe an audio file with a chat model that takes ``audio_url`` content.
+
+    Files above ``NVCF_ASSET_UPLOAD_THRESHOLD_BYTES`` are uploaded as an NVCF asset; smaller ones are
+    inlined as a base64 data URL. When the asset-based request is answered with a missing-asset error
+    or a 413/500/502/503/504 status, it is retried with a truncated inline preview of the audio.
+
+    Raises ``ValueError`` for unsupported audio suffixes.
+    """
+    audio_format = _audio_format(path)
+    mime = "audio/wav" if audio_format == "wav" else "audio/mpeg"
+    via_asset = path.stat().st_size > NVCF_ASSET_UPLOAD_THRESHOLD_BYTES
+    message: str | list[dict[str, Any]]
+    # Large files are referenced through an uploaded asset; small ones travel inside the request.
+    if via_asset:
+        asset_id = _upload_nvcf_asset(path, mime, headers, timeout)
+        active_headers = _headers_with_nvcf_asset(headers, asset_id)
+        message = f'{prompt}\n<audio src="data:{mime};asset_id,{asset_id}" />'
+    else:
+        active_headers = dict(headers)
+        message = _inline_audio_content(path, mime, prompt)
+    url = f"{api_base_url}/chat/completions"
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": message}],
+        "max_tokens": 512,
+        "temperature": 0.2,
+        "stream": False,
+    }
+    response = _post_nvidia_json_with_retries(url, active_headers, payload, timeout)
+    if via_asset and _should_retry_inline_audio_preview(response):
+        # The asset route failed: resend with an inline preview and the caller's plain headers.
+        preview = _inline_audio_content(path, mime, prompt, preview=True)
+        payload = {**payload, "messages": [{"role": "user", "content": preview}]}
+        response = _post_nvidia_json_with_retries(url, headers, payload, timeout)
+        active_headers = headers
+    response = _resolve_nvidia_response(response, api_base_url, active_headers, timeout, model, url)
+    return _response_text(response.json(), model, url)
+
+
+def _headers_with_nvcf_asset(headers: dict[str, str], asset_id: str) -> dict[str, str]:
+    """Return a copy of ``headers`` that references ``asset_id`` as an NVCF input asset."""
+    # Build a new dict so the caller's headers are left untouched.
+    return {**headers, "NVCF-INPUT-ASSET-REFERENCES": asset_id}
+
+
+def _upload_nvcf_asset(path: Path, mime: str, headers: dict[str, str], timeout: int) -> str:
+    """Register an NVCF asset, upload the file at ``path`` to it, and return the asset id.
+
+    The asset service root is ``NVCF_ASSET_BASE_URL`` unless ``NVIDIA_NVCF_ASSET_BASE_URL`` is set.
+    Raises ``RuntimeError`` when the create call fails, when its reply lacks an asset id or upload
+    URL, or when the upload returns a status of 400 or above.
+    """
+    import requests
+
+    description = f"omni-fuse tutorial media asset {path.name}"
+    base_url = os.environ.get("NVIDIA_NVCF_ASSET_BASE_URL", NVCF_ASSET_BASE_URL).rstrip("/")
+    create_url = f"{base_url}/assets"
+
+    # Step 1: ask the asset service for an asset id and an upload URL.
+    create_body = {"contentType": mime, "description": description}
+    created = requests.post(create_url, headers=headers, json=create_body, timeout=timeout)
+    _raise_for_nvidia_response(created, "nvcf-asset", create_url)
+    info = created.json()
+    asset_id = _first_string(info, ("assetId", "asset_id", "id"))
+    upload_url = _first_string(info, ("uploadUrl", "upload_url", "uploadURL", "url"))
+    if not (asset_id and upload_url):
+        raise RuntimeError(f"NVIDIA NVCF asset response missing assetId/uploadUrl: {_safe_payload(info)}")
+
+    # Step 2: stream the file to the returned URL. This request carries its own headers,
+    # not the API headers used for the create call.
+    upload_headers = {"Content-Type": mime, "x-amz-meta-nvcf-asset-description": description}
+    with path.open("rb") as handle:
+        uploaded = requests.put(upload_url, data=handle, headers=upload_headers, timeout=timeout)
+    if uploaded.status_code >= 400:
+        raise RuntimeError(
+            f"NVIDIA NVCF asset upload failed: status={uploaded.status_code} body={uploaded.text[:1000]}"
+        )
+    return asset_id
+
+
+def _inline_audio_content(path: Path, mime: str, prompt: str, preview: bool = False) -> list[dict[str, Any]]:
+    """Build chat content holding ``prompt`` and the audio inlined as a base64 ``audio_url`` data URL.
+
+    With ``preview=True`` only a truncated copy of the audio is attached and the prompt says so.
+    """
+    # The notice tells the model that it only receives the beginning of the recording.
+    notice = "Only the opening segment is attached because the full audio exceeds the current inline payload limit. "
+    text = notice + prompt if preview else prompt
+    audio_bytes = _audio_preview_bytes(path) if preview else path.read_bytes()
+    encoded = base64.b64encode(audio_bytes).decode("utf-8")
+    return [
+        {"type": "text", "text": text},
+        {"type": "audio_url", "audio_url": {"url": f"data:{mime};base64,{encoded}"}},
+    ]
+
+
+def _audio_preview_bytes(path: Path) -> bytes:
+    """Return a short leading slice of the audio: WAV files are re-written, anything else is cut by bytes."""
+    if path.suffix.lower() == ".wav":
+        try:
+            with wave.open(str(path), "rb") as reader:
+                params = reader.getparams()
+                frame_budget = (AUDIO_INLINE_PREVIEW_BYTES - 4096) // max(1, params.nchannels * params.sampwidth)
+                frames = reader.readframes(max(1, min(params.nframes, frame_budget)))
+            buffer = io.BytesIO()
+            with wave.open(buffer, "wb") as writer:
+                writer.setnchannels(params.nchannels)
+                writer.setsampwidth(params.sampwidth)
+                writer.setframerate(params.framerate)
+                writer.setcomptype(params.comptype, params.compname)
+                writer.writeframes(frames)
+            return buffer.getvalue()
+        except (EOFError, OSError, wave.Error):
+            pass  # unreadable WAV: fall through to the plain byte cut below
+    return path.read_bytes()[:AUDIO_INLINE_PREVIEW_BYTES]
+
+
+def _post_nvidia_json_with_retries(
+    url: str, headers: dict[str, str], payload: dict[str, Any], timeout: int, attempts: int = 3
+) -> Any:
+    """POST ``payload`` as JSON, pausing 2s, 4s, ... between attempts while the response is retryable.
+
+    Returns the last response received (``None`` only when ``attempts`` is not positive).
+    """
+    import requests
+
+    response = None
+    for attempt in range(attempts):
+        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
+        if attempt == attempts - 1 or not _is_retryable_nvidia_response(response):
+            break
+        time.sleep(2 * (attempt + 1))
+    return response
+
+
+def _is_retryable_nvidia_response(response: Any) -> bool:  # transient 5xx or a "DEGRADED function" reply
+    return response.status_code in {500, 502, 503, 504} or _is_degraded_response(response)
+
+
+def _is_degraded_response(response: Any) -> bool:  # 400/503 whose body mentions "DEGRADED function"
+    return response.status_code in (400, 503) and response.text.find("DEGRADED function") >= 0
+
+
+def _is_missing_nvcf_asset_response(response: Any) -> bool:
+    """Tell whether a 400 response body carries one of the NVCF asset error markers."""
+    markers = ("not found in nvcf_assets", "NVCF asset ID")
+    return response.status_code == 400 and any(marker in response.text for marker in markers)
+
+
+def _should_retry_inline_audio_preview(response: Any) -> bool:  # 413/5xx or a missing-asset 400
+    return response.status_code in {413, 500, 502, 503, 504} or _is_missing_nvcf_asset_response(response)
+
+
+def _resolve_nvidia_response(
+    response: Any, api_base_url: str, headers: dict[str, str], timeout: int, model: str, url: str
+) -> Any:
+    """Return the final response for a request, polling the status endpoint after a 202 reply.
+
+    Raises ``RuntimeError`` for error statuses or a 202 without a request id, and ``TimeoutError``
+    when a poll still answers 202 after ``timeout`` seconds (at least one second).
+    """
+    import requests
+
+    if response.status_code != 202:
+        _raise_for_nvidia_response(response, model, url)
+        return response
+
+    request_id = _request_id(response)
+    if not request_id:
+        raise RuntimeError(f"NVIDIA API returned 202 without requestId for model {model}: {response.text[:1000]}")
+
+    status_url = f"{api_base_url}/status/{request_id}"
+    deadline = time.monotonic() + max(timeout, 1)
+    # A 202 from the status endpoint means "still pending": poll again every two seconds.
+    polled = requests.get(status_url, headers=headers, timeout=timeout)
+    while polled.status_code == 202:
+        if time.monotonic() >= deadline:
+            raise TimeoutError(f"NVIDIA API polling timed out for model {model} request {request_id}")
+        time.sleep(2)
+        polled = requests.get(status_url, headers=headers, timeout=timeout)
+    _raise_for_nvidia_response(polled, model, status_url)
+    return polled
+
+
+def _raise_for_nvidia_response(response: Any, model: str, url: str) -> None:
+    """Raise ``RuntimeError`` with request context when ``response`` has a status of 400 or above."""
+    if response.status_code >= 400:
+        body = response.text[:1000]  # cap the echoed body
+        detail = f"model={model} url={url} status={response.status_code} body={body}"
+        raise RuntimeError(f"NVIDIA API request failed: {detail}")
+
+
+def _request_id(response: Any) -> str | None:
+    """Find the request id of a response: known headers first, then the JSON body."""
+    header_names = ("NVCF-REQID", "x-request-id", "X-Request-Id", "requestId")
+    found = next(filter(None, map(response.headers.get, header_names)), None)
+    if found:
+        return str(found)
+    try:
+        return _first_string(response.json(), ("requestId", "request_id", "id"))
+    except ValueError:
+        return None
+
+
+def _audio_format(path: Path) -> str:
+    """Return "wav" or "mp3" for ``path`` (".mpeg" maps to mp3); raise ``ValueError`` for other suffixes."""
+    aliases = {"wav": "wav", "mp3": "mp3", "mpeg": "mp3"}
+    suffix = path.suffix.lower().lstrip(".")
+    if suffix not in aliases:
+        raise ValueError(f"NVIDIA audio descriptions support wav/mp3 only, got {path.suffix} for {path}")
+    return aliases[suffix]
+
+
+def _response_text(payload: Any, model: str, url: str) -> str:
+    """Extract the reply text from a chat response payload, or raise ``RuntimeError`` if none is found.
+
+    Tried in order: ``choices[0].message.content``, top-level content/text/output/response, ``outputs[0]``.
+    """
+    candidates: list[Any] = []
+    if isinstance(payload, dict):
+        choices = payload.get("choices")
+        if isinstance(choices, list) and choices and isinstance(choices[0], dict):
+            message = choices[0].get("message")
+            if isinstance(message, dict):
+                candidates.append(message.get("content"))
+        candidates.extend(payload.get(key) for key in ("content", "text", "output", "response"))
+        outputs = payload.get("outputs")
+        if isinstance(outputs, list) and outputs:
+            candidates.append(outputs[0])
+    for candidate in candidates:
+        if text := _content_text(candidate):
+            return text
+    raise RuntimeError(
+        f"NVIDIA API response missing text content: model={model} url={url} body={_safe_payload(payload)}"
+    )
+
+
+def _content_text(content: Any) -> str:
+    """Flatten message content into a single stripped string.
+
+    Accepts a string, a dict with a "text"/"content" entry, or a list mixing both; yields "" otherwise.
+    """
+    if isinstance(content, str):
+        return content.strip()
+    if isinstance(content, dict):
+        inner = content.get("text") or content.get("content")
+        return inner.strip() if isinstance(inner, str) else ""
+    if not isinstance(content, list):
+        return ""
+    pieces: list[str] = []
+    for entry in content:
+        value = (entry.get("text") or entry.get("content")) if isinstance(entry, dict) else entry
+        if isinstance(value, str) and value.strip():
+            pieces.append(value.strip())
+    return " ".join(pieces)
+
+
+def _first_string(payload: Any, keys: tuple[str, ...]) -> str | None:
+    """Return the first non-empty string stored in ``payload`` under one of ``keys``, else ``None``.
+
+    Non-dict payloads always yield ``None``.
+    """
+    if not isinstance(payload, dict):
+        return None
+    return next((payload[key] for key in keys if isinstance(payload.get(key), str) and payload[key]), None)
+
+
+def _safe_payload(payload: Any) -> str:
+    """Render ``payload`` with ``str`` and keep at most its first 1000 characters."""
+    return str(payload)[:1000]
+
+
+def backend_factory(config_or_name: Any, runtime: Any | None = None) -> EEEBackend:
+    """Build the backend named by ``config_or_name`` (a name, or a config with a ``backend`` attribute).
+
+    Raises ``ValueError`` for unknown names and for "hybrid" requested without a config object.
+    """
+    name = _backend_name(config_or_name)
+    named_only = isinstance(config_or_name, str)
+    if name == "hybrid" and named_only:
+        raise ValueError("Hybrid EEE backend requires a full EEE config")
+    if name == "hybrid":
+        return HybridEEEBackend(config_or_name, runtime)
+    if name not in {"local", "api"}:
+        raise ValueError(f"Unsupported EEE backend: {name}")
+    # A bare name has none of the config attributes, so every getattr below yields its default.
+    dim = int(getattr(config_or_name, "embedding_dim", 2048))
+    if name == "local":
+        return LocalEEEBackend(config=None if named_only else config_or_name, runtime=runtime, embedding_dim=dim)
+    return NvidiaApiEEEBackend(
+        embedding_dim=dim,
+        api_key=getattr(config_or_name, "nvidia_api_key", None),
+        api_base_url=getattr(config_or_name, "nvidia_api_base_url", "https://integrate.api.nvidia.com/v1"),
+        text_model=getattr(config_or_name, "nvidia_text_describer_model", "nvidia/nemotron-nano-12b-v2-vl"),
+        image_model=getattr(config_or_name, "nvidia_image_describer_model", "nvidia/nemotron-nano-12b-v2-vl"),
+        video_model=getattr(config_or_name, "nvidia_video_describer_model", "nvidia/nemotron-nano-12b-v2-vl"),
+        audio_model=getattr(config_or_name, "nvidia_audio_describer_model", GEMMA_3N_E4B_MODEL),
+        embedding_model=getattr(config_or_name, "nvidia_embedding_model", "nvidia/llama-nemotron-embed-1b-v2"),
+        batch_size=int(getattr(config_or_name, "batch_size", 4)),
+    )
+    # Unknown backend names are rejected above, before any backend is constructed.
+
+
+def _backend_name(config_or_name: Any) -> str:
+    """Return the backend name: the string itself, else the config's ``backend`` attribute ("hybrid" if unset)."""
+    is_name = isinstance(config_or_name, str)
+    return config_or_name if is_name else str(getattr(config_or_name, "backend", "hybrid"))
+
+
+def _validate_expert(expert: str) -> str:  # returns ``expert`` unchanged, or raises ValueError
+    if expert in SUPPORTED_EXPERTS:
+        return expert
+    raise ValueError(f"Unsupported EEE expert: {expert}")
+
+
+def _describe_raw(record: dict[str, Any]) -> str:
+    """Describe a record without any API call: inline text, a text file, or the prompt plus features."""
+    inline_text = _text_or_empty(record.get("sns_raw_text")) or _text_or_empty(record.get("raw_text"))
+    modality = str(record.get("modality") or "text")
+    if inline_text or modality == "text":
+        # Inline text always wins; text records otherwise fall back to the file, then to the features.
+        return inline_text or _read_text_path(record.get("raw_path")) or _raw_feature_text(record)
+    return f"{_prompt_for_modality(modality)} {_raw_feature_text(record)}"
+
+
+def _raw_feature_text(record: dict[str, Any]) -> str:
+    """Compose a space-joined feature string describing a record.
+
+    Order: modality, pool, file name, file fingerprint and name tokens (when a path is given),
+    non-empty metadata sorted by key, then any inline raw text.
+    """
+    raw_path = _path_or_none(record.get("raw_path"))
+    metadata = record.get("metadata") if isinstance(record.get("metadata"), dict) else {}
+    shown_path = raw_path.name if raw_path else record.get("raw_path")
+    parts = [f"modality:{record.get('modality')}", f"pool:{record.get('pool')}", f"path:{shown_path}"]
+    if raw_path:
+        parts += [_file_fingerprint(raw_path), *_path_tokens(raw_path)]
+    for key in sorted(metadata):
+        if (value := metadata[key]) is not None and value != "":
+            parts.append(f"{key}:{value}")
+    inline_text = _text_or_empty(record.get("sns_raw_text")) or _text_or_empty(record.get("raw_text"))
+    if inline_text:
+        parts.append(inline_text)
+    return " ".join(parts)
+
+
+def _prompt_for_modality(modality: str) -> str:
+    """Return the describer prompt for ``modality``; unknown modalities get a generic prompt."""
+    prompts = {
+        "image": "Describe this image in detail.",
+        "audio": "Transcribe and describe this audio. What sounds do you hear?",
+        "video": "Describe what happens in this video in detail.",
+    }
+    return prompts.get(modality, "Describe this content in detail.")
+
+
+def _file_fingerprint(path: Path) -> str:
+    """Summarize a file as suffix, size, and a SHA-256 prefix of its first 64 KiB ("unreadable" on OSError)."""
+    try:
+        size = path.stat().st_size
+        with path.open("rb") as handle:
+            digest = hashlib.sha256(handle.read(65536)).hexdigest()
+        return f"file:{path.suffix.lower()} size:{size} sha256:{digest[:24]}"
+    except OSError:
+        return f"file:{path.suffix.lower()} unreadable"
+
+
+def _path_tokens(path: Path) -> list[str]:  # ASCII alphanumeric runs found in the file stem
+    return re.compile(r"[a-zA-Z0-9]+").findall(path.stem)
+
+
+def _read_text_path(value: Any) -> str:
+    """Return the stripped UTF-8 text of a supported text file, or "" when it cannot be read."""
+    path = _path_or_none(value)
+    if not path or path.suffix.lower() not in TEXT_EXTENSIONS:
+        return ""
+    try:
+        # Decode leniently in one read: undecodable bytes are dropped and no retry can leak an OSError.
+        return path.read_text(encoding="utf-8", errors="ignore").strip()
+    except OSError:
+        return ""
+
+
+def _path_or_none(value: Any) -> Path | None:
+    """Return ``value`` as a ``Path``, or ``None`` for non-path types and for an empty string."""
+    if not isinstance(value, (str, Path)) or value == "":
+        return None  # Path("") would collapse to Path("."), an existing directory
+    try:
+        return Path(value)
+    except OSError:
+        return None
+
+
+def _text_or_empty(value: Any) -> str:
+    """Return ``value`` stripped when it is a string, otherwise an empty string."""
+    # Non-string values (``None``, numbers, ...) are treated as absent text.
+    return value.strip() if isinstance(value, str) else ""
+
+
+def _resize_and_normalize(vector: list[float], dim: int) -> list[float]:
+    """Zero-pad or truncate ``vector`` to ``dim`` entries, then scale it to unit L2 norm.
+
+    An all-zero vector is returned without scaling, since its norm is zero.
+    """
+    # Padding first and slicing second covers both the too-short and the too-long case.
+    sized = (vector + [0.0] * (dim - len(vector)))[:dim]
+    norm = math.sqrt(sum(item * item for item in sized))
+    return [item / norm for item in sized] if norm else sized
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/languagebind_runtime.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/languagebind_runtime.py
new file mode 100644
index 0000000000..9c0e1db973
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/languagebind_runtime.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Runtime wrapper for the vendored LanguageBind source tree."""
+
+from __future__ import annotations
+
+import importlib
+import os
+import sys
+from pathlib import Path
+from typing import Any
+
+# Default LanguageBind checkout: ``third_party/LanguageBind`` two directory
+# levels above this module's directory. Used when LANGUAGEBIND_ROOT is unset
+# or empty.
+DEFAULT_LANGUAGEBIND_ROOT = Path(__file__).resolve().parents[2] / "third_party" / "LanguageBind"
+
+
+def _runtime_root() -> Path:
+    """Return the absolute LanguageBind source directory.
+
+    A non-empty ``LANGUAGEBIND_ROOT`` environment variable wins; otherwise
+    ``DEFAULT_LANGUAGEBIND_ROOT`` is used. ``~`` is expanded before resolving.
+    """
+    configured = os.environ.get("LANGUAGEBIND_ROOT")
+    location = Path(configured) if configured else Path(DEFAULT_LANGUAGEBIND_ROOT)
+    return location.expanduser().resolve()
+
+
+def _ensure_runtime_dirs() -> None:
+    """Create the tutorial-local scratch directories and advertise them via env vars.
+
+    ``tmp/``, ``tmp/torchinductor_cache/`` and ``tmp/pycache/`` are created
+    two levels above this module's directory. ``TMPDIR``,
+    ``TORCHINDUCTOR_CACHE_DIR`` and ``PYTHONPYCACHEPREFIX`` are only set when
+    the environment does not already define them.
+    """
+    scratch = Path(__file__).resolve().parents[2] / "tmp"
+    env_targets = {
+        "TMPDIR": scratch,
+        "TORCHINDUCTOR_CACHE_DIR": scratch / "torchinductor_cache",
+        "PYTHONPYCACHEPREFIX": scratch / "pycache",
+    }
+    # The parent directory is first in the mapping, so it exists before its children.
+    for directory in env_targets.values():
+        directory.mkdir(exist_ok=True)
+    for variable, directory in env_targets.items():
+        os.environ.setdefault(variable, str(directory))
+
+
+def bootstrap_languagebind() -> None:
+    """Make the LanguageBind checkout importable on modern torch/transformers.
+
+    Puts the directory returned by ``_runtime_root()`` at the front of
+    ``sys.path`` and installs compatibility shims on ``torchaudio``,
+    ``torchvision`` and the ``transformers`` CLIP module. Every shim is
+    guarded, so repeated calls do not re-apply it.
+
+    Raises:
+        RuntimeError: If the checkout directory does not exist, or if
+            ``torchaudio``/``transformers`` cannot be imported.
+    """
+
+    root = _runtime_root()
+    if not root.exists():
+        raise RuntimeError(
+            "LanguageBind source not found. Clone it at "
+            f"{DEFAULT_LANGUAGEBIND_ROOT} or set LANGUAGEBIND_ROOT. "
+            "For parity with Omni-Fuse, use the LanguageBind submodule from "
+            "../embedsimclusterer-experiments/third_party/LanguageBind."
+        )
+
+    _ensure_runtime_dirs()
+    # Prepend so the checkout's ``languagebind`` package is found first.
+    root_str = str(root)
+    if root_str not in sys.path:
+        sys.path.insert(0, root_str)
+
+    try:
+        import torchaudio
+        import transformers.models.clip.modeling_clip as clip_modeling
+        from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+    except ImportError as exc:
+        raise RuntimeError(
+            "LanguageBind runtime requires torchaudio and transformers. "
+            "Install the full local extras with `python -m pip install -e '.[full]'`."
+        ) from exc
+
+    # Shim 1: supply a no-op ``set_audio_backend`` when this torchaudio build
+    # lacks it (presumably LanguageBind still calls it -- confirm upstream).
+    if not hasattr(torchaudio, "set_audio_backend"):
+        torchaudio.set_audio_backend = lambda *args, **kwargs: None
+
+    # Shim 2: expose the private ``_functional_tensor`` module under the
+    # ``functional_tensor`` module name; skipped silently when unavailable.
+    if "torchvision.transforms.functional_tensor" not in sys.modules:
+        try:
+            import torchvision.transforms._functional_tensor as functional_tensor
+
+            sys.modules["torchvision.transforms.functional_tensor"] = functional_tensor
+        except ImportError:
+            pass
+
+    # Shim 3: provide a module-level ``_expand_mask`` that delegates to
+    # ``AttentionMaskConverter._expand_mask`` when the CLIP module has none.
+    if not hasattr(clip_modeling, "_expand_mask"):
+        clip_modeling._expand_mask = lambda mask, dtype, tgt_len=None: AttentionMaskConverter._expand_mask(
+            mask=mask,
+            dtype=dtype,
+            tgt_len=tgt_len,
+        )
+
+    # Shim 4: replace ``CLIPVisionEmbeddings.forward`` once; the class-level
+    # flag set at the end marks the patch as installed.
+    if not getattr(clip_modeling.CLIPVisionEmbeddings, "_omnifuse_languagebind_patch", False):
+
+        def patched_forward(self: Any, pixel_values: Any, interpolate_pos_encoding: bool = False) -> Any:
+            import torch
+
+            batch_size, _, height, width = pixel_values.shape
+            # ``image_size`` may be a single int or a (height, width) pair.
+            if isinstance(self.image_size, (tuple, list)):
+                expected_height, expected_width = self.image_size
+            else:
+                expected_height = expected_width = self.image_size
+            # Size mismatches are only tolerated when position encodings are interpolated.
+            if not interpolate_pos_encoding and (height != expected_height or width != expected_width):
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model ({expected_height}*{expected_width})."
+                )
+            target_dtype = self.patch_embedding.weight.dtype
+            patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+            patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+            # Prepend the class embedding to the patch embeddings.
+            class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+            embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+            if interpolate_pos_encoding:
+                embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+            else:
+                embeddings = embeddings + self.position_embedding(self.position_ids)
+            return embeddings
+
+        clip_modeling.CLIPVisionEmbeddings.forward = patched_forward
+        clip_modeling.CLIPVisionEmbeddings._omnifuse_languagebind_patch = True
+
+
+class LanguageBindRuntime:
+    """Thin adapter over upstream LanguageBind image, audio, and video models.
+
+    Models, processors and the tokenizer are loaded on first use and cached
+    per modality; :meth:`unload` drops them again.
+    """
+
+    # Hugging Face repository id used for each modality's weights.
+    MODEL_REPOS = {
+        "image": "LanguageBind/LanguageBind_Image",
+        "audio": "LanguageBind/LanguageBind_Audio_FT",
+        "video": "LanguageBind/LanguageBind_Video_FT",
+    }
+
+    def __init__(self, device: str, text_branch: str = "video", local_files_only: bool = False) -> None:
+        """Bootstrap LanguageBind and record its model/processor classes.
+
+        No weights are loaded here; only the ``languagebind`` modules are
+        imported and their classes stored for later use.
+
+        Args:
+            device: Device the models and input batches are moved to.
+            text_branch: Modality whose model is used by :meth:`encode_text`.
+            local_files_only: Forwarded to every ``from_pretrained`` call.
+        """
+        bootstrap_languagebind()
+
+        import numpy as np
+
+        self.device = device
+        self.text_branch = text_branch
+        self.local_files_only = local_files_only
+        self._models: dict[str, Any] = {}
+        self._processors: dict[str, Any] = {}
+        self._tokenizer: Any | None = None
+
+        tokenizer_mod = importlib.import_module("languagebind.image.tokenization_image")
+        self._tokenizer_cls = tokenizer_mod.LanguageBindImageTokenizer
+
+        image_model_mod = importlib.import_module("languagebind.image.modeling_image")
+        image_proc_mod = importlib.import_module("languagebind.image.processing_image")
+        audio_model_mod = importlib.import_module("languagebind.audio.modeling_audio")
+        audio_proc_mod = importlib.import_module("languagebind.audio.processing_audio")
+        video_model_mod = importlib.import_module("languagebind.video.modeling_video")
+        video_proc_mod = importlib.import_module("languagebind.video.processing_video")
+
+        self._model_classes = {
+            "image": image_model_mod.LanguageBindImage,
+            "audio": audio_model_mod.LanguageBindAudio,
+            "video": video_model_mod.LanguageBindVideo,
+        }
+        self._processor_classes = {
+            "image": image_proc_mod.LanguageBindImageProcessor,
+            "audio": audio_proc_mod.LanguageBindAudioProcessor,
+            "video": video_proc_mod.LanguageBindVideoProcessor,
+        }
+        # NOTE(review): this seeds NumPy's *global* RNG for the whole process,
+        # presumably so NumPy-based sampling in the processors is repeatable -- confirm.
+        np.random.seed(13)
+
+    def unload(self) -> None:
+        """Drop cached processors, tokenizer and models, then empty the CUDA cache if torch is importable."""
+        self._processors.clear()
+        self._tokenizer = None
+        self._models.clear()
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except ImportError:
+            return
+
+    def _ensure_tokenizer(self) -> Any:
+        """Return the shared tokenizer, loading it from the image repository on first use."""
+        if self._tokenizer is None:
+            self._tokenizer = self._tokenizer_cls.from_pretrained(
+                self.MODEL_REPOS["image"],
+                local_files_only=self.local_files_only,
+            )
+        return self._tokenizer
+
+    def _ensure_modality(self, modality: str) -> tuple[Any, Any]:
+        """Return ``(model, processor)`` for ``modality``, loading and caching them on first use.
+
+        Raises:
+            KeyError: If ``modality`` is not "image", "audio" or "video".
+        """
+        if modality in self._models:
+            return self._models[modality], self._processors[modality]
+
+        tokenizer = self._ensure_tokenizer()
+        model = self._model_classes[modality].from_pretrained(
+            self.MODEL_REPOS[modality],
+            local_files_only=self.local_files_only,
+        )
+        self._force_eager_attention(model)
+        model = model.to(self.device)
+        model.eval()
+        processor = self._processor_classes[modality](model.config, tokenizer)
+        if modality == "video":
+            # Remove the random flip so video preprocessing has no random step from it.
+            self._strip_random_video_flip(processor)
+        self._models[modality] = model
+        self._processors[modality] = processor
+        return model, processor
+
+    @staticmethod
+    def _force_eager_attention(model: Any) -> None:
+        """Set ``_attn_implementation`` to "eager" on the model's configs where it is unset.
+
+        Covers the model config plus the ``text_model`` and ``vision_model``
+        sub-configs when those attributes exist.
+        """
+        for cfg in (
+            getattr(model, "config", None),
+            getattr(getattr(model, "text_model", None), "config", None),
+            getattr(getattr(model, "vision_model", None), "config", None),
+        ):
+            if cfg is not None and getattr(cfg, "_attn_implementation", None) is None:
+                cfg._attn_implementation = "eager"
+
+    @staticmethod
+    def _strip_random_video_flip(processor: Any) -> None:
+        """Rebuild ``processor.transform`` without any ``RandomHorizontalFlipVideo`` step.
+
+        Does nothing when the processor has no transform list or no such step.
+        """
+        transform = getattr(processor, "transform", None)
+        transforms = getattr(transform, "transforms", None)
+        if not transforms:
+            return
+        kept = [step for step in transforms if step.__class__.__name__ != "RandomHorizontalFlipVideo"]
+        if len(kept) != len(transforms):
+            # Re-create the same container type around the remaining steps.
+            processor.transform = type(transform)(kept)
+
+    def _to_device(self, batch: dict[str, Any]) -> dict[str, Any]:
+        """Return a new dict with every value of ``batch`` moved to ``self.device``."""
+        return {key: value.to(self.device) for key, value in batch.items()}
+
+    def encode_text(self, text: str) -> Any:
+        """Embed ``text`` with the ``text_branch`` model and return the L2-normalised vector.
+
+        The text is padded/truncated to 77 tokens before encoding.
+        """
+        import torch
+        import torch.nn.functional as F
+
+        model, _ = self._ensure_modality(self.text_branch)
+        tokenizer = self._ensure_tokenizer()
+        batch = tokenizer([text], max_length=77, padding="max_length", truncation=True, return_tensors="pt")
+        batch = self._to_device(batch)
+        with torch.inference_mode():
+            features = model.get_text_features(**batch)
+        # Batch of one: return the single normalised row.
+        return F.normalize(features, dim=-1)[0]
+
+    def encode_image(self, path: str) -> Any:
+        """Embed the image at ``path``."""
+        return self._encode_media("image", path)
+
+    def encode_audio(self, path: str) -> Any:
+        """Embed the audio file at ``path``."""
+        return self._encode_media("audio", path)
+
+    def encode_video(self, path: str) -> Any:
+        """Embed the video file at ``path``."""
+        return self._encode_media("video", path)
+
+    def _encode_media(self, modality: str, path: str) -> Any:
+        """Run ``path`` through the modality's processor and model; return the L2-normalised vector.
+
+        Every modality goes through the processor's ``images=`` argument and
+        the model's ``get_image_features`` method.
+        """
+        import torch
+        import torch.nn.functional as F
+
+        model, processor = self._ensure_modality(modality)
+        batch = processor(images=[path], return_tensors="pt")
+        batch = self._to_device(batch)
+        with torch.inference_mode():
+            features = model.get_image_features(**batch)
+        return F.normalize(features, dim=-1)[0]
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/local_models.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/local_models.py
new file mode 100644
index 0000000000..88624bfb37
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/local_models.py
@@ -0,0 +1,756 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Full local Omni-Fuse embedding model stack."""
+
+from __future__ import annotations
+
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+LOGGER = logging.getLogger(__name__)
+# Lower-cased file suffixes. infer_modality() classifies a path by looking its
+# suffix up in the image, audio and video sets.
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif"}
+VIDEO_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".m4v"}
+AUDIO_EXTENSIONS = {".wav", ".mp3"}
+TEXT_EXTENSIONS = {".txt", ".md", ".json", ".csv"}
+# Glob patterns passed as ``allow_patterns`` to ``snapshot_download`` when the
+# Omni-Embed processor files are fetched.
+OMNI_PROCESSOR_ALLOW_PATTERNS = [
+    "*.json",
+    "*.txt",
+    "*.model",
+    "*.py",
+    "*.jinja",
+    "additional_chat_templates/*.jinja",
+]
+
+
+def full_stack_dependency_error(component: str, exc: BaseException | None = None) -> RuntimeError:
+    """Build (but do not raise) the standard "local stack unavailable" error.
+
+    Args:
+        component: Name of the missing piece, interpolated into the message.
+        exc: Optional underlying exception, attached as ``__cause__``.
+
+    Returns:
+        A ``RuntimeError`` carrying installation guidance.
+    """
+    failure = RuntimeError(
+        f"Full local Omni-Fuse {component} is unavailable. Install the local model extras with "
+        "`python -m pip install -e '.[full]'`, make sure Hugging Face model weights are cached "
+        "or network access is available, and place required local checkpoints under `model_files/`."
+    )
+    if exc is None:
+        return failure
+    failure.__cause__ = exc
+    return failure
+
+
+def resolve_device(runtime: Any | None = None) -> str:
+    """Pick the torch device string to run local models on.
+
+    Args:
+        runtime: Optional object with a ``device`` attribute. A value other
+            than ``None``, ``""`` or ``"auto"`` is returned as a string.
+
+    Returns:
+        The requested device, or ``"cuda"`` when torch reports CUDA as
+        available, otherwise ``"cpu"`` (also when torch is not installed).
+    """
+    requested = getattr(runtime, "device", None) if runtime is not None else None
+    # A missing or ``None`` device means "auto"; stringifying ``None`` would
+    # otherwise produce the bogus device name "None".
+    requested = "" if requested is None else str(requested)
+    if requested and requested != "auto":
+        return requested
+    try:
+        import torch
+    except ImportError:
+        return "cpu"
+    return "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def resolve_offline_mode(runtime: Any | None = None) -> bool:
+    """Report whether model loading must avoid the network.
+
+    True when ``runtime.offline_mode`` is truthy, or when either
+    ``EMBEDSIM_OFFLINE_MODE`` or ``HF_HUB_OFFLINE`` equals ``"1"``.
+    """
+    runtime_flag = runtime is not None and bool(getattr(runtime, "offline_mode", False))
+    if runtime_flag:
+        return True
+    env_flags = (os.environ.get("EMBEDSIM_OFFLINE_MODE", "0"), os.environ.get("HF_HUB_OFFLINE"))
+    return any(flag == "1" for flag in env_flags)
+
+
+def infer_modality(value: Any, hint: str | None = None) -> str:
+    """Guess which modality ``value`` represents.
+
+    A truthy ``hint`` is returned as-is. Otherwise PIL images are "image";
+    dicts are classified by their keys (audio keys, then ``frames``, then a
+    recursive look at ``file_path``); short single-line strings/paths are
+    classified by file suffix. Everything else is "text".
+    """
+    if hint:
+        return hint
+    try:
+        from PIL import Image
+    except ImportError:
+        Image = None
+    if Image is not None and isinstance(value, Image.Image):
+        return "image"
+    if isinstance(value, dict):
+        if any(key in value for key in ("audio", "waveform", "sample_rate")):
+            return "audio"
+        if "frames" in value:
+            return "video"
+        if "file_path" in value:
+            return infer_modality(value["file_path"])
+    if not isinstance(value, (str, Path)):
+        return "text"
+    candidate = str(value)
+    # Long or multi-line strings are treated as text content, not as paths.
+    if len(candidate) >= 500 or "\n" in candidate:
+        return "text"
+    extension = Path(candidate).suffix.lower()
+    for label, known in (("image", IMAGE_EXTENSIONS), ("audio", AUDIO_EXTENSIONS), ("video", VIDEO_EXTENSIONS)):
+        if extension in known:
+            return label
+    return "text"
+
+
+def resolve_cached_hf_snapshot_path(model_name: str) -> str:
+    """Return the most recently modified cached snapshot directory for ``model_name``.
+
+    The hub cache is ``HUGGINGFACE_HUB_CACHE`` if set, else ``HF_HOME/hub``,
+    else ``~/.cache/huggingface/hub``.
+
+    Raises:
+        RuntimeError: If the model has no ``snapshots`` directory in the
+            cache, or that directory is empty.
+    """
+    hub_cache = os.environ.get("HUGGINGFACE_HUB_CACHE")
+    hf_home = os.environ.get("HF_HOME")
+    if hub_cache:
+        cache_dir = Path(hub_cache)
+    elif hf_home:
+        cache_dir = Path(hf_home) / "hub"
+    else:
+        cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
+    repo_folder = "models--" + model_name.replace("/", "--")
+    snapshots_dir = cache_dir / repo_folder / "snapshots"
+    if not snapshots_dir.exists():
+        raise RuntimeError(f"Model {model_name} is not cached at {snapshots_dir}.")
+    candidates = list(snapshots_dir.iterdir())
+    if not candidates:
+        raise RuntimeError(f"Model {model_name} has no cached snapshots under {snapshots_dir}.")
+    newest = max(candidates, key=lambda entry: entry.stat().st_mtime)
+    return str(newest)
+
+
+def load_omni_processor_with_workaround(model_name: str, offline_mode: bool) -> tuple[str, bool]:
+    """Locate a local directory holding the Omni-Embed processor files.
+
+    Offline, the cached snapshot directory is used. Otherwise the files
+    matching ``OMNI_PROCESSOR_ALLOW_PATTERNS`` are fetched with
+    ``snapshot_download``.
+
+    Returns:
+        ``(directory, True)``; the flag is ``True`` on both paths.
+
+    Raises:
+        RuntimeError: If ``huggingface_hub`` is not installed (online path),
+            or the model is not cached (offline path).
+    """
+    if not offline_mode:
+        try:
+            from huggingface_hub import snapshot_download
+        except ImportError as exc:
+            raise full_stack_dependency_error("Omni-Embed processor download", exc)
+        downloaded = snapshot_download(model_name, allow_patterns=OMNI_PROCESSOR_ALLOW_PATTERNS)
+        return downloaded, True
+    return resolve_cached_hf_snapshot_path(model_name), True
+
+
+# Leading text of the Qwen2.5-Omni warning that the logging filter drops.
+_QWEN_OMNI_SYSTEM_PROMPT_WARNING_PREFIX = "System prompt modified, audio output may not work as expected."
+# Flipped to True once the root-logger filter has been installed in this process.
+_QWEN_OMNI_FILTER_INSTALLED = False
+
+
+class _Qwen2_5OmniSystemPromptWarningFilter(logging.Filter):
+    """Logging filter that hides Qwen2.5-Omni's "system prompt modified" warning.
+
+    transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py emits the
+    warning on every ``apply_chat_template`` call whose system prompt differs
+    from the default Qwen one. Only embeddings are consumed here, never
+    Qwen2.5-Omni's audio-generation mode, so the message is harmless and its
+    per-sample repetition would flood worker logs.
+    """
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        """Return ``False`` only for records whose message starts with the known prefix."""
+        try:
+            rendered = record.getMessage()
+        except Exception:
+            # A record that cannot be formatted is passed through unchanged.
+            return True
+        if rendered.startswith(_QWEN_OMNI_SYSTEM_PROMPT_WARNING_PREFIX):
+            return False
+        return True
+
+
+def _silence_qwen_omni_system_prompt_warning() -> None:
+    """Attach the Qwen2.5-Omni system-prompt warning filter to the root logger, once.
+
+    transformers emits the warning through ``logging.warning(...)``, i.e. on
+    the root logger, which is why the filter is installed there. A
+    module-level flag makes repeated calls within one process no-ops.
+    """
+    global _QWEN_OMNI_FILTER_INSTALLED
+    if not _QWEN_OMNI_FILTER_INSTALLED:
+        root_logger = logging.getLogger()
+        root_logger.addFilter(_Qwen2_5OmniSystemPromptWarningFilter())
+        _QWEN_OMNI_FILTER_INSTALLED = True
+
+
+class DimAdapter:
+    """Deterministic projection to the configured embedding dimension.
+
+    The weight matrix is drawn from a seeded standard normal, cast to
+    float32 and divided by ``sqrt(max(1, in_dim))``; the same seed and shape
+    always give the same matrix.
+    """
+
+    def __init__(self, in_dim: int, out_dim: int, seed: int = 13):
+        generator = np.random.default_rng(seed)
+        gaussian = generator.standard_normal((in_dim, out_dim)).astype(np.float32)
+        self.weight = gaussian / np.sqrt(max(1, in_dim))
+        self.out_dim = out_dim
+
+    def __call__(self, values: np.ndarray) -> np.ndarray:
+        """Project ``values`` (last axis of size ``in_dim``) onto ``out_dim`` features."""
+        projected = values @ self.weight
+        return projected
+
+
+class NvidiaLlamaNemotronTextEncoder:
+    """Local nvidia/llama-nemotron-embed-1b-v2 text encoder.
+
+    Loads the tokenizer and model through ``transformers`` at construction
+    time and exposes mean-pooled, L2-normalised embeddings via :meth:`encode`.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "nvidia/llama-nemotron-embed-1b-v2",
+        device: str = "cpu",
+        offline_mode: bool = False,
+    ):
+        try:
+            import torch
+            from transformers import AutoModel, AutoTokenizer
+        except ImportError as exc:
+            raise full_stack_dependency_error("text expert encoder", exc)
+
+        self.model_name = model_name
+        self.device = device
+        self.offline_mode = offline_mode
+        self.torch = torch
+        # Offline runs load from the cached snapshot directory instead of the hub id.
+        source = resolve_cached_hf_snapshot_path(model_name) if offline_mode else model_name
+        load_kwargs = {"local_files_only": offline_mode, "trust_remote_code": True}
+        self.tokenizer = AutoTokenizer.from_pretrained(source, **load_kwargs)
+        loaded = AutoModel.from_pretrained(source, **load_kwargs)
+        self.model = loaded.to(device)
+        self.model.eval()
+        model_config = getattr(self.model, "config", None)
+        self.dim = int(getattr(model_config, "hidden_size", 2048))
+
+    def encode(self, texts: list[str], batch_size: int = 8, max_length: int = 2048) -> np.ndarray:
+        """Embed ``texts`` and return a float32 array with one row per text.
+
+        Each text is prefixed with ``"passage: "``, tokenised with truncation
+        to ``max_length``, mean-pooled over non-padding tokens and
+        L2-normalised. An empty input yields a ``(0, dim)`` array.
+        """
+        if not texts:
+            return np.zeros((0, self.dim), dtype=np.float32)
+        step = max(1, batch_size)
+        passages = [f"passage: {text}" for text in texts]
+        chunks: list[np.ndarray] = []
+        for offset in range(0, len(passages), step):
+            encoded = self.tokenizer(
+                passages[offset : offset + step],
+                padding=True,
+                truncation=True,
+                max_length=max_length,
+                return_tensors="pt",
+            ).to(self.device)
+            with self.torch.inference_mode():
+                token_states = self.model(**encoded).last_hidden_state
+                attention = encoded["attention_mask"]
+                # Zero padded positions so they do not contribute to the mean.
+                token_states = token_states.masked_fill(~attention[..., None].bool(), 0.0)
+                pooled = token_states.sum(dim=1) / attention.sum(dim=1)[..., None]
+                pooled = self.torch.nn.functional.normalize(pooled, dim=-1)
+            chunks.append(pooled.float().cpu().numpy())
+        return np.concatenate(chunks, axis=0).astype(np.float32)
+
+
+class BlipCaptioner:
+    """Image captioner backed by the ``Salesforce/blip-image-captioning-base`` pipeline."""
+
+    def __init__(self, device: str = "cpu", offline_mode: bool = False):
+        """Build the ``image-to-text`` pipeline.
+
+        Args:
+            device: Any value starting with ``"cuda"`` selects GPU index 0;
+                everything else runs on CPU.
+            offline_mode: Passed to the model as ``local_files_only``.
+
+        Raises:
+            RuntimeError: If ``transformers`` is not installed.
+        """
+        try:
+            from transformers import pipeline
+        except ImportError as exc:
+            raise full_stack_dependency_error("image captioner", exc)
+        dev = 0 if device.startswith("cuda") else -1
+        self.pipe = pipeline(
+            "image-to-text",
+            model="Salesforce/blip-image-captioning-base",
+            device=dev,
+            model_kwargs={"local_files_only": offline_mode},
+        )
+
+    def caption(self, image_obj: Any, prompt: str = "") -> str:
+        """Return a caption of at most 64 new tokens for ``image_obj``.
+
+        Args:
+            image_obj: A path (``str``/``Path``), a PIL image, or an
+                array accepted by ``PIL.Image.fromarray``.
+            prompt: Accepted for interface parity; not used by this captioner.
+        """
+        from PIL import Image
+
+        if isinstance(image_obj, (str, Path)):
+            # Close the file deterministically. ``convert`` returns a fully
+            # loaded copy that stays valid after the source image is closed.
+            with Image.open(image_obj) as opened:
+                image = opened.convert("RGB")
+        elif isinstance(image_obj, Image.Image):
+            image = image_obj.convert("RGB")
+        else:
+            image = Image.fromarray(image_obj).convert("RGB")
+        result = self.pipe(image, generate_kwargs={"max_new_tokens": 64})
+        return str(result[0].get("generated_text") or "").strip()
+
+
+class WhisperASR:
+    """Speech-to-text wrapper around the ``openai/whisper-tiny.en`` ASR pipeline."""
+
+    def __init__(self, device: str = "cpu", offline_mode: bool = False):
+        """Build the ``automatic-speech-recognition`` pipeline.
+
+        Args:
+            device: Any value starting with ``"cuda"`` selects GPU index 0;
+                everything else runs on CPU.
+            offline_mode: Passed to the model as ``local_files_only``.
+
+        Raises:
+            RuntimeError: If ``transformers`` is not installed.
+        """
+        try:
+            from transformers import pipeline
+        except ImportError as exc:
+            raise full_stack_dependency_error("audio ASR", exc)
+        dev = 0 if device.startswith("cuda") else -1
+        self.pipe = pipeline(
+            "automatic-speech-recognition",
+            model="openai/whisper-tiny.en",
+            device=dev,
+            model_kwargs={"local_files_only": offline_mode},
+        )
+
+    def transcribe(self, audio_obj: Any, prompt: str = "") -> str:
+        """Return the stripped transcript of ``audio_obj``.
+
+        Args:
+            audio_obj: A dict with ``audio`` and ``sample_rate`` keys, a dict
+                with ``file_path``, a path (``str``/``Path``), or any other
+                value, which is handed to the pipeline unchanged.
+            prompt: Accepted for interface parity; not used here.
+        """
+        if isinstance(audio_obj, dict):
+            if "audio" in audio_obj and "sample_rate" in audio_obj:
+                # Rename to the keys used for in-memory audio passed to the pipeline.
+                audio_obj = {"array": audio_obj["audio"], "sampling_rate": audio_obj["sample_rate"]}
+            elif "file_path" in audio_obj:
+                audio_obj = audio_obj["file_path"]
+        if isinstance(audio_obj, (str, Path)):
+            import librosa
+
+            # Decode files here as 16 kHz mono instead of letting the pipeline read them.
+            audio, _ = librosa.load(str(audio_obj), sr=16000, mono=True)
+            audio_obj = {"array": audio, "sampling_rate": 16000}
+        try:
+            import transformers.pipelines.automatic_speech_recognition as asr_pipeline
+
+            # Make the pipeline module report torchcodec as unavailable.
+            # NOTE(review): presumably avoids its torchcodec input path -- confirm.
+            # This is re-applied on every call and failures are ignored.
+            asr_pipeline.is_torchcodec_available = lambda: False
+        except Exception:
+            pass
+        result = self.pipe(audio_obj)
+        return str(result.get("text") or "").strip()
+
+
+class KeyframeVideoDescriber:
+    """Describe a video by captioning sampled keyframes with an image captioner."""
+
+    def __init__(self, image_captioner: BlipCaptioner, num_frames: int = 4):
+        self.image_captioner = image_captioner
+        # Always sample at least one frame.
+        self.num_frames = max(1, int(num_frames))
+
+    def describe(self, video_obj: Any, prompt: str = "") -> str:
+        """Caption ``num_frames`` keyframes and join the non-empty captions with `` [SEP] ``."""
+        non_empty: list[str] = []
+        for frame in sample_video_keyframes(video_obj, self.num_frames):
+            text = self.image_captioner.caption(frame, prompt=prompt)
+            if text:
+                non_empty.append(text)
+        return " [SEP] ".join(non_empty)
+
+
+class OmniEmbedNemotronRuntime:
+    """Local nvidia/omni-embed-nemotron-3b runtime for all modalities.
+
+    The processor and model are loaded lazily on the first embedding request
+    and released again by :meth:`unload`.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "nvidia/omni-embed-nemotron-3b",
+        device: str = "cpu",
+        offline_mode: bool = False,
+    ):
+        self.model_name = model_name
+        self.device = device
+        self.offline_mode = offline_mode
+        self._model: Any | None = None
+        self._processor: Any | None = None
+        self.dim = 2048
+
+    def _initialize(self) -> None:
+        """Load the processor and model once; later calls are no-ops.
+
+        Raises:
+            RuntimeError: If ``torch``/``transformers`` are not installed.
+        """
+        if self._model is not None and self._processor is not None:
+            return
+        try:
+            import torch
+            from transformers import AutoModel, AutoProcessor
+        except ImportError as exc:
+            raise full_stack_dependency_error("Omni-Embed e2e expert", exc)
+
+        _silence_qwen_omni_system_prompt_warning()
+
+        model_path = resolve_cached_hf_snapshot_path(self.model_name) if self.offline_mode else self.model_name
+        processor_path, processor_local_only = load_omni_processor_with_workaround(self.model_name, self.offline_mode)
+        self._processor = AutoProcessor.from_pretrained(
+            processor_path,
+            trust_remote_code=True,
+            local_files_only=processor_local_only,
+        )
+        dtype = torch.float32
+        # Match indexed devices such as "cuda:1" too, so they also get half
+        # precision instead of silently falling back to float32.
+        if str(self.device).startswith("cuda"):
+            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        self._model = AutoModel.from_pretrained(
+            model_path,
+            torch_dtype=dtype,
+            attn_implementation="eager",
+            trust_remote_code=True,
+            local_files_only=self.offline_mode,
+        )
+        self._model = self._model.to(self.device)
+        self._model.eval()
+
+    def encode_text(self, text: str) -> np.ndarray:
+        """Embed a text string."""
+        return self._get_embedding(text=text)
+
+    def encode_image(self, image: Any) -> np.ndarray:
+        """Embed an image input."""
+        return self._get_embedding(image=image)
+
+    def encode_audio(self, audio: Any) -> np.ndarray:
+        """Embed an audio input."""
+        return self._get_embedding(audio=audio)
+
+    def encode_video(self, video: Any) -> np.ndarray:
+        """Embed a video input."""
+        return self._get_embedding(video=video)
+
+    def _get_embedding(
+        self,
+        text: str | None = None,
+        image: Any | None = None,
+        audio: Any | None = None,
+        video: Any | None = None,
+    ) -> np.ndarray:
+        """Embed the given inputs as one chat message and return a float32 vector.
+
+        The embedding is the attention-masked mean of the model's last hidden
+        state, L2-normalised. ``video`` may be a path, a dict with
+        ``file_path``, or a pre-decoded ``numpy`` array.
+
+        Raises:
+            ValueError: If no modality input is provided.
+            RuntimeError: If required packages are missing.
+        """
+        self._initialize()
+        import torch
+
+        processor = self._processor
+        model = self._model
+        content: list[dict[str, Any]] = []
+        image_input = None
+        audio_input = None
+        video_inputs = None
+        preloaded_video = False
+
+        if text is not None:
+            content.append({"type": "text", "text": text})
+        if image is not None:
+            image_input = _load_image(image)
+            content.append({"type": "image", "image": image_input})
+        if audio is not None:
+            audio_input = _load_audio_waveform(audio)
+            content.append({"type": "audio", "audio": audio_input})
+        if video is not None:
+            video_value = video
+            if isinstance(video, dict) and "file_path" in video:
+                video_value = video["file_path"]
+            if isinstance(video_value, np.ndarray):
+                # Already-decoded frames: the chat entry only carries a
+                # stand-in name, the array itself goes to the processor.
+                video_inputs = video_value
+                preloaded_video = True
+                content.append({"type": "video", "video": "video.mp4"})
+            else:
+                content.append(
+                    {
+                        "type": "video",
+                        "video": str(video_value),
+                        "fps": 1.0,
+                        "max_pixels": 128 * 128 * 64,
+                    }
+                )
+        if not content:
+            raise ValueError("At least one modality input must be provided")
+
+        messages = [{"role": "user", "content": content}]
+        if video is not None and not preloaded_video:
+            # Videos given by path are decoded by qwen_vl_utils.
+            try:
+                from qwen_vl_utils import process_vision_info
+            except ImportError as exc:
+                raise full_stack_dependency_error("Omni-Embed video preprocessing", exc)
+            _, video_inputs, _ = process_vision_info(messages, return_video_kwargs=True)
+
+        text_input = processor.apply_chat_template(messages, add_generation_prompt=False, tokenize=False)
+        processor_kwargs = {
+            "text": [text_input],
+            "images": image_input,
+            "videos": video_inputs,
+            "audio": audio_input,
+            "return_tensors": "pt",
+            "padding": True,
+        }
+        # Text truncation is only applied when no image or video is present.
+        if image is None and video is None:
+            processor_kwargs["text_kwargs"] = {"truncation": True, "max_length": 2048}
+        batch_dict = processor(**processor_kwargs)
+        batch_dict = {key: value.to(self.device) for key, value in batch_dict.items() if value is not None}
+        with torch.inference_mode():
+            outputs = model(**batch_dict, output_hidden_states=True)
+            hidden = outputs.hidden_states[-1]
+            mask = batch_dict["attention_mask"]
+            # Zero padded positions so they do not contribute to the mean.
+            hidden = hidden.masked_fill(~mask[..., None].bool(), 0.0)
+            embedding = hidden.sum(dim=1) / mask.sum(dim=1)[..., None]
+            embedding = torch.nn.functional.normalize(embedding, dim=-1)
+        return embedding.float().cpu().numpy()[0].astype(np.float32)
+
+    def unload(self) -> None:
+        """Drop the model and processor, then empty the CUDA cache if torch is importable."""
+        self._model = None
+        self._processor = None
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except ImportError:
+            return
+
+
+class FullLocalEEEBackend:
+    """Production local EEE backend matching the Omni-Fuse expert stack."""
+
+    def __init__(self, config: Any | None = None, runtime: Any | None = None, embedding_dim: int = 2048):
+        """Record configuration; no model is loaded here.
+
+        Args:
+            config: Optional settings object. The attributes read from it are
+                ``embedding_dim``, ``batch_size``, ``text_prompt_base``,
+                ``text_prompt_prefix``, ``nvidia_embedding_model`` and
+                ``nvidia_multimodal_model``; missing ones use defaults.
+            runtime: Optional object passed to ``resolve_device`` and
+                ``resolve_offline_mode``.
+            embedding_dim: Fallback output size when ``config`` has no
+                ``embedding_dim`` attribute.
+        """
+        self.config = config
+        self.runtime = runtime
+        self.embedding_dim = int(getattr(config, "embedding_dim", embedding_dim))
+        self.batch_size = int(getattr(config, "batch_size", 8))
+        self.device = resolve_device(runtime)
+        self.offline_mode = resolve_offline_mode(runtime)
+        # ``or ""`` turns a ``None`` prompt setting into an empty string.
+        self.text_prompt_base = str(getattr(config, "text_prompt_base", "") or "")
+        self.text_prompt_prefix = str(getattr(config, "text_prompt_prefix", "") or "")
+        self.text_model = str(getattr(config, "nvidia_embedding_model", "nvidia/llama-nemotron-embed-1b-v2"))
+        self.omni_model = str(getattr(config, "nvidia_multimodal_model", "nvidia/omni-embed-nemotron-3b"))
+        # Model handles start empty and are filled in elsewhere in the class.
+        self._text_encoder: NvidiaLlamaNemotronTextEncoder | None = None
+        self._image_captioner: BlipCaptioner | None = None
+        self._audio_asr: WhisperASR | None = None
+        self._video_describer: KeyframeVideoDescriber | None = None
+        self._omni: OmniEmbedNemotronRuntime | None = None
+        self._languagebind: Any | None = None
+        self._languagebind_adapter: DimAdapter | None = None
+        # Generated descriptions, keyed by modality, raw value and prompt.
+        self._description_cache: dict[str, str] = {}
+
+    def embed_raw(self, record: dict[str, Any], expert: str) -> list[float]:
+        """Embed the raw content of ``record`` with the chosen expert.
+
+        ``text-based`` embeds the record's textual description; ``fusion``
+        and ``e2e`` embed the raw value directly. The modality comes from
+        ``record["modality"]`` or is inferred from ``record["raw_path"]``.
+
+        Raises:
+            ValueError: If ``expert`` is not one of the supported experts.
+        """
+        expert = _validate_expert(expert)
+        modality = str(record.get("modality") or infer_modality(record.get("raw_path")))
+        if expert == "text-based":
+            return self._encode_text_stack(self.describe_record(record))
+        if expert in ("fusion", "e2e"):
+            encoder = self._encode_fusion if expert == "fusion" else self._encode_omni
+            return encoder(_raw_value(record, modality), modality)
+        raise ValueError(f"Unsupported EEE expert: {expert}")
+
+    def embed_annotation(self, record: dict[str, Any], expert: str) -> list[float]:
+        """Embed the annotation text of ``record`` the same way a query is embedded."""
+        annotation = _annotation_text(record)
+        return self.embed_query(annotation, expert)
+
+    def embed_query(self, query: str, expert: str = "text-based") -> list[float]:
+        """Embed a text query with the chosen expert.
+
+        Raises:
+            ValueError: If ``expert`` is not one of the supported experts.
+        """
+        expert = _validate_expert(expert)
+        if expert == "fusion":
+            return self._encode_languagebind_text(query)
+        if expert == "e2e":
+            omni_vector = self._ensure_omni().encode_text(query)
+            return _resize_and_normalize(omni_vector, self.embedding_dim)
+        if expert != "text-based":
+            raise ValueError(f"Unsupported EEE expert: {expert}")
+        return self._encode_text_stack(query)
+
+    def describe_record(self, record: dict[str, Any]) -> str:
+        """Return a textual description of ``record``.
+
+        Existing text (``sns_raw_text``, then ``raw_text``) is returned
+        directly. Text records return their raw value as a string. Images,
+        audio and video are captioned, transcribed or described, and the
+        result is cached per modality, raw value and prompt.
+        """
+        modality = str(record.get("modality") or infer_modality(record.get("raw_path")))
+        existing_text = _text_or_none(record.get("sns_raw_text")) or _text_or_none(record.get("raw_text"))
+        if existing_text:
+            return existing_text
+        raw_value = _raw_value(record, modality)
+        if modality == "text":
+            return str(raw_value)
+        cache_key = f"{modality}:{raw_value}:{self.text_prompt_base}:{self.text_prompt_prefix}"
+        cache = self._description_cache
+        if cache_key in cache:
+            return cache[cache_key]
+        prompt = f"{self.text_prompt_base}{self.text_prompt_prefix}"
+        if modality == "image":
+            generated = self._ensure_image_captioner().caption(raw_value, prompt=prompt)
+        elif modality == "audio":
+            generated = self._ensure_audio_asr().transcribe(raw_value, prompt=prompt)
+        elif modality == "video":
+            generated = self._ensure_video_describer().describe(raw_value, prompt=prompt)
+        else:
+            # Unknown modalities fall back to the raw value's string form.
+            generated = str(raw_value)
+        cache[cache_key] = generated
+        return generated
+
+    def unload(self) -> None:
+        """Unload the LanguageBind and Omni runtimes and drop every model reference.
+
+        ``unload()`` is called on the LanguageBind runtime only if it has such an
+        attribute, and on the Omni runtime whenever one is loaded. The description
+        cache and the LanguageBind adapter are left untouched.
+        """
+        languagebind = self._languagebind
+        if languagebind is not None and hasattr(languagebind, "unload"):
+            languagebind.unload()
+        omni = self._omni
+        if omni is not None:
+            omni.unload()
+        for attribute in (
+            "_text_encoder",
+            "_image_captioner",
+            "_audio_asr",
+            "_video_describer",
+            "_omni",
+            "_languagebind",
+        ):
+            setattr(self, attribute, None)
+
+    def _encode_text_stack(self, text: str) -> list[float]:
+        """Encode ``text`` with the text encoder and fit it to ``embedding_dim``."""
+        encoder = self._ensure_text_encoder()
+        batch = encoder.encode([text], batch_size=self.batch_size)
+        # A single-item batch was submitted, so only the first vector is used.
+        return _resize_and_normalize(batch[0], self.embedding_dim)
+
+    def _encode_omni(self, value: Any, modality: str) -> list[float]:
+        """Encode ``value`` with the Omni runtime method matching ``modality``.
+
+        Raises:
+            ValueError: if ``modality`` is not text, image, audio or video. The
+                runtime has already been created by the time this is raised.
+        """
+        omni = self._ensure_omni()
+        if modality == "text":
+            # Only the text route coerces its input to ``str``.
+            vector = omni.encode_text(str(value))
+        else:
+            media_encoders = {"image": "encode_image", "audio": "encode_audio", "video": "encode_video"}
+            method_name = media_encoders.get(modality)
+            if method_name is None:
+                raise ValueError(f"Unsupported modality for Omni-Embed: {modality}")
+            vector = getattr(omni, method_name)(value)
+        return _resize_and_normalize(vector, self.embedding_dim)
+
+    def _encode_fusion(self, value: Any, modality: str) -> list[float]:
+        """Encode ``value`` with LanguageBind and adapt it to ``embedding_dim``.
+
+        Text goes through ``_encode_languagebind_text``; image, audio and video
+        values are converted to a path string first.
+
+        Raises:
+            ValueError: for an unsupported modality, or (from ``_path_string``)
+                when a media value is not a path.
+        """
+        if modality == "text":
+            return self._encode_languagebind_text(str(value))
+        runtime = self._ensure_languagebind()
+        media_encoders = {"image": "encode_image", "audio": "encode_audio", "video": "encode_video"}
+        method_name = media_encoders.get(modality)
+        if method_name is None:
+            raise ValueError(f"Unsupported modality for LanguageBind: {modality}")
+        encoded = getattr(runtime, method_name)(_path_string(value, modality))
+        as_array = encoded.detach().cpu().numpy().astype(np.float32)
+        return self._adapt_languagebind(as_array)
+
+    def _encode_languagebind_text(self, text: str) -> list[float]:
+        """Encode ``text`` with LanguageBind and adapt it to ``embedding_dim``."""
+        encoded = self._ensure_languagebind().encode_text(text)
+        # Detach and move to host memory before handing the data to NumPy.
+        as_array = encoded.detach().cpu().numpy().astype(np.float32)
+        return self._adapt_languagebind(as_array)
+
+    def _adapt_languagebind(self, vector: np.ndarray) -> list[float]:
+        """Flatten ``vector`` and bring it to ``embedding_dim`` entries.
+
+        When the flattened length already equals ``embedding_dim`` the vector is
+        only normalized. Otherwise it is passed through a ``DimAdapter`` (built
+        with ``seed=13``), which is cached and rebuilt only when the first axis
+        of its ``weight`` no longer matches the input length.
+        """
+        flat = np.asarray(vector, dtype=np.float32)
+        if flat.ndim != 1:
+            flat = flat.reshape(-1)
+        source_dim = flat.shape[0]
+        if source_dim == self.embedding_dim:
+            return _resize_and_normalize(flat, self.embedding_dim)
+        adapter = self._languagebind_adapter
+        if adapter is None or adapter.weight.shape[0] != source_dim:
+            adapter = DimAdapter(source_dim, self.embedding_dim, seed=13)
+            self._languagebind_adapter = adapter
+        # The adapter is called on a one-row batch; take that row back out.
+        adapted = adapter(flat[None, :])[0]
+        return _resize_and_normalize(adapted, self.embedding_dim)
+
+    def _ensure_text_encoder(self) -> NvidiaLlamaNemotronTextEncoder:
+        """Return the text encoder, constructing and caching it on first use."""
+        encoder = self._text_encoder
+        if encoder is not None:
+            return encoder
+        encoder = NvidiaLlamaNemotronTextEncoder(
+            model_name=self.text_model,
+            device=self.device,
+            offline_mode=self.offline_mode,
+        )
+        self._text_encoder = encoder
+        return encoder
+
+    def _ensure_image_captioner(self) -> BlipCaptioner:
+        """Return the image captioner, constructing and caching it on first use."""
+        captioner = self._image_captioner
+        if captioner is not None:
+            return captioner
+        captioner = BlipCaptioner(self.device, self.offline_mode)
+        self._image_captioner = captioner
+        return captioner
+
+    def _ensure_audio_asr(self) -> WhisperASR:
+        """Return the ASR model wrapper, constructing and caching it on first use."""
+        asr = self._audio_asr
+        if asr is not None:
+            return asr
+        asr = WhisperASR(self.device, self.offline_mode)
+        self._audio_asr = asr
+        return asr
+
+    def _ensure_video_describer(self) -> KeyframeVideoDescriber:
+        """Return the video describer, constructing and caching it on first use.
+
+        The describer is built around the shared image captioner and is
+        configured with ``num_frames=4``.
+        """
+        describer = self._video_describer
+        if describer is not None:
+            return describer
+        describer = KeyframeVideoDescriber(self._ensure_image_captioner(), num_frames=4)
+        self._video_describer = describer
+        return describer
+
+    def _ensure_omni(self) -> OmniEmbedNemotronRuntime:
+        """Return the Omni-Embed runtime, constructing and caching it on first use."""
+        runtime = self._omni
+        if runtime is not None:
+            return runtime
+        runtime = OmniEmbedNemotronRuntime(self.omni_model, self.device, self.offline_mode)
+        self._omni = runtime
+        return runtime
+
+    def _ensure_languagebind(self) -> Any:
+        """Return the LanguageBind runtime, constructing and caching it on first use."""
+        runtime = self._languagebind
+        if runtime is not None:
+            return runtime
+        # Function-scope import: the runtime module is only imported once a
+        # LanguageBind encoding is actually requested.
+        from omnifuse_tutorial.eee.languagebind_runtime import LanguageBindRuntime
+
+        runtime = LanguageBindRuntime(
+            device=self.device,
+            text_branch="video",
+            local_files_only=self.offline_mode,
+        )
+        self._languagebind = runtime
+        return runtime
+
+
+def sample_video_keyframes(video_path: Any, num_frames: int = 4) -> list[Any]:
+    """Sample up to ``num_frames`` evenly spaced frames as PIL images.
+
+    Args:
+        video_path: a path-like value, or a dict carrying either in-memory
+            ``"frames"`` or a ``"file_path"``. In-memory frames that lack a
+            ``mode`` attribute are wrapped with ``Image.fromarray``.
+        num_frames: upper bound on the number of frames returned.
+
+    Returns:
+        The sampled frames, in playback order. Frames decoded from a file are
+        converted from BGR to RGB.
+
+    Raises:
+        ValueError: if the video cannot be opened, reports no frames, or no
+            frame could be sampled.
+        Exception: whatever ``full_stack_dependency_error`` builds when OpenCV
+            or Pillow is missing.
+    """
+    try:
+        import cv2
+        from PIL import Image
+    except ImportError as exc:
+        raise full_stack_dependency_error("video keyframe sampling", exc) from exc
+
+    if isinstance(video_path, dict):
+        frames = video_path.get("frames")
+        # Use ``len`` rather than truthiness: a NumPy frame stack raises on bool().
+        if frames is not None and len(frames) > 0:
+            picks = np.linspace(0, len(frames) - 1, num=min(num_frames, len(frames)), dtype=int)
+            return [
+                frame if hasattr(frame, "mode") else Image.fromarray(frame) for frame in (frames[i] for i in picks)
+            ]
+        if "file_path" in video_path:
+            video_path = video_path["file_path"]
+
+    cap = cv2.VideoCapture(str(video_path))
+    # try/finally guarantees the capture is released on every exit path,
+    # including an exception raised while converting a frame.
+    try:
+        if not cap.isOpened():
+            raise ValueError(f"Failed to open video: {video_path}")
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        if frame_count <= 0:
+            raise ValueError(f"Video has no frames: {video_path}")
+        targets = set(np.linspace(0, frame_count - 1, num=min(num_frames, frame_count), dtype=int).tolist())
+        last_target = max(targets, default=-1)
+        sampled = []
+        index = 0
+        # Frames are decoded sequentially; stop once the last wanted index has
+        # been passed instead of decoding the remainder of the file.
+        while index <= last_target:
+            ok, frame = cap.read()
+            if not ok:
+                break
+            if index in targets:
+                sampled.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
+            index += 1
+    finally:
+        cap.release()
+    if not sampled:
+        raise ValueError(f"No keyframes sampled from video: {video_path}")
+    return sampled
+
+
+def _load_image(value: Any) -> Any:
+    """Return ``value`` as an RGB PIL image when it designates an image file.
+
+    A PIL image is returned unchanged. A dict with ``"file_path"`` is unwrapped
+    to that path. A ``str`` or ``Path`` is opened and converted to RGB. Any
+    other value is returned as is.
+    """
+    from PIL import Image
+
+    if isinstance(value, Image.Image):
+        return value
+    if isinstance(value, dict) and "file_path" in value:
+        value = value["file_path"]
+    if isinstance(value, (str, Path)):
+        # ``Image.open`` is lazy and holds the file open. ``convert`` returns a
+        # separate in-memory image, so the source can be closed right away.
+        with Image.open(value) as source:
+            return source.convert("RGB")
+    return value
+
+
+def _load_audio_waveform(value: Any) -> Any:
+    """Return a 16 kHz waveform for ``value`` where it can be derived.
+
+    A dict with ``"audio"`` yields that audio, resampled with librosa when its
+    ``"sample_rate"`` (default 16000) differs from 16000. A dict with
+    ``"file_path"``, or a ``str``/``Path``, is loaded with librosa as mono at
+    16 kHz. Any other value is returned unchanged.
+    """
+    source = value
+    if isinstance(source, dict):
+        if "audio" in source:
+            samples = source["audio"]
+            rate = int(source.get("sample_rate", 16000))
+            if rate == 16000:
+                return samples
+            import librosa
+
+            return librosa.resample(samples, orig_sr=rate, target_sr=16000)
+        if "file_path" in source:
+            source = source["file_path"]
+    if not isinstance(source, (str, Path)):
+        return source
+    import librosa
+
+    waveform, _ = librosa.load(str(source), sr=16000, mono=True)
+    return waveform
+
+
+def _raw_value(record: dict[str, Any], modality: str) -> Any:
+    """Return the raw payload of ``record`` for the given ``modality``.
+
+    For text, non-blank ``sns_raw_text`` or ``raw_text`` wins; failing that, the
+    stripped UTF-8 contents of ``raw_path`` when it is a regular file. In every
+    other case the result is ``raw_path`` as a string when it is a usable path,
+    else the first truthy of ``raw_path`` / ``raw_text``, else ``""``.
+    """
+    raw_path = record.get("raw_path")
+    # Skip falsy values: an empty string would otherwise turn into Path("."),
+    # which is truthy and names the current directory.
+    path = _path_or_none(raw_path) if raw_path else None
+    if modality == "text":
+        raw_text = _text_or_none(record.get("sns_raw_text")) or _text_or_none(record.get("raw_text"))
+        if raw_text:
+            return raw_text
+        # is_file() rather than exists(): reading a directory would raise.
+        if path is not None and path.is_file():
+            return path.read_text(encoding="utf-8").strip()
+    if path is not None:
+        return str(path)
+    return raw_path or record.get("raw_text") or ""
+
+
+def _annotation_text(record: dict[str, Any]) -> str:
+    """Return the first non-blank of ``sns_annotation`` / ``annotation``, else ``""``."""
+    for key in ("sns_annotation", "annotation"):
+        text = _text_or_none(record.get(key))
+        if text:
+            return text
+    return ""
+
+
+def _text_or_none(value: Any) -> str | None:
+    """Return ``value`` stripped when it is a non-blank string, otherwise ``None``."""
+    if not isinstance(value, str):
+        return None
+    stripped = value.strip()
+    return stripped if stripped else None
+
+
+def _path_or_none(value: Any) -> Path | None:
+    """Return ``value`` as a ``Path``, or ``None`` when it does not name a path.
+
+    Only ``str`` and ``Path`` inputs are accepted. An empty string yields
+    ``None``: ``Path("")`` is ``Path(".")``, and ``Path`` objects are always
+    truthy, so callers testing the result would treat "no path" as the current
+    directory.
+    """
+    if not isinstance(value, (str, Path)):
+        return None
+    if isinstance(value, str) and not value:
+        return None
+    try:
+        return Path(value)
+    except OSError:
+        return None
+
+
+def _path_string(value: Any, modality: str) -> str:
+    """Return ``value`` as a path string for LanguageBind media encoding.
+
+    A dict carrying ``"file_path"`` is unwrapped to that entry first.
+
+    Raises:
+        ValueError: if the (unwrapped) value is neither ``str`` nor ``Path``.
+    """
+    if isinstance(value, dict):
+        value = value.get("file_path", value)
+    if not isinstance(value, (str, Path)):
+        raise ValueError(f"LanguageBind {modality} encoding requires a file path, got {type(value)!r}")
+    return str(value)
+
+
+def _validate_expert(expert: str) -> str:
+    """Return ``expert`` unchanged when it is a supported EEE expert name.
+
+    Raises:
+        ValueError: if ``expert`` is not "text-based", "fusion" or "e2e".
+    """
+    supported = frozenset(("text-based", "fusion", "e2e"))
+    if expert in supported:
+        return expert
+    raise ValueError(f"Unsupported EEE expert: {expert}")
+
+
+def _resize_and_normalize(vector: Any, dim: int) -> list[float]:
+    """Flatten ``vector`` to float32, fit it to ``dim`` entries and L2-normalize.
+
+    A vector longer than ``dim`` is truncated; a shorter one is zero-padded at
+    the end. An all-zero vector is returned without normalization.
+    """
+    flat = np.asarray(vector, dtype=np.float32).reshape(-1)
+    if flat.size != dim:
+        keep = min(flat.size, dim)
+        fitted = np.zeros(dim, dtype=np.float32)
+        fitted[:keep] = flat[:keep]
+        flat = fitted
+    length = float(np.linalg.norm(flat))
+    if length > 0:
+        flat = flat / length
+    return flat.tolist()
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/results.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/results.py
new file mode 100644
index 0000000000..e659061a04
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/eee/results.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Embedding result containers."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+
+@dataclass
+class EmbeddingBundle:
+    """Per-expert embeddings for a list of records.
+
+    ``embeddings[expert]`` interleaves the two sides of each pair: even
+    positions hold raw embeddings and odd positions hold annotation embeddings.
+    """
+
+    pair_ids: list[str]
+    modalities: list[str]
+    records: list[dict[str, Any]]
+    experts: list[str]
+    embeddings: dict[str, list[list[float]]] = field(default_factory=dict)
+
+    def raw_embeddings(self, expert: str) -> list[list[float]]:
+        """Return the raw-side vectors (even positions) stored for ``expert``."""
+        return self.embeddings[expert][0::2]
+
+    def annotation_embeddings(self, expert: str) -> list[list[float]]:
+        """Return the annotation-side vectors (odd positions) stored for ``expert``."""
+        return self.embeddings[expert][1::2]
+
+    @property
+    def embedding_dim(self) -> int:
+        """Length of the first vector of the first expert, or 0 when unavailable."""
+        if not self.experts:
+            return 0
+        # ``embeddings`` defaults to an empty dict, so the first expert may not
+        # have an entry yet; report 0 rather than raising KeyError.
+        first = self.embeddings.get(self.experts[0])
+        return len(first[0]) if first else 0
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/pipeline.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/pipeline.py
new file mode 100644
index 0000000000..71741767c7
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/pipeline.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Pipeline construction and execution facade."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from omnifuse_tutorial.compat.curator import make_curator_pipeline, make_empty_task
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.data.io import write_json
+from omnifuse_tutorial.eee.backends import BackendFactory, backend_factory
+from omnifuse_tutorial.stages import (
+    DatablendRankingStage,
+    EEEEmbeddingStage,
+    PairManifestReaderStage,
+    ProjectionTrainingStage,
+    SNSStage,
+)
+
+
+@dataclass
+class OmniFusePipeline:
+    """Pairs an experiment config with the curator pipeline that executes it."""
+
+    config: ExperimentConfig
+    curator_pipeline: Any
+
+    def run(self) -> dict[str, Any]:
+        """Run the curator pipeline and summarize the final task's metadata.
+
+        The run directory is created and the resolved config is written to
+        ``config.resolved.json`` inside it before the pipeline starts.
+
+        Returns:
+            A dict with ``run_dir`` plus the artifact entries read from the last
+            task's ``_metadata``; entries that are absent map to ``None``.
+
+        Raises:
+            RuntimeError: if the pipeline returns no tasks.
+        """
+        self.config.run_dir.mkdir(parents=True, exist_ok=True)
+        write_json(self.config.run_dir / "config.resolved.json", self.config.to_dict())
+        tasks = self.curator_pipeline.run(initial_tasks=[make_empty_task()])
+        if not tasks:
+            raise RuntimeError("Pipeline produced no output tasks")
+        metadata = dict(getattr(tasks[-1], "_metadata", {}) or {})
+        summary: dict[str, Any] = {"run_dir": str(self.config.run_dir)}
+        for key in (
+            "sns_manifest_path",
+            "embedding_metadata_path",
+            "projection_model_path",
+            "projection_loss_path",
+            "projection_metrics_path",
+            "projected_embeddings_path",
+            "datablend_ranked_path",
+            "datablend_topk_path",
+            "datablend_size",
+        ):
+            summary[key] = metadata.get(key)
+        return summary
+
+
+def build_pipeline(
+    config: ExperimentConfig,
+    backend_factory_fn: BackendFactory = backend_factory,
+) -> OmniFusePipeline:
+    """Assemble the five-stage Omni-Fuse pipeline for ``config``.
+
+    Stages run in the order: pair manifest reader, SNS, EEE embedding,
+    projection training, datablend ranking. ``backend_factory_fn`` is handed to
+    the embedding and ranking stages.
+    """
+    reader_stage = PairManifestReaderStage(config=config)
+    sns_stage = SNSStage(config=config)
+    embedding_stage = EEEEmbeddingStage(config=config, backend_factory_fn=backend_factory_fn)
+    projection_stage = ProjectionTrainingStage(config=config)
+    ranking_stage = DatablendRankingStage(config=config, backend_factory_fn=backend_factory_fn)
+    return OmniFusePipeline(
+        config=config,
+        curator_pipeline=make_curator_pipeline(
+            name=f"omnifuse-{config.experiment_id}",
+            description="Omni-Fuse SNS, EEE, projection, and datablend pipeline",
+            stages=[reader_stage, sns_stage, embedding_stage, projection_stage, ranking_stage],
+        ),
+    )
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/projection/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/projection/__init__.py
new file mode 100644
index 0000000000..9aa493c3ba
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/projection/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Projection network package."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/projection/trainer.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/projection/trainer.py
new file mode 100644
index 0000000000..d3309e49a4
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/projection/trainer.py
@@ -0,0 +1,312 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Projection training and inference."""
+
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from typing import Any
+
+from omnifuse_tutorial.config.models import ProjectionConfig
+from omnifuse_tutorial.data.io import cosine_similarity
+from omnifuse_tutorial.eee.results import EmbeddingBundle
+
+
+@dataclass
+class ProjectionResult:
+    """Container for what ``ProjectionTrainer.train_and_project`` returns."""
+
+    # Projected raw-side vectors produced by the trainer.
+    projected_raw: list[list[float]]
+    # Annotation embeddings of the anchor expert that the projection was scored against.
+    annotation_embeddings: list[list[float]]
+    # Weight assigned to each expert, keyed by expert name.
+    expert_weights: dict[str, float]
+    # Loss values recorded by the trainer (one entry for the linear path,
+    # one per epoch for the torch path).
+    loss_history: list[float]
+    # Output of ``_recall_at_k``: keys "annotation_to_raw", "raw_to_annotation"
+    # and "average". The k actually used may be smaller than 10.
+    recall_at_10: dict[str, float]
+    # Metadata describing the projection model (its "type", experts, dims, ...).
+    model: dict[str, Any]
+
+
+class ProjectionTrainer:
+    """Fuse per-expert raw embeddings into one projected embedding per record.
+
+    ``train_and_project`` trains a torch MLP when ``config.backend`` is
+    ``"torch"``, or ``"auto"`` with torch importable; otherwise it falls back
+    to a softmax-weighted sum of the expert embeddings.
+    """
+
+    def __init__(self, config: ProjectionConfig):
+        self.config = config
+
+    def train_and_project(self, bundle: EmbeddingBundle) -> ProjectionResult:
+        """Project every record in ``bundle`` and report loss and recall.
+
+        Raises:
+            ValueError: if ``bundle`` has no records or no experts.
+        """
+        if not bundle.records:
+            raise ValueError("Cannot train projection on an empty bundle")
+        # Both strategies index ``bundle.experts[0]``; fail with a clear message
+        # instead of a bare IndexError further down.
+        if not bundle.experts:
+            raise ValueError("Cannot train projection without any experts")
+        backend = self.config.backend
+        if backend == "torch" or (backend == "auto" and _torch_available()):
+            return self._train_torch_projection(bundle)
+        return self._train_linear_projection(bundle)
+
+    @staticmethod
+    def _anchor_expert(bundle: EmbeddingBundle) -> str:
+        """Return the expert whose annotation embeddings serve as targets."""
+        return "text-based" if "text-based" in bundle.experts else bundle.experts[0]
+
+    def _train_linear_projection(self, bundle: EmbeddingBundle) -> ProjectionResult:
+        """Blend expert embeddings with softmax weights; no parameters are trained.
+
+        Each expert is scored by the mean cosine similarity between its raw and
+        annotation embeddings, and the softmax of those scores gives the weights.
+        """
+        expert_scores = {
+            expert: _mean_pair_similarity(bundle.raw_embeddings(expert), bundle.annotation_embeddings(expert))
+            for expert in bundle.experts
+        }
+        weights = _softmax(expert_scores)
+        projected = _weighted_sum(
+            [bundle.raw_embeddings(expert) for expert in bundle.experts], [weights[e] for e in bundle.experts]
+        )
+
+        anchor_expert = self._anchor_expert(bundle)
+        annotations = bundle.annotation_embeddings(anchor_expert)
+        loss = _contrastive_loss(projected, annotations, self.config.contrastive_temperature)
+        # Use the configured k, as the torch path does; fall back to 10 when the
+        # config carries no such field.
+        recall_k = min(getattr(self.config, "eval_recall_k", 10), len(projected))
+        recall = _recall_at_k(projected, annotations, k=recall_k)
+        model = {
+            "type": "linear_expert_projection",
+            "experts": bundle.experts,
+            "expert_weights": weights,
+            "embedding_dim": bundle.embedding_dim,
+            "anchor_expert": anchor_expert,
+        }
+        return ProjectionResult(
+            projected_raw=projected,
+            annotation_embeddings=annotations,
+            expert_weights=weights,
+            loss_history=[loss],
+            recall_at_10=recall,
+            model=model,
+        )
+
+    def _train_torch_projection(self, bundle: EmbeddingBundle) -> ProjectionResult:
+        """Train an MLP on the concatenated expert embeddings and project with it.
+
+        The loss is a weighted sum of the contrastive, cluster-bias and
+        scale-bias terms. The mean loss of each epoch is appended to the loss
+        history. Weights are saved when ``config.save_weights_path`` is set.
+        """
+        import torch
+        import torch.nn.functional as F
+
+        # Seeds the global torch RNG, which drives initialization and shuffling.
+        torch.manual_seed(0)
+        raw_inputs = torch.tensor(_concat_raw_expert_embeddings(bundle), dtype=torch.float32)
+        anchor_expert = self._anchor_expert(bundle)
+        anchors = torch.tensor(bundle.annotation_embeddings(anchor_expert), dtype=torch.float32)
+        modalities = [str(record.get("modality", "")) for record in bundle.records]
+
+        model = _ProjectionMLP(
+            input_dim=raw_inputs.shape[1],
+            output_dim=anchors.shape[1],
+            hidden_dim=self.config.hidden_layer_size,
+            num_layers=self.config.num_layers,
+            dropout=self.config.dropout,
+        )
+        optimizer = torch.optim.AdamW(model.parameters(), lr=self.config.learning_rate)
+        # Clamp so the batch size is at least 1 and at most the dataset size.
+        batch_size = max(1, min(self.config.batch_size, raw_inputs.shape[0]))
+        loss_history: list[float] = []
+        epochs = max(1, self.config.num_epochs)
+
+        for _epoch in range(epochs):
+            permutation = torch.randperm(raw_inputs.shape[0])
+            epoch_losses: list[float] = []
+            for start in range(0, raw_inputs.shape[0], batch_size):
+                batch_idx = permutation[start : start + batch_size]
+                batch_raw = raw_inputs[batch_idx]
+                batch_anchors = anchors[batch_idx]
+                batch_modalities = [modalities[int(idx)] for idx in batch_idx]
+
+                projected = model(batch_raw)
+                task_loss = _torch_contrastive_loss(projected, batch_anchors, self.config.contrastive_temperature)
+                cluster_loss = _torch_cluster_bias_loss(projected, batch_modalities)
+                scale_loss = _torch_scale_bias_loss(projected, batch_modalities)
+                loss = (
+                    self.config.contrastive_loss_weight * task_loss
+                    + self.config.bias_loss_weight * cluster_loss
+                    + self.config.scale_loss_weight * scale_loss
+                )
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+                epoch_losses.append(float(loss.detach().cpu()))
+            loss_history.append(sum(epoch_losses) / len(epoch_losses))
+
+        # Evaluation mode disables dropout for the final projection pass.
+        model.eval()
+        with torch.no_grad():
+            projected_tensor = F.normalize(model(raw_inputs), dim=1)
+        projected = projected_tensor.cpu().tolist()
+        annotations = F.normalize(anchors, dim=1).cpu().tolist()
+        recall = _recall_at_k(projected, annotations, k=min(self.config.eval_recall_k, len(projected)))
+
+        state_dict_path = None
+        if self.config.save_weights_path:
+            self.config.save_weights_path.parent.mkdir(parents=True, exist_ok=True)
+            torch.save(model.state_dict(), self.config.save_weights_path)
+            state_dict_path = str(self.config.save_weights_path)
+
+        # The MLP has no per-expert weights, so equal shares are reported.
+        equal_weight = 1.0 / len(bundle.experts)
+        metadata = {
+            "type": "torch_mlp_projection",
+            "experts": bundle.experts,
+            "input_dim": int(raw_inputs.shape[1]),
+            "embedding_dim": bundle.embedding_dim,
+            "hidden_layer_size": self.config.hidden_layer_size,
+            "num_layers": self.config.num_layers,
+            "dropout": self.config.dropout,
+            "anchor_expert": anchor_expert,
+            "contrastive_loss_weight": self.config.contrastive_loss_weight,
+            "bias_loss_weight": self.config.bias_loss_weight,
+            "scale_loss_weight": self.config.scale_loss_weight,
+            "contrastive_temperature": self.config.contrastive_temperature,
+            "state_dict_path": state_dict_path,
+        }
+        return ProjectionResult(
+            projected_raw=projected,
+            annotation_embeddings=annotations,
+            expert_weights=dict.fromkeys(bundle.experts, equal_weight),
+            loss_history=loss_history,
+            recall_at_10=recall,
+            model=metadata,
+        )
+
+
+class _ProjectionMLP:
+    """Factory for the projection network.
+
+    ``__new__`` returns a ``torch.nn.Sequential`` rather than an instance of
+    this class. The network has ``max(0, num_layers - 1)`` hidden blocks of
+    Linear + ReLU (+ Dropout when ``dropout > 0``) and a final Linear layer
+    mapping to ``output_dim``.
+    """
+
+    def __new__(
+        cls,
+        input_dim: int,
+        output_dim: int,
+        hidden_dim: int,
+        num_layers: int,
+        dropout: float,
+    ):
+        from torch import nn
+
+        modules: list[nn.Module] = []
+        width = input_dim
+        for _ in range(max(0, num_layers - 1)):
+            modules += [nn.Linear(width, hidden_dim), nn.ReLU()]
+            if dropout > 0:
+                modules.append(nn.Dropout(dropout))
+            width = hidden_dim
+        modules.append(nn.Linear(width, output_dim))
+        return nn.Sequential(*modules)
+
+
+def _mean_pair_similarity(raw: list[list[float]], annotations: list[list[float]]) -> float:
+    """Return the mean cosine similarity of position-aligned raw/annotation pairs.
+
+    Returns 0.0 when either list is empty. Pairing stops at the shorter list.
+    """
+    if not (raw and annotations):
+        return 0.0
+    similarities = list(map(cosine_similarity, raw, annotations))
+    return sum(similarities) / len(similarities)
+
+
+def _softmax(scores: dict[str, float]) -> dict[str, float]:
+    """Return the softmax of ``scores``, keyed like the input.
+
+    The maximum score is subtracted before exponentiation for numerical
+    stability. An empty input gives an empty dict; a zero normalizer falls
+    back to equal weights.
+    """
+    if not scores:
+        return {}
+    peak = max(scores.values())
+    shifted: dict[str, float] = {}
+    for name, score in scores.items():
+        shifted[name] = math.exp(score - peak)
+    normalizer = sum(shifted.values())
+    if normalizer == 0:
+        return dict.fromkeys(scores, 1.0 / len(scores))
+    return {name: weight / normalizer for name, weight in shifted.items()}
+
+
+def _weighted_sum(expert_matrices: list[list[list[float]]], weights: list[float]) -> list[list[float]]:
+    """Return the row-wise weighted sum of ``expert_matrices``.
+
+    The row count and row width are taken from the first matrix. An empty
+    list of matrices gives an empty result.
+    """
+    if not expert_matrices:
+        return []
+    reference = expert_matrices[0]
+    width = len(reference[0]) if reference else 0
+    blended: list[list[float]] = []
+    for row_idx in range(len(reference)):
+        accumulator = [0.0] * width
+        for weight, matrix in zip(weights, expert_matrices):
+            for col_idx, value in enumerate(matrix[row_idx]):
+                accumulator[col_idx] += weight * value
+        blended.append(accumulator)
+    return blended
+
+
+def _contrastive_loss(raw: list[list[float]], annotations: list[list[float]], temperature: float) -> float:
+    """Return the mean cross-entropy of each raw vector against all annotations.
+
+    Logits are cosine similarities divided by ``temperature``; the target for
+    raw vector ``i`` is annotation ``i``. Returns 0.0 when ``raw`` is empty.
+    """
+    if not raw:
+        return 0.0
+
+    def row_loss(idx: int, raw_vec: list[float]) -> float:
+        logits = [cosine_similarity(raw_vec, ann_vec) / temperature for ann_vec in annotations]
+        # Log-sum-exp, shifted by the largest logit to avoid overflow.
+        peak = max(logits)
+        log_sum_exp = peak + math.log(sum(math.exp(item - peak) for item in logits))
+        return -logits[idx] + log_sum_exp
+
+    per_row = [row_loss(idx, raw_vec) for idx, raw_vec in enumerate(raw)]
+    return sum(per_row) / len(per_row)
+
+
+def _recall_at_k(raw: list[list[float]], annotations: list[list[float]], k: int) -> dict[str, float]:
+    """Return recall@k in both retrieval directions plus their average.
+
+    A query at position ``i`` scores a hit when candidate ``i`` is among its
+    ``k`` most cosine-similar candidates. Every value is 0.0 when either list
+    is empty.
+    """
+    if not raw or not annotations:
+        return {"annotation_to_raw": 0.0, "raw_to_annotation": 0.0, "average": 0.0}
+
+    def hit_rate(queries: list[list[float]], candidates: list[list[float]]) -> float:
+        hits = 0
+        for idx, query in enumerate(queries):
+            order = sorted(
+                range(len(candidates)),
+                key=lambda cand_idx: cosine_similarity(query, candidates[cand_idx]),
+                reverse=True,
+            )
+            if idx in order[:k]:
+                hits += 1
+        return hits / len(queries)
+
+    a2r = hit_rate(annotations, raw)
+    r2a = hit_rate(raw, annotations)
+    return {"annotation_to_raw": a2r, "raw_to_annotation": r2a, "average": (a2r + r2a) / 2.0}
+
+
+def _concat_raw_expert_embeddings(bundle: EmbeddingBundle) -> list[list[float]]:
+    """Return one row per record: the experts' raw embeddings concatenated.
+
+    Experts are concatenated in ``bundle.experts`` order.
+    """
+    matrices = [bundle.raw_embeddings(expert) for expert in bundle.experts]
+    return [
+        [value for matrix in matrices for value in matrix[row_idx]]
+        for row_idx in range(len(bundle.records))
+    ]
+
+
+def _torch_available() -> bool:
+    """Return whether ``import torch`` succeeds in this environment."""
+    try:
+        import torch  # noqa: F401
+    except ImportError:
+        available = False
+    else:
+        available = True
+    return available
+
+
+def _torch_contrastive_loss(projected: Any, anchors: Any, temperature: float) -> Any:
+    """Return the symmetric cross-entropy between projections and anchors.
+
+    Both inputs are L2-normalized along ``dim=1``. Row ``i`` of ``projected``
+    is paired with row ``i`` of ``anchors``, and the loss averages both
+    retrieval directions.
+    """
+    import torch
+    import torch.nn.functional as F
+
+    unit_projected = F.normalize(projected, dim=1)
+    unit_anchors = F.normalize(anchors, dim=1)
+    similarity = unit_projected @ unit_anchors.T / temperature
+    targets = torch.arange(unit_projected.shape[0], device=unit_projected.device)
+    forward_loss = F.cross_entropy(similarity, targets)
+    backward_loss = F.cross_entropy(similarity.T, targets)
+    return 0.5 * (forward_loss + backward_loss)
+
+
+def _torch_cluster_bias_loss(projected: Any, modalities: list[str]) -> Any:
+    """Penalize per-modality centroids that drift from the overall centroid.
+
+    Returns the mean, over modalities, of the mean squared difference between
+    that modality's centroid and the centroid of all rows; a zero tensor when
+    at most one modality is present.
+    """
+    import torch
+
+    distinct = sorted(set(modalities))
+    if len(distinct) <= 1:
+        return projected.new_tensor(0.0)
+    global_centroid = projected.mean(dim=0)
+    penalties = []
+    for name in distinct:
+        rows = torch.tensor(
+            [idx for idx, value in enumerate(modalities) if value == name],
+            device=projected.device,
+        )
+        group_centroid = projected[rows].mean(dim=0)
+        penalties.append(torch.mean((group_centroid - global_centroid) ** 2))
+    return torch.stack(penalties).mean()
+
+
+def _torch_scale_bias_loss(projected: Any, modalities: list[str]) -> Any:
+    """Penalize modalities whose spread differs from the overall spread.
+
+    Spread is the mean Euclidean distance of rows to their centroid. Returns
+    the mean, over modalities, of the squared difference between that
+    modality's spread and the spread of all rows; a zero tensor when at most
+    one modality is present.
+    """
+    import torch
+
+    distinct = sorted(set(modalities))
+    if len(distinct) <= 1:
+        return projected.new_tensor(0.0)
+    global_centroid = projected.mean(dim=0)
+    global_spread = torch.norm(projected - global_centroid, dim=1).mean()
+    penalties = []
+    for name in distinct:
+        rows = torch.tensor(
+            [idx for idx, value in enumerate(modalities) if value == name],
+            device=projected.device,
+        )
+        group = projected[rows]
+        group_spread = torch.norm(group - group.mean(dim=0), dim=1).mean()
+        penalties.append((group_spread - global_spread) ** 2)
+    return torch.stack(penalties).mean()
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/__init__.py
new file mode 100644
index 0000000000..692504d96d
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Symmetric Nucleus Subsampling package."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/backends.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/backends.py
new file mode 100644
index 0000000000..cd1d37f356
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/backends.py
@@ -0,0 +1,554 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SNS similarity and description backends."""
+
+from __future__ import annotations
+
+import hashlib
+import math
+import os
+import re
+from pathlib import Path
+from typing import Any, Protocol
+
+from omnifuse_tutorial.data.io import cosine_similarity
+from omnifuse_tutorial.eee.backends import (
+    GEMMA_3N_E4B_MODEL,
+    _post_nvidia_json_with_retries,
+    describe_file_with_nvidia_api,
+)
+
+
+class SNSBackend(Protocol):  # Structural interface shared by the SNS backends defined in this module.
+    def image_text(self, image_data: Any, text: str) -> float: ...  # Image-vs-text similarity score.
+
+    def audio_text(self, audio_data: Any, text: str) -> float: ...  # Audio-vs-text similarity score.
+
+    def video_text(self, video_data: Any, text: str) -> float: ...  # Video-vs-text similarity score.
+
+    def text_text(self, left: Any, right: Any, dim: int = 2048) -> float: ...  # Text-vs-text similarity score.
+    # Pairwise ``text_text`` scores: one row per item of ``texts_a``, one column per item of ``texts_b``.
+    def text_text_matrix(self, texts_a: list[str], texts_b: list[str], batch_size: int = 16) -> Any: ...
+
+    def describe_record(self, record: dict[str, Any]) -> str: ...  # Text description of a record.
+    # Forward extraction: returns the (possibly updated) record and a report dict about the decision.
+    def forward_media(self, record: dict[str, Any], annotation: str) -> tuple[dict[str, Any], dict[str, Any]]: ...
+
+    def unload(self) -> None: ...  # Release caches and any loaded models.
+
+
+class LocalSNSBackend:
+    """Full local SNS backend using Omni-Embed similarity and forward extractors."""
+
+    def __init__(self, sns_config: Any | None = None, eee_config: Any | None = None, runtime: Any | None = None):
+        if sns_config is None:  # required: it is handed to ForwardModelStore and the forward extractors
+            raise ValueError("LocalSNSBackend requires a full SNS config")
+        self.sns_config = sns_config
+        self.eee_config = eee_config
+        self.embedding_dim = int(getattr(eee_config, "embedding_dim", 2048))
+        from omnifuse_tutorial.eee.local_models import (
+            FullLocalEEEBackend,
+            OmniEmbedNemotronRuntime,
+            resolve_device,
+            resolve_offline_mode,
+        )
+        from omnifuse_tutorial.sns.full_forward import ForwardModelStore
+
+        self.device = resolve_device(runtime)
+        self.offline_mode = resolve_offline_mode(runtime)
+        self.omni_model = str(getattr(sns_config, "nvidia_model", "nvidia/omni-embed-nemotron-3b"))
+        self._omni_runtime_cls = OmniEmbedNemotronRuntime
+        self._omni: Any | None = None  # created on first use by _ensure_omni()
+        self._text_embedding_cache: dict[str, list[float]] = {}  # text -> resized embedding
+        self._description_backend = FullLocalEEEBackend(
+            config=eee_config, runtime=runtime, embedding_dim=self.embedding_dim
+        )
+        self._forward_models = ForwardModelStore(sns_config, self.device, self.offline_mode)
+        self.require_forward_models = bool(getattr(sns_config, "require_forward_models", True))
+
+    def image_text(self, image_data: Any, text: str) -> float:  # cosine of Omni-Embed image and text vectors
+        return cosine_similarity(self._embed_media(image_data, "image"), self._embed_text(text))
+
+    def audio_text(self, audio_data: Any, text: str) -> float:  # cosine of Omni-Embed audio and text vectors
+        return cosine_similarity(self._embed_media(audio_data, "audio"), self._embed_text(text))
+
+    def video_text(self, video_data: Any, text: str) -> float:  # cosine of Omni-Embed video and text vectors
+        return cosine_similarity(self._embed_media(video_data, "video"), self._embed_text(text))
+
+    def text_text(self, left: Any, right: Any, dim: int = 2048) -> float:
+        """Cosine similarity of two texts; 0.0 if either is blank or not a string (``dim`` is unused)."""
+        left, right = _text_or_none(left) or "", _text_or_none(right) or ""
+        if not left or not right:
+            return 0.0
+        return cosine_similarity(self._embed_text(left), self._embed_text(right))
+
+    def text_text_matrix(self, texts_a: list[str], texts_b: list[str], batch_size: int = 16) -> Any:
+        """Return float32 ``text_text`` scores shaped ``(len(texts_a), len(texts_b))``, even when empty."""
+        import numpy as np
+
+        scores = [[self.text_text(left, right) for right in texts_b] for left in texts_a]
+        return np.array(scores, dtype=np.float32).reshape(len(texts_a), len(texts_b))
+
+    def describe_record(self, record: dict[str, Any]) -> str:  # delegated to the FullLocalEEEBackend describer
+        return self._description_backend.describe_record(record)
+
+    def forward_media(self, record: dict[str, Any], annotation: str) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Replace the record's media with an annotation-focused extraction when it passes the MI gate.
+
+        Returns ``(record, report)``; ``raw_path`` is updated only when ``report["accepted"]`` is true.
+        """
+        from omnifuse_tutorial.sns import full_forward
+
+        modality = str(record.get("modality") or "")
+        raw_value = record.get("raw_path")
+        try:
+            if modality == "image":
+                candidate, extraction = full_forward.forward_extract_image(raw_value, annotation, self.sns_config)
+            elif modality == "audio":
+                candidate, extraction = full_forward.forward_extract_audio(
+                    raw_value, annotation, self.sns_config, self._forward_models
+                )
+            elif modality == "video":
+                candidate, extraction = full_forward.forward_extract_video(
+                    raw_value, annotation, self.sns_config, self._forward_models
+                )
+            else:
+                return record, {
+                    "direction": "forward",
+                    "accepted": False,
+                    "reason": "unsupported_media_modality",
+                    "modality": modality,
+                }
+        except RuntimeError as exc:
+            if self.require_forward_models:
+                raise
+            return record, {
+                "direction": "forward",
+                "accepted": False,
+                "reason": "forward_model_unavailable",
+                "modality": modality,
+                "error": str(exc),
+            }
+
+        if not _changed_media(raw_value, candidate):
+            return record, {
+                "direction": "forward",
+                "accepted": False,
+                "reason": extraction.get("reason", "no_media_nucleus"),
+                "modality": modality,
+                "extraction": _jsonable(extraction),
+            }
+
+        original_sim = self._raw_annotation_similarity(raw_value, modality, annotation)
+        candidate_sim = self._raw_annotation_similarity(candidate, modality, annotation)
+        similarity_floor = max(original_sim, float(getattr(self.sns_config, "mi_eps", 0.05)))
+        mi_threshold = float(getattr(self.sns_config, "mi_ratio", 0.95)) * similarity_floor
+        if candidate_sim < mi_threshold:
+            return record, {
+                "direction": "forward",
+                "accepted": False,
+                "reason": "mi_gate_failed",
+                "modality": modality,
+                "original_similarity": original_sim,
+                "candidate_similarity": candidate_sim,
+                "mi_threshold": mi_threshold,
+                "extraction": _jsonable(extraction),
+            }
+
+        updated = dict(record)
+        updated["raw_path"] = str(candidate)
+        updated["sns_raw_text"] = None
+        return updated, {
+            "direction": "forward",
+            "accepted": True,
+            "reason": extraction.get("reason", "media_nucleus_extracted"),
+            "modality": modality,
+            "original_similarity": original_sim,
+            "candidate_similarity": candidate_sim,
+            "mi_threshold": mi_threshold,
+            "output_path": str(candidate),
+            "extraction": _jsonable(extraction),
+        }
+
+    def raw_annotation_similarity(self, raw_value: Any, modality: str, annotation: str) -> float:  # public alias
+        return self._raw_annotation_similarity(raw_value, modality, annotation)
+
+    def unload(self) -> None:  # clears the text cache and releases every lazily loaded model
+        self._text_embedding_cache.clear()
+        self._description_backend.unload()
+        self._forward_models.unload()
+        if self._omni is not None:
+            self._omni.unload()
+            self._omni = None
+
+    def _raw_annotation_similarity(self, raw_value: Any, modality: str, annotation: str) -> float:
+        if modality == "text":
+            return self.text_text(raw_value, annotation, self.embedding_dim)
+        if modality == "image":
+            return self.image_text(raw_value, annotation)
+        if modality == "audio":
+            return self.audio_text(raw_value, annotation)
+        if modality == "video":
+            return self.video_text(raw_value, annotation)
+        return 0.0  # unknown modality: nothing to compare
+
+    def _embed_text(self, text: str) -> list[float]:  # memoised per exact text string
+        if text not in self._text_embedding_cache:
+            self._text_embedding_cache[text] = self._resize_vector(self._ensure_omni().encode_text(text))
+        return self._text_embedding_cache[text]
+
+    def _embed_media(self, value: Any, modality: str) -> list[float]:  # media embeddings are not cached
+        omni = self._ensure_omni()
+        if modality == "image":
+            return self._resize_vector(omni.encode_image(value))
+        if modality == "audio":
+            return self._resize_vector(omni.encode_audio(value))
+        if modality == "video":
+            return self._resize_vector(omni.encode_video(value))
+        raise ValueError(f"Unsupported media modality: {modality}")
+
+    def _ensure_omni(self) -> Any:  # builds the Omni-Embed runtime on first use, then reuses it
+        if self._omni is None:
+            self._omni = self._omni_runtime_cls(self.omni_model, self.device, self.offline_mode)
+        return self._omni
+
+    def _resize_vector(self, vector: Any) -> list[float]:  # resize to ``embedding_dim`` via the EEE helper
+        from omnifuse_tutorial.eee.local_models import _resize_and_normalize
+
+        return _resize_and_normalize(vector, self.embedding_dim)
+
+
+class NvidiaApiSNSBackend:
+    """NVIDIA API-backed SNS similarity/describer backend.
+
+    This backend provides the model-backed backward extraction path and a
+    conservative media forward path that writes a generated text nucleus to
+    `sns_raw_text` when the media description is sufficiently aligned. It does
+    not crop pixels or cut media segments; local Grounding-DINO/DETR-style
+    extraction still requires those optional model stacks.
+    """
+
+    def __init__(self, sns_config: Any, eee_config: Any, timeout: int = 120):
+        """Resolve API credentials, endpoint and model names from ``eee_config`` and the environment.
+
+        Raises:
+            ValueError: If no API key is found in ``eee_config``, ``NV_BUILD_API_KEY`` or ``NVIDIA_API_KEY``.
+        """
+        self.sns_config = sns_config
+        self.embedding_dim = int(getattr(eee_config, "embedding_dim", 2048))
+        self.api_key = (
+            getattr(eee_config, "nvidia_api_key", None)
+            or os.environ.get("NV_BUILD_API_KEY")
+            or os.environ.get("NVIDIA_API_KEY")
+        )
+        if not self.api_key:
+            raise ValueError("NVIDIA API key required. Set NV_BUILD_API_KEY in .env or the environment.")
+        base_url = getattr(eee_config, "nvidia_api_base_url", None) or "https://integrate.api.nvidia.com/v1"
+        # A blank NVIDIA_API_BASE_URL (e.g. an empty .env entry) must not override the configured URL.
+        self.api_base_url = (os.environ.get("NVIDIA_API_BASE_URL") or base_url).rstrip("/")
+        self.text_model = getattr(eee_config, "nvidia_text_describer_model", "nvidia/nemotron-nano-12b-v2-vl")
+        self.image_model = getattr(eee_config, "nvidia_image_describer_model", "nvidia/nemotron-nano-12b-v2-vl")
+        self.video_model = getattr(eee_config, "nvidia_video_describer_model", "nvidia/nemotron-nano-12b-v2-vl")
+        self.audio_model = getattr(eee_config, "nvidia_audio_describer_model", GEMMA_3N_E4B_MODEL)
+        self.embedding_model = getattr(eee_config, "nvidia_embedding_model", "nvidia/llama-nemotron-embed-1b-v2")
+        self.timeout = timeout
+        self._embedding_cache: dict[str, list[float]] = {}
+        self._description_cache: dict[str, str] = {}
+
+    def image_text(self, image_data: Any, text: str) -> float:  # scores the image's description against text
+        return self.text_text(self._describe_media_value(image_data, "image"), text)
+
+    def audio_text(self, audio_data: Any, text: str) -> float:  # scores the audio's description against text
+        return self.text_text(self._describe_media_value(audio_data, "audio"), text)
+
+    def video_text(self, video_data: Any, text: str) -> float:  # scores the video's description against text
+        return self.text_text(self._describe_media_value(video_data, "video"), text)
+
+    def text_text(self, left: Any, right: Any, dim: int = 2048) -> float:
+        """Cosine similarity of two texts; 0.0 if either is blank or not a string (``dim`` is unused)."""
+        left, right = _text_or_none(left) or "", _text_or_none(right) or ""
+        if not left or not right:
+            return 0.0
+        return cosine_similarity(self._embed_text(left), self._embed_text(right))
+
+    def text_text_matrix(self, texts_a: list[str], texts_b: list[str], batch_size: int = 16) -> Any:
+        """Return float32 ``text_text`` scores shaped ``(len(texts_a), len(texts_b))``, even when empty."""
+        import numpy as np
+
+        scores = [[self.text_text(left, right) for right in texts_b] for left in texts_a]
+        # Without the reshape an empty ``texts_a`` would collapse to a 1-D array of shape ``(0,)``.
+        return np.array(scores, dtype=np.float32).reshape(len(texts_a), len(texts_b))
+
+    def describe_record(self, record: dict[str, Any]) -> str:
+        """Return the record's own text (``sns_raw_text`` first, then ``raw_text``) or describe its media."""
+        raw_text = _text_or_none(record.get("sns_raw_text")) or _text_or_none(record.get("raw_text"))
+        if raw_text:
+            return raw_text
+        return self._describe_media_value(record.get("raw_path"), str(record.get("modality") or "text"))
+
+    def forward_media(self, record: dict[str, Any], annotation: str) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Accept the media description as ``sns_raw_text`` when it scores at least the forward threshold."""
+        modality = str(record.get("modality") or "")
+        description = self.describe_record(record)
+        similarity = self.text_text(description, annotation)
+        threshold = _forward_threshold(self.sns_config, modality)
+        if similarity < threshold:
+            return record, {
+                "direction": "forward",
+                "accepted": False,
+                "reason": "media_description_below_threshold",
+                "modality": modality,
+                "similarity": similarity,
+                "threshold": threshold,
+            }
+        updated = dict(record)
+        updated["sns_raw_text"] = description
+        return updated, {
+            "direction": "forward",
+            "accepted": True,
+            "reason": "media_description_nucleus_api_backend",
+            "modality": modality,
+            "similarity": similarity,
+            "threshold": threshold,
+        }
+
+    def unload(self) -> None:  # only in-memory caches to drop; no local models are held
+        self._embedding_cache.clear()
+        self._description_cache.clear()
+
+    def _describe_media_value(self, value: Any, modality: str) -> str:
+        """Describe media through the API, cached per modality and path; non-files get a placeholder."""
+        if modality == "text":
+            return _text_or_none(value) or ""
+        path = _path_or_none(value)
+        if path is None or not path.is_file():  # directories (e.g. Path("") -> ".") are not describable files
+            return _describe_media(value, modality)
+        cache_key = f"{modality}:{path}"
+        if cache_key in self._description_cache:
+            return self._description_cache[cache_key]
+        prompt = _prompt_for_modality(modality)
+        if modality == "image":
+            description = self._describe_file(path, self.image_model, "image_url", prompt)
+        elif modality == "audio":
+            description = self._describe_file(path, self.audio_model, "input_audio", prompt)
+        elif modality == "video":
+            description = self._describe_file(path, self.video_model, "video_url", prompt)
+        else:
+            description = _describe_media(path, modality)
+        self._description_cache[cache_key] = description
+        return description
+
+    def _describe_file(self, path: Path, model: str, content_type: str, prompt: str) -> str:
+        return describe_file_with_nvidia_api(
+            path=path,
+            model=model,
+            content_type=content_type,
+            prompt=prompt,
+            api_base_url=self.api_base_url,
+            headers=self._headers(),
+            timeout=self.timeout,
+        )
+
+    def _embed_text(self, text: str) -> list[float]:  # cached per exact text; resized and unit-normalized
+        if text in self._embedding_cache:
+            return self._embedding_cache[text]
+        response = _post_nvidia_json_with_retries(
+            url=f"{self.api_base_url}/embeddings",
+            headers=self._headers(),
+            payload={
+                "model": self.embedding_model,
+                "input": [text],
+                "input_type": "passage",
+                "encoding_format": "float",
+                "truncate": "END",
+            },
+            timeout=self.timeout,
+        )
+        response.raise_for_status()
+        vector = [float(item) for item in response.json()["data"][0]["embedding"]]
+        vector = _resize_and_normalize(vector, self.embedding_dim)
+        self._embedding_cache[text] = vector
+        return vector
+
+    def _headers(self) -> dict[str, str]:
+        return {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+
+
+class HybridSNSBackend:
+    """API-first SNS backend with local forward extraction and multimodal scoring.
+
+    Backward extraction and text-text decisions use NVIDIA API descriptions and
+    text embeddings. Image, audio, and video forward extraction stay local
+    because those steps require Grounding-DINO, AM-DETR, and CG-DETR.
+    """
+
+    def __init__(self, sns_config: Any, eee_config: Any, runtime: Any | None = None):
+        self.api = NvidiaApiSNSBackend(sns_config, eee_config)  # text descriptions and text embeddings
+        self.local = LocalSNSBackend(sns_config, eee_config, runtime)  # media scoring, forward extraction
+        self.embedding_dim = int(getattr(eee_config, "embedding_dim", 2048))
+
+    def image_text(self, image_data: Any, text: str) -> float:  # delegated to the local backend
+        return self.local.image_text(image_data, text)
+
+    def audio_text(self, audio_data: Any, text: str) -> float:  # delegated to the local backend
+        return self.local.audio_text(audio_data, text)
+
+    def video_text(self, video_data: Any, text: str) -> float:  # delegated to the local backend
+        return self.local.video_text(video_data, text)
+
+    def text_text(self, left: Any, right: Any, dim: int = 2048) -> float:  # delegated to the API backend
+        return self.api.text_text(left, right, dim)
+
+    def text_text_matrix(self, texts_a: list[str], texts_b: list[str], batch_size: int = 16) -> Any:
+        return self.api.text_text_matrix(texts_a, texts_b, batch_size)  # delegated to the API backend
+
+    def describe_record(self, record: dict[str, Any]) -> str:  # delegated to the API backend
+        return self.api.describe_record(record)
+
+    def forward_media(self, record: dict[str, Any], annotation: str) -> tuple[dict[str, Any], dict[str, Any]]:
+        return self.local.forward_media(record, annotation)  # delegated to the local backend
+
+    def raw_annotation_similarity(self, raw_value: Any, modality: str, annotation: str) -> float:
+        if modality == "text":  # text is scored with API embeddings, media with the local backend
+            return self.api.text_text(raw_value, annotation, self.embedding_dim)
+        return self.local.raw_annotation_similarity(raw_value, modality, annotation)
+
+    def unload(self) -> None:  # unloads both wrapped backends
+        self.api.unload()
+        self.local.unload()
+
+
+def backend_factory(
+    config_or_name: Any = "local", eee_config: Any | None = None, runtime: Any | None = None
+) -> SNSBackend:  # Build the backend selected by a name or by a config's ``backend`` attribute.
+    backend = _sns_backend_name(config_or_name, eee_config)
+    if backend == "hybrid" and isinstance(config_or_name, str):
+        raise ValueError("Hybrid SNS backend requires a full SNS config")
+    if backend == "hybrid":
+        return HybridSNSBackend(config_or_name, eee_config, runtime)
+    if backend == "local":
+        return LocalSNSBackend(None if isinstance(config_or_name, str) else config_or_name, eee_config, runtime)
+    if backend == "api":
+        return NvidiaApiSNSBackend(config_or_name, eee_config)
+    raise ValueError(f"Unsupported SNS backend: {backend}")
+
+
+def _sns_backend_name(config_or_name: Any, eee_config: Any | None) -> str:
+    """Resolve the backend name; ``"auto"`` defers to ``eee_config.backend`` (default ``"hybrid"``)."""
+    requested = config_or_name
+    if not isinstance(requested, str):
+        requested = str(getattr(config_or_name, "backend", "auto"))
+    if requested == "auto":
+        return str(getattr(eee_config, "backend", "hybrid"))
+    if requested not in {"hybrid", "local", "api"}:
+        raise ValueError(f"Unsupported SNS backend: {requested}")
+    return requested
+
+
+def _text_or_none(value: Any) -> str | None:
+    """Return ``value`` stripped, or ``None`` when it is not a string or is blank."""
+    if not isinstance(value, str):
+        return None
+    return value.strip() or None
+
+
+def _path_or_none(value: Any) -> Path | None:
+    """Return ``value`` as a ``Path``; ``None`` for non-path types and blank strings."""
+    if not isinstance(value, (str, Path)):
+        return None
+    if isinstance(value, str) and not value.strip():
+        return None  # Path("") would become Path("."), which exists on disk
+    return Path(value)
+
+
+def _changed_media(original: Any, candidate: Any) -> bool:
+    """Report whether ``candidate`` is set and its string form differs from ``original``'s."""
+    # Comparison is on ``str()`` values, so a ``Path`` and an equal path string count as unchanged.
+    return candidate is not None and str(original) != str(candidate)
+
+
+def _jsonable(value: Any) -> Any:
+    """Recursively convert paths to strings, tuples to lists and dict keys to strings."""
+    if isinstance(value, dict):
+        return {str(name): _jsonable(entry) for name, entry in value.items()}
+    if isinstance(value, (list, tuple)):
+        return [_jsonable(entry) for entry in value]
+    if isinstance(value, Path):
+        return str(value)
+    # Anything else (numbers, strings, None, ...) is returned unchanged.
+    return value
+
+
+def _path_tokens(path: Path) -> list[str]:  # alphanumeric runs of the file stem, in order
+    return re.compile(r"[a-zA-Z0-9]+").findall(path.stem)
+
+
+def _describe_media(value: Any, modality: str) -> str:
+    """Build a placeholder description from a path's stem tokens and file fingerprint (else ``repr``)."""
+    located = None
+    if isinstance(value, dict):
+        # Dict inputs may carry their location under any of these keys; the first truthy one wins.
+        reference = value.get("file_path") or value.get("path") or value.get("raw_path")
+        located = _path_or_none(reference) if reference else None
+    located = located or _path_or_none(value)
+    if located:
+        return f"modality {modality} {' '.join(_path_tokens(located))} {_file_fingerprint(located)}"
+    return f"modality {modality} {value!r}"
+
+
+def _file_fingerprint(path: Path) -> str:
+    """Return ``"<suffix> <size> <sha256 prefix of first 64 KiB>"``, or ``"<suffix> unreadable"``."""
+    try:
+        size = path.stat().st_size
+        with path.open("rb") as stream:
+            head_digest = hashlib.sha256(stream.read(65536)).hexdigest()
+    except OSError:
+        return f"{path.suffix.lower()} unreadable"
+    return f"{path.suffix.lower()} {size} {head_digest[:24]}"
+
+
+def _prompt_for_modality(modality: str) -> str:
+    """Return the description prompt for ``modality``, falling back to a generic prompt."""
+    prompts = {
+        "image": "Describe the annotation-relevant visual content in this image in detail.",
+        "audio": "Transcribe and describe the annotation-relevant sounds in this audio in detail.",
+        "video": "Describe the annotation-relevant events and actions in this video in detail.",
+    }
+    return prompts.get(modality, "Describe this content in detail.")
+
+
+def _forward_threshold(config: Any, modality: str) -> float:
+    """Read the forward threshold for ``modality`` from ``config``; unknown modalities use the text one."""
+    known = {
+        "image": ("tau_forward_image", 0.30),
+        "audio": ("tau_forward_audio", 0.25),
+        "video": ("tau_forward_video", 0.20),
+    }
+    return float(getattr(config, *known.get(modality, ("tau_forward_text", 0.30))))
+
+
+def _resize_and_normalize(vector: list[float], dim: int) -> list[float]:
+    """Zero-pad or truncate ``vector`` to ``dim`` entries, then scale it to unit L2 norm."""
+    missing = dim - len(vector)
+    if missing > 0:
+        vector = vector + [0.0] * missing
+    elif missing < 0:
+        vector = vector[:dim]
+    length = math.sqrt(sum(component * component for component in vector))
+    return vector if length == 0 else [component / length for component in vector]
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/full_forward.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/full_forward.py
new file mode 100644
index 0000000000..bf6aa28f19
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/full_forward.py
@@ -0,0 +1,446 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Full SNS forward extraction for images, audio, and video."""
+
+from __future__ import annotations
+
+import importlib.util
+import io
+import logging
+import sys
+import tempfile
+import types
+import uuid
+from functools import lru_cache
+from pathlib import Path
+from typing import Any
+
+LOGGER = logging.getLogger(__name__)
+
+
+class ForwardModelStore:
+    """Lazy loader for SNS forward extraction models."""
+
+    def __init__(self, config: Any, device: str, offline_mode: bool = False):
+        self.config = config
+        self.device = device
+        self.offline_mode = offline_mode
+        self._amdetr_model: Any | None = None
+        self._amdetr_available: bool | None = None  # None: not tried yet; False: a load attempt failed
+        self._videomomentdetr_model: Any | None = None
+        self._videomomentdetr_available: bool | None = None  # same tri-state, for the CG-DETR predictor
+
+    def amdetr_model(self) -> Any:  # Returns the cached AM-DETR model, loading it on the first call.
+        if self._amdetr_available is False:  # a failed load is remembered and not retried
+            raise RuntimeError("AM-DETR model is unavailable from a previous load attempt")
+        if self._amdetr_model is not None:
+            return self._amdetr_model
+        try:
+            self._amdetr_model = self._load_amdetr_model()
+            self._amdetr_available = True
+            return self._amdetr_model
+        except Exception as exc:  # any load failure is re-raised as RuntimeError with the cause chained
+            self._amdetr_available = False
+            raise RuntimeError(
+                "Audio forward extraction requires AM-DETR. Install SNS/Lighthouse extras and "
+                "allow Hugging Face access or cache lighthouse-emnlp2024/AM-DETR."
+            ) from exc
+
+    def video_moment_model(self) -> Any:  # Returns the cached CG-DETR predictor, loading it on first call.
+        if self._videomomentdetr_available is False:  # a failed load is remembered and not retried
+            raise RuntimeError("CG-DETR model is unavailable from a previous load attempt")
+        if self._videomomentdetr_model is not None:
+            return self._videomomentdetr_model
+        try:
+            self._videomomentdetr_model = self._load_cgdetr_model()
+            self._videomomentdetr_available = True
+            return self._videomomentdetr_model
+        except Exception as exc:  # any load failure is re-raised as RuntimeError with the cause chained
+            self._videomomentdetr_available = False
+            checkpoint = _checkpoint_path(self.config)
+            raise RuntimeError(
+                "Video forward extraction requires CG-DETR from the lighthouse package and "
+                f"a checkpoint at {checkpoint}. Place the QVHighlights clip checkpoint there "
+                "or set sns.cg_detr_checkpoint."
+            ) from exc
+
+    def unload(self) -> None:  # Drops both model references; the ``*_available`` flags are left as they are.
+        self._amdetr_model = None
+        self._videomomentdetr_model = None
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except ImportError:  # without torch there is no CUDA cache to clear
+            return
+
+    def _load_cgdetr_model(self) -> Any:  # Raises FileNotFoundError when the checkpoint file is missing.
+        import easydict
+        import torch
+        from lighthouse.models import CGDETRPredictor
+
+        checkpoint = _checkpoint_path(self.config)
+        if not checkpoint.exists():
+            raise FileNotFoundError(checkpoint)
+        torch.serialization.add_safe_globals([easydict.EasyDict])
+        original_torch_load = torch.load
+
+        def patched_torch_load(*args: Any, **kwargs: Any) -> Any:
+            kwargs["weights_only"] = False  # NOTE(review): full unpickling; trusted checkpoints only
+            return original_torch_load(*args, **kwargs)
+
+        torch.load = patched_torch_load  # process-wide patch while the predictor is built; undone in finally
+        try:
+            return CGDETRPredictor(str(checkpoint), device=self.device, feature_name="clip")
+        finally:
+            torch.load = original_torch_load
+
+    def _load_amdetr_model(self) -> Any:  # Loads AM-DETR from a Hugging Face snapshot of ``amdetr_repo_id``.
+        import torch
+        from huggingface_hub import snapshot_download
+
+        repo_id = str(getattr(self.config, "amdetr_repo_id", "lighthouse-emnlp2024/AM-DETR"))
+        model_dir = snapshot_download(repo_id, local_files_only=self.offline_mode)
+        package_name = "omnifuse_amdetr_runtime"
+        if package_name not in sys.modules:  # register a synthetic package once per process
+            pkg = types.ModuleType(package_name)
+            pkg.__path__ = [model_dir]  # rooted at the downloaded snapshot directory
+            pkg.__package__ = package_name
+            sys.modules[package_name] = pkg
+        config_module = _load_module(
+            f"{package_name}.configuration_amdetr",
+            Path(model_dir) / "configuration_amdetr.py",
+            package_name,
+        )
+        sys.modules["configuration_amdetr"] = config_module  # also registered under its bare module name
+        model_module = _load_module(
+            f"{package_name}.modeling_amdetr",
+            Path(model_dir) / "modeling_amdetr.py",
+            package_name,
+        )
+        sys.modules["modeling_amdetr"] = model_module  # also registered under its bare module name
+        config = config_module.AMDETRConfig(device="cpu")
+        model = model_module.AMDETRPredictorWrapper(config)
+        state_dict = torch.load(Path(model_dir) / "pytorch_model.bin", map_location="cpu", weights_only=False)
+        model.load_state_dict(state_dict, strict=False)  # NOTE(review): weights came from a full unpickle
+        try:
+            model = model.to(self.device)
+        except NotImplementedError:  # presumably raised for meta-device parameters — TODO confirm
+            model = model.to_empty(device=self.device)
+            model.load_state_dict(state_dict, strict=False, assign=True)
+        model.eval()
+        return model
+
+
+def forward_extract_image(raw_data: Any, annotation: str, config: Any) -> tuple[Any, dict[str, Any]]:
+    """Crop an image to the union of the Grounding-DINO boxes found for the annotation components.
+
+    Returns ``(output_path, report)`` for a saved crop, else ``(raw_data, report)`` with the input untouched.
+    """
+    from PIL import Image, ImageOps
+
+    from omnifuse_tutorial.sns.model_utils.grounding_dino import (
+        calculate_lurl_from_xywh,
+        calculate_min_span_bbox,
+        get_bboxes,
+    )
+
+    if isinstance(raw_data, (str, Path)):
+        # exif_transpose returns a new image, so the file handle can be released right away.
+        with Image.open(raw_data) as opened:
+            image = ImageOps.exif_transpose(opened)
+    else:
+        image = ImageOps.exif_transpose(raw_data)
+    if image.mode != "RGB":
+        image = image.convert("RGB")
+    components = annotation_components(annotation, config)
+    model_id = str(getattr(config, "grounding_dino_model_id", "IDEA-Research/grounding-dino-tiny"))
+    threshold = float(getattr(config, "tau_forward_image", 0.30))
+    kept: list[tuple[float, float, float, float]] = []
+    component_scores: list[float] = []
+    for component in components:
+        boxes, scores = get_bboxes(image, component, model_id=model_id, box_threshold=threshold)
+        kept.extend(boxes)
+        component_scores.extend(scores)
+    if not kept:
+        return raw_data, {"reason": "no_bboxes_above_threshold", "components": components, "threshold": threshold}
+    span = calculate_min_span_bbox(kept)
+    if span is None:
+        return raw_data, {"reason": "no_bboxes_above_threshold", "components": components}
+    padding = int(getattr(config, "bbox_padding_px", 0))
+    crop_box = calculate_lurl_from_xywh(span, padding=padding, image_width=image.width, image_height=image.height)
+    output_path = _output_dir(config) / f"{uuid.uuid4()}.jpg"
+    image.crop(crop_box).save(output_path)
+    return output_path, {
+        "reason": "image_grounding_dino_crop",
+        "components": components,
+        "boxes": kept,
+        "scores": component_scores,
+        "union_bbox": crop_box,
+        "threshold": threshold,
+        "bbox_padding_px": padding,
+    }
+
+
+def forward_extract_audio(
+    raw_data: Any,
+    annotation: str,
+    config: Any,
+    model_store: ForwardModelStore,
+) -> tuple[Any, dict[str, Any]]:
+    """Keep only the audio spans that the moment model matches to the annotation.
+
+    Args:
+        raw_data: A file path (``str``/``Path``), a dict carrying an ``"audio"``
+            array and an optional ``"sample_rate"`` (16000 when absent), or any
+            other object, which is wrapped in ``io.BytesIO`` and decoded by librosa.
+        annotation: Annotation text; split into query components by
+            ``annotation_components``.
+        config: Object read through ``getattr`` for ``tau_forward_audio`` (0.25),
+            ``max_audio_segments`` (5) and ``min_segment_duration`` (2.0).
+        model_store: Supplies the model through ``amdetr_model()``.
+
+    Returns:
+        ``(output_path, details)`` when at least one window is selected; the
+        selected spans are concatenated in start-time order and written as a WAV
+        file. Otherwise ``(raw_data, details)`` where ``details["reason"]``
+        explains why nothing was extracted.
+    """
+    # Heavy optional dependencies are imported only when this function runs.
+    import librosa
+    import soundfile as sf
+    import torch
+    import torch.nn.functional as F
+
+    components = annotation_components(annotation, config)
+    audio_input: str | io.BytesIO
+    temp_path: Path | None = None
+    try:
+        if isinstance(raw_data, (str, Path)):
+            audio_input = str(raw_data)
+            # sr=None / mono=False: keep the file's own sample rate and channels.
+            waveform_np, sample_rate = librosa.load(str(raw_data), sr=None, mono=False)
+        elif isinstance(raw_data, dict):
+            audio_array = raw_data.get("audio")
+            sample_rate = int(raw_data.get("sample_rate", 16000))
+            if audio_array is None:
+                return raw_data, {"reason": "no_audio_in_dict"}
+            # The encoder below is handed a path, so the in-memory array is
+            # spilled to a temporary WAV file first.
+            temp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+            temp_path = Path(temp.name)
+            temp.close()
+            # NOTE(review): the transpose implies 2-D arrays arrive as
+            # (channels, samples) -- confirm against callers.
+            audio_to_save = audio_array if getattr(audio_array, "ndim", 1) == 1 else audio_array.T
+            sf.write(str(temp_path), audio_to_save, sample_rate)
+            audio_input = str(temp_path)
+            waveform_np = audio_array if getattr(audio_array, "ndim", 1) == 2 else audio_array[None, :]
+        else:
+            # Anything else is treated as an encoded audio byte buffer.
+            audio_input = io.BytesIO(raw_data)
+            waveform_np, sample_rate = librosa.load(io.BytesIO(raw_data), sr=None, mono=False)
+        # Normalise to two dimensions so the slicing below can index axis 1.
+        if waveform_np.ndim == 1:
+            waveform_np = waveform_np[None, :]
+        waveform = torch.from_numpy(waveform_np).float()
+        model = model_store.amdetr_model()
+        feats = model.encode_audio(audio_path=audio_input)
+    finally:
+        # The temporary WAV is only needed while encoding.
+        if temp_path is not None:
+            temp_path.unlink(missing_ok=True)
+
+    device = next(model.parameters()).device
+    feats_device = {
+        key: value.to(device) if isinstance(value, torch.Tensor) else value for key, value in feats.items()
+    }
+    # Candidate windows as (start, end, score), pooled over all components.
+    all_windows: list[tuple[float, float, float]] = []
+    for component in components:
+        # NOTE(review): this loop relies on private attributes of the model
+        # wrapper (_text_encoder, _feature_name, _model, _post_processing).
+        query_feats, query_mask = model._text_encoder.encode(component)
+        if model._feature_name != "resnet_glove":
+            query_feats = F.normalize(query_feats, dim=-1, eps=1e-5)
+        query_feats = query_feats.to(device)
+        query_mask = query_mask.to(device)
+        model_inputs = {
+            # Audio features stand in for "src_vid" when no video features exist.
+            "src_vid": feats_device.get("video_feats", feats_device.get("audio_feats")),
+            "src_vid_mask": feats_device.get("video_mask"),
+            "src_txt": query_feats,
+            "src_txt_mask": query_mask,
+            "src_aud": feats_device.get("audio_feats"),
+        }
+        model_inputs = {
+            key: value.to(device) if isinstance(value, torch.Tensor) else value for key, value in model_inputs.items()
+        }
+        with torch.inference_mode():
+            outputs = model._model(**model_inputs)
+        ranked_moments, _ = model._post_processing(model_inputs, outputs)
+        for start, end, score in ranked_moments:
+            if float(score) >= float(getattr(config, "tau_forward_audio", 0.25)):
+                all_windows.append((float(start), float(end), float(score)))
+
+    selected = _select_non_overlapping(
+        all_windows,
+        max_segments=int(getattr(config, "max_audio_segments", 5)),
+        min_duration=float(getattr(config, "min_segment_duration", 2.0)),
+    )
+    if not selected:
+        return raw_data, {"reason": "no_audio_segments_above_threshold", "total_segments": len(all_windows)}
+
+    # Window bounds are multiplied by the sample rate to obtain sample indices.
+    segments = [waveform[:, int(start * sample_rate) : int(end * sample_rate)] for start, end in selected]
+    concatenated = torch.cat(segments, dim=1)
+    output_path = _output_dir(config) / f"{uuid.uuid4()}_audio.wav"
+    # Transposed back before writing, mirroring the transpose used when saving above.
+    sf.write(str(output_path), concatenated.numpy().T, sample_rate, format="WAV")
+    return output_path, {
+        "reason": "audio_amdetr_segments",
+        "components": components,
+        "segments_kept": len(selected),
+        "total_segments": len(all_windows),
+        "kept_intervals": selected,
+        "threshold": float(getattr(config, "tau_forward_audio", 0.25)),
+    }
+
+
+def forward_extract_video(
+    raw_data: Any,
+    annotation: str,
+    config: Any,
+    model_store: ForwardModelStore,
+) -> tuple[Any, dict[str, Any]]:
+    """Keep only the video spans that the moment model matches to the annotation.
+
+    Args:
+        raw_data: A video path (``str``/``Path``) or a dict with ``"file_path"``
+            and optionally ``"frames"`` (already decoded RGB frames, as a list
+            or a NumPy array).
+        annotation: Annotation text; split into query components by
+            ``annotation_components``.
+        config: Object read through ``getattr`` for ``tau_forward_video`` (0.20),
+            ``max_video_segments`` (5) and ``min_segment_duration`` (2.0).
+        model_store: Supplies the model through ``video_moment_model()``.
+
+    Returns:
+        ``(output_path, details)`` when at least one window is selected; the
+        selected frame ranges are concatenated in start-time order and written
+        as an ``mp4v`` file. Otherwise ``(raw_data, details)`` where
+        ``details["reason"]`` explains why nothing was extracted.
+    """
+    # Heavy optional dependencies are imported only when this function runs.
+    import cv2
+    import numpy as np
+    import torch
+
+    components = annotation_components(annotation, config)
+    if isinstance(raw_data, dict):
+        video_path = raw_data.get("file_path")
+        preloaded_frames = raw_data.get("frames")
+    else:
+        video_path = str(raw_data) if isinstance(raw_data, (str, Path)) else None
+        preloaded_frames = None
+    if not video_path:
+        return raw_data, {"reason": "no_video_path"}
+
+    model = model_store.video_moment_model()
+    feats = model.encode_video(video_path=str(video_path))
+    cap = cv2.VideoCapture(str(video_path))
+    try:
+        # A reported FPS of 0 (unknown) falls back to 30.
+        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
+        if preloaded_frames is not None:
+            frames = preloaded_frames
+        else:
+            frames = []
+            while True:
+                ok, frame = cap.read()
+                if not ok:
+                    break
+                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+    finally:
+        # Release the capture handle even when decoding raises.
+        cap.release()
+    # len() rather than truthiness: preloaded frames may be a NumPy array, and
+    # ``not array`` raises ValueError for arrays with more than one element.
+    if len(frames) == 0:
+        return raw_data, {"reason": "no_frames"}
+
+    threshold = float(getattr(config, "tau_forward_video", 0.20))
+    # Candidate windows as (start, end, score), pooled over all components.
+    all_windows: list[tuple[float, float, float]] = []
+    for component in components:
+        prediction = model.predict(component, feats)
+        for start, end, score in prediction.get("pred_relevant_windows", []):
+            if float(score) >= threshold:
+                all_windows.append((float(start), float(end), float(score)))
+    selected = _select_non_overlapping(
+        all_windows,
+        max_segments=int(getattr(config, "max_video_segments", 5)),
+        min_duration=float(getattr(config, "min_segment_duration", 2.0)),
+    )
+    if not selected:
+        return raw_data, {"reason": "no_video_segments_above_threshold", "total_segments": len(all_windows)}
+
+    video_tensor = (
+        torch.from_numpy(np.array(frames)).float() if isinstance(frames, list) else torch.from_numpy(frames).float()
+    )
+    # Window bounds are multiplied by the frame rate to obtain frame indices.
+    segments = [video_tensor[int(start * fps) : int(end * fps)] for start, end in selected]
+    concatenated = torch.cat(segments, dim=0)
+    output_path = _output_dir(config) / f"{uuid.uuid4()}.mp4"
+    height, width = int(concatenated.shape[1]), int(concatenated.shape[2])
+    writer = cv2.VideoWriter(str(output_path), cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
+    try:
+        for index in range(concatenated.shape[0]):
+            frame = concatenated[index].numpy().astype(np.uint8)
+            # Frames are held as RGB; OpenCV writes BGR.
+            writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
+    finally:
+        # Always finalise the container so a failure does not leak the writer.
+        writer.release()
+    return output_path, {
+        "reason": "video_cgdetr_segments",
+        "components": components,
+        "segments_kept": len(selected),
+        "total_segments": len(all_windows),
+        "kept_intervals": selected,
+        "threshold": threshold,
+    }
+
+
+def annotation_components(annotation: str, config: Any) -> list[str]:
+    """Split an annotation into de-duplicated noun-chunk and verb-phrase queries.
+
+    When ``config.use_ann_components`` is falsy the whole annotation is returned
+    as a single component (or nothing if it is blank). Otherwise spaCy noun
+    chunks (pronouns removed) come first, followed by one phrase per verb made
+    of the verb and its direct VERB/ADV/NOUN/PROPN/ADJ dependants in document
+    order. Duplicates are dropped case-insensitively, keeping first occurrences.
+
+    Raises:
+        RuntimeError: If the spaCy pipeline cannot be obtained.
+    """
+    if not getattr(config, "use_ann_components", True):
+        if annotation.strip():
+            return [annotation]
+        return []
+
+    try:
+        pipeline = _spacy_model()
+    except Exception as exc:
+        raise RuntimeError(
+            "SNS annotation component extraction requires spaCy model en_core_web_sm. "
+            "Install it with `python -m spacy download en_core_web_sm`, or set "
+            "sns.use_ann_components: false to use whole annotations."
+        ) from exc
+    parsed = pipeline(annotation)
+
+    phrases: list[str] = []
+    for chunk in parsed.noun_chunks:
+        words = [word.text for word in chunk if word.pos_ != "PRON"]
+        if words:
+            phrases.append(" ".join(words))
+
+    dependant_tags = {"VERB", "ADV", "NOUN", "PROPN", "ADJ"}
+    for verb in parsed:
+        if verb.pos_ != "VERB":
+            continue
+        verb_phrase = [
+            other.text
+            for other in parsed
+            if other == verb or (other.head == verb and other.pos_ in dependant_tags)
+        ]
+        if verb_phrase:
+            phrases.append(" ".join(verb_phrase))
+
+    # Fall back to the whole annotation when parsing produced no phrases.
+    emitted = set()
+    result = []
+    for phrase in phrases or [annotation]:
+        cleaned = phrase.strip()
+        if not cleaned:
+            continue
+        key = cleaned.lower()
+        if key in emitted:
+            continue
+        emitted.add(key)
+        result.append(cleaned)
+    return result
+
+
+@lru_cache(maxsize=1)
+def _spacy_model() -> Any:
+    """Load the ``en_core_web_sm`` spaCy pipeline; the result is cached after the first call."""
+    import spacy
+
+    pipeline = spacy.load("en_core_web_sm")
+    return pipeline
+
+
+def _select_non_overlapping(
+    windows: list[tuple[float, float, float]],
+    max_segments: int,
+    min_duration: float,
+) -> list[tuple[float, float]]:
+    """Greedily pick the highest-scoring windows that do not overlap each other.
+
+    Args:
+        windows: ``(start, end, score)`` candidates.
+        max_segments: Upper bound on how many windows to keep; a value of zero
+            or less keeps nothing.
+        min_duration: Windows shorter than this (``end - start``) are ignored.
+
+    Returns:
+        The kept ``(start, end)`` pairs sorted by start time.
+    """
+    # Without this guard the limit is only checked after the first append, so
+    # a non-positive limit would still let one window through.
+    if max_segments <= 0:
+        return []
+    selected: list[tuple[float, float]] = []
+    # Visit candidates best-first so a stronger window always wins an overlap.
+    for start, end, _ in sorted(windows, key=lambda item: item[2], reverse=True):
+        if end - start < min_duration:
+            continue
+        if any(start < kept_end and end > kept_start for kept_start, kept_end in selected):
+            continue
+        selected.append((start, end))
+        if len(selected) >= max_segments:
+            break
+    return sorted(selected, key=lambda item: item[0])
+
+
+def _output_dir(config: Any) -> Path:
+    """Return the SNS output directory, creating it (and its parents) when missing.
+
+    ``config.sns_output_dir`` is used when set to a truthy value; otherwise the
+    relative path ``outputs/sns`` is used.
+    """
+    configured = getattr(config, "sns_output_dir", None)
+    target = Path(configured) if configured else Path("outputs") / "sns"
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+def _checkpoint_path(config: Any) -> Path:
+    """Return ``config.cg_detr_checkpoint`` as a path, or ``model_files/best.ckpt`` when unset."""
+    configured = getattr(config, "cg_detr_checkpoint", None)
+    if configured:
+        return Path(configured)
+    return Path("model_files") / "best.ckpt"
+
+
+def _load_module(module_name: str, path: Path, package_name: str) -> Any:
+    """Import the source file at *path* under *module_name* and return the module.
+
+    The module is registered in ``sys.modules`` before its code runs, and its
+    ``__package__`` is set to *package_name*. The file's own directory is
+    passed as the spec's submodule search location.
+
+    Raises:
+        ImportError: If no import spec or loader can be built for *path*.
+        Exception: Anything raised while executing the module body; in that
+            case the partially initialised module is removed from
+            ``sys.modules`` again before the error propagates.
+    """
+    spec = importlib.util.spec_from_file_location(module_name, path, submodule_search_locations=[str(path.parent)])
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not load module spec for {path}")
+    module = importlib.util.module_from_spec(spec)
+    module.__package__ = package_name
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind for later imports.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/model_utils/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/model_utils/__init__.py
new file mode 100644
index 0000000000..cb9117a92e
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/model_utils/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SNS model utility adapters."""
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/model_utils/grounding_dino.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/model_utils/grounding_dino.py
new file mode 100644
index 0000000000..da0f262cd8
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/model_utils/grounding_dino.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Grounding-DINO helpers for image forward extraction."""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Any
+
+import numpy as np
+
+LOGGER = logging.getLogger(__name__)
+DEFAULT_GROUNDING_DINO_MODEL_ID = "IDEA-Research/grounding-dino-tiny"
+DEFAULT_BOX_THRESHOLD = 0.05
+
+
+@dataclass(frozen=True)
+class GroundingDinoDetector:
+    """Immutable bundle of a loaded processor, detection model, and their device."""
+
+    # Preprocessor obtained from ``AutoProcessor.from_pretrained``.
+    processor: Any
+    # Detection model, already moved to ``device``.
+    model: Any
+    # Device that model inputs must be moved to before inference.
+    device: Any
+
+
+def _device() -> Any:
+    """Choose the torch device to run on: CUDA first, then MPS, then CPU."""
+    import torch
+
+    if torch.cuda.is_available():
+        chosen = "cuda"
+    else:
+        # Older torch builds may not expose an MPS backend at all.
+        mps = getattr(torch.backends, "mps", None)
+        chosen = "mps" if mps and mps.is_available() else "cpu"
+    return torch.device(chosen)
+
+
+@lru_cache(maxsize=4)
+def _load_detector(model_id: str) -> GroundingDinoDetector:
+    """Load (and cache per ``model_id``) the processor and model for detection.
+
+    Locally available files are tried first; if that attempt fails, both
+    objects are loaded again without the local-only restriction.
+
+    Raises:
+        RuntimeError: If torch, transformers or huggingface_hub cannot be imported.
+    """
+    try:
+        import torch
+        from huggingface_hub.utils import LocalEntryNotFoundError
+        from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
+    except ImportError as exc:
+        raise RuntimeError(
+            "Grounding-DINO forward extraction requires transformers and huggingface_hub. "
+            "Install the local extras with `python -m pip install -e '.[full]'`."
+        ) from exc
+
+    device = _device()
+
+    def _load_pair(**hub_kwargs: Any) -> tuple[Any, Any]:
+        # Processor first, then the model moved onto the chosen device.
+        loaded_processor = AutoProcessor.from_pretrained(model_id, **hub_kwargs)
+        loaded_model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id, **hub_kwargs).to(device)
+        return loaded_processor, loaded_model
+
+    try:
+        processor, model = _load_pair(local_files_only=True)
+    except (LocalEntryNotFoundError, OSError, ValueError):
+        processor, model = _load_pair()
+    model.eval()
+    return GroundingDinoDetector(processor=processor, model=model, device=device)
+
+
+def get_bboxes(
+    image: Any,
+    prompt: str,
+    model_id: str = DEFAULT_GROUNDING_DINO_MODEL_ID,
+    box_threshold: float = DEFAULT_BOX_THRESHOLD,
+) -> tuple[list[tuple[float, float, float, float]], list[float]]:
+    """Detect regions matching *prompt* and return them as xywh boxes with scores.
+
+    Args:
+        image: A ``PIL.Image.Image``; converted to RGB when in another mode.
+        prompt: Text query; normalised by ``_normalize_prompt``.
+        model_id: Model identifier handed to ``_load_detector``.
+        box_threshold: Passed as ``threshold`` to the processor's post-processing.
+
+    Raises:
+        TypeError: If *image* is not a PIL image.
+    """
+
+    import torch
+    from PIL import Image
+
+    if not isinstance(image, Image.Image):
+        raise TypeError(f"Expected PIL.Image.Image, got {type(image)!r}")
+    rgb_image = image if image.mode == "RGB" else image.convert("RGB")
+    text_query = _normalize_prompt(prompt)
+    detector = _load_detector(model_id)
+
+    encoded = detector.processor(images=rgb_image, text=text_query, return_tensors="pt")
+    on_device = {}
+    for field_name, field_value in encoded.items():
+        on_device[field_name] = field_value.to(detector.device) if hasattr(field_value, "to") else field_value
+    with torch.inference_mode():
+        raw_outputs = detector.model(**on_device)
+
+    # PIL reports (width, height); the reversed pair is what gets passed on.
+    reversed_size = rgb_image.size[::-1]
+    detections = detector.processor.post_process_grounded_object_detection(
+        raw_outputs,
+        on_device["input_ids"],
+        threshold=box_threshold,
+        target_sizes=[reversed_size],
+    )[0]
+    xywh_boxes = [_xyxy_to_xywh(corners) for corners in detections["boxes"].detach().cpu().tolist()]
+    confidences = [float(value) for value in detections["scores"].detach().cpu().tolist()]
+    return xywh_boxes, confidences
+
+
+def calculate_min_span_bbox(
+    bboxes: list[tuple[float, float, float, float]],
+) -> tuple[float, float, float, float] | None:
+    """Return the smallest xywh box enclosing every box in *bboxes*, or ``None`` if empty.
+
+    Both edges of every box take part in the min and the max, so boxes with a
+    negative width or height are still enclosed.
+    """
+    if not bboxes:
+        return None
+    x_edges = [edge for x, _, width, _ in bboxes for edge in (x, x + width)]
+    y_edges = [edge for _, y, _, height in bboxes for edge in (y, y + height)]
+    # Seeding with +/-inf keeps the comparison sequence of a running min/max.
+    left = min(np.inf, *x_edges)
+    top = min(np.inf, *y_edges)
+    right = max(-np.inf, *x_edges)
+    bottom = max(-np.inf, *y_edges)
+    return float(left), float(top), float(right - left), float(bottom - top)
+
+
+def calculate_lurl_from_xywh(
+    bbox: tuple[float, float, float, float],
+    padding: int = 0,
+    image_width: int = 0,
+    image_height: int = 0,
+) -> tuple[int, int, int, int]:
+    """Convert an xywh box into an integer ``(left, upper, right, lower)`` crop box.
+
+    Args:
+        bbox: ``(x, y, width, height)``.
+        padding: Pixels added on every side; values of zero or less add nothing.
+        image_width: When positive, ``right`` is clamped to it. The default of
+            ``0`` means the width is unknown and no clamping is applied.
+        image_height: When positive, ``lower`` is clamped to it; ``0`` means
+            unknown.
+
+    Returns:
+        The box corners truncated to ``int``. ``left`` and ``upper`` are never
+        negative.
+    """
+    left, upper, width, height = bbox
+    right = left + width
+    lower = upper + height
+    if padding > 0:
+        left -= padding
+        upper -= padding
+        right += padding
+        lower += padding
+    # Clamp regardless of padding: an unpadded box can still lie partly
+    # outside the image.
+    left = max(0, left)
+    upper = max(0, upper)
+    # Only clamp against dimensions that were actually supplied; clamping to
+    # the default of 0 would collapse the box.
+    if image_width > 0:
+        right = min(image_width, right)
+    if image_height > 0:
+        lower = min(image_height, lower)
+    return int(left), int(upper), int(right), int(lower)
+
+
+def _xyxy_to_xywh(box: list[float]) -> tuple[float, float, float, float]:
+    """Convert a corner-format ``[x0, y0, x1, y1]`` box to ``(x, y, width, height)``."""
+    left, top, right, bottom = box
+    span_x = right - left
+    span_y = bottom - top
+    return float(left), float(top), float(span_x), float(span_y)
+
+
+def _normalize_prompt(prompt: str) -> str:
+    """Strip *prompt* and make sure it ends with a period.
+
+    Raises:
+        ValueError: If the prompt is empty after stripping.
+    """
+    text = prompt.strip()
+    if text == "":
+        raise ValueError("Grounding-DINO prompt cannot be empty")
+    return text if text.endswith(".") else f"{text}."
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/processor.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/processor.py
new file mode 100644
index 0000000000..5be9204dc1
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/sns/processor.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Lightweight SNS processor with explicit model-backend extension points."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import Any
+
+from omnifuse_tutorial.config.models import SNSConfig
+from omnifuse_tutorial.sns.backends import SNSBackend
+
+SENTENCE_RE = re.compile(r"(?<=[.!?])\s+|\n+")
+
+
+@dataclass
+class SNSProcessor:
+    """Apply forward and/or backward SNS filtering to pair records.
+
+    The forward pass trims the raw side of a record against its annotation and
+    the backward pass trims the annotation against the raw side; which passes
+    run is decided by ``config.direction``. All similarity scoring is delegated
+    to ``backend``.
+    """
+
+    # Thresholds, direction and feature switches.
+    config: SNSConfig
+    # Provides the similarity, media-extraction and description calls.
+    backend: SNSBackend
+    # Forwarded unchanged as the third argument of ``backend.text_text``.
+    embedding_dim: int = 64
+
+    def process_record(self, record: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Process one record and return ``(output_record, manifest)``.
+
+        When SNS is disabled the record is copied with ``sns_raw_text`` and
+        ``sns_annotation`` mirroring the inputs, and the manifest reports
+        ``reason="disabled"``. Otherwise the configured passes run in order
+        (forward, then backward) and the manifest's ``accepted`` flag is true
+        if any pass accepted its candidate.
+        """
+        if not self.config.enabled:
+            output = dict(record)
+            output["sns_raw_text"] = record.get("raw_text")
+            output["sns_annotation"] = record.get("annotation")
+            manifest = self._manifest(record, output, enabled=False, accepted=False, reason="disabled")
+            return output, manifest
+
+        output = dict(record)
+        output["sns_raw_text"] = _text_or_none(record.get("raw_text"))
+        output["sns_annotation"] = _text_or_none(record.get("annotation")) or ""
+        decisions: list[dict[str, Any]] = []
+
+        # The backward pass sees whatever the forward pass produced.
+        if self.config.direction in {"forward", "bidirectional"}:
+            output, decision = self._forward(output)
+            decisions.append(decision)
+        if self.config.direction in {"backward", "bidirectional"}:
+            output, decision = self._backward(output)
+            decisions.append(decision)
+
+        accepted = any(item.get("accepted") for item in decisions)
+        manifest = self._manifest(record, output, enabled=True, accepted=accepted, reason="processed")
+        manifest["decisions"] = decisions
+        return output, manifest
+
+    def _forward(self, record: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Trim the raw side of *record* against its annotation.
+
+        Non-text modalities are handed to ``backend.forward_media``. For text,
+        a sentence is kept when its best similarity to any annotation component
+        reaches ``tau_forward_text``; the kept sentences are joined with spaces
+        and must then pass the MI gate. Returns ``(record, decision)``; the
+        record is returned unchanged when the candidate is rejected.
+        """
+        modality = record["modality"]
+        annotation = _text_or_none(record.get("sns_annotation")) or _text_or_none(record.get("annotation")) or ""
+        if modality != "text":
+            return self.backend.forward_media(record, annotation)
+
+        raw_text = _text_or_none(record.get("raw_text")) or ""
+        sentences = _sentences(raw_text)
+        tau = self.config.tau_forward_text
+        components = self._annotation_components(annotation)
+        # With no components nothing can match, so every sentence is dropped.
+        kept = [
+            sentence
+            for sentence in sentences
+            if components
+            and max(self.backend.text_text(sentence, component, self.embedding_dim) for component in components) >= tau
+        ]
+        if not kept:
+            return record, {"direction": "forward", "accepted": False, "reason": "no_sentence_above_threshold"}
+
+        candidate = " ".join(kept)
+        if not self._passes_mi_gate("text", candidate, annotation, raw_text, annotation):
+            return record, {"direction": "forward", "accepted": False, "reason": "mi_gate_failed"}
+
+        updated = dict(record)
+        updated["sns_raw_text"] = candidate
+        return updated, {
+            "direction": "forward",
+            "accepted": True,
+            "reason": "text_sentences_kept",
+            "kept_sentences": len(kept),
+            "total_sentences": len(sentences),
+            "annotation_components": components,
+        }
+
+    def _backward(self, record: dict[str, Any]) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Trim the annotation of *record* against a description of its raw side.
+
+        The description is the first non-empty value among ``sns_raw_text``,
+        ``raw_text`` and ``metadata["raw_description"]``, falling back to
+        ``backend.describe_record``. Annotation sentences scoring at least
+        ``tau_backward`` against it are kept, joined with spaces, and must then
+        pass the MI gate. Returns ``(record, decision)``; the record is returned
+        unchanged when the candidate is rejected.
+        """
+        metadata = record.get("metadata") if isinstance(record.get("metadata"), dict) else {}
+        raw_description = (
+            _text_or_none(record.get("sns_raw_text"))
+            or _text_or_none(record.get("raw_text"))
+            or _text_or_none(metadata.get("raw_description"))
+        )
+        if not raw_description:
+            raw_description = self.backend.describe_record(record)
+        annotation = _text_or_none(record.get("sns_annotation")) or _text_or_none(record.get("annotation")) or ""
+        sentences = _sentences(annotation)
+        kept = [
+            sentence
+            for sentence in sentences
+            if self.backend.text_text(sentence, raw_description, self.embedding_dim) >= self.config.tau_backward
+        ]
+        if not kept:
+            return record, {
+                "direction": "backward",
+                "accepted": False,
+                "reason": "no_annotation_sentence_above_threshold",
+            }
+
+        candidate = " ".join(kept)
+        # A missing or empty modality is treated as text here.
+        modality = str(record.get("modality") or "text")
+        raw_value = self._raw_value_for_similarity(record, modality)
+        if not self._passes_mi_gate(modality, raw_value, candidate, raw_value, annotation):
+            return record, {"direction": "backward", "accepted": False, "reason": "mi_gate_failed"}
+
+        updated = dict(record)
+        updated["sns_annotation"] = candidate
+        return updated, {
+            "direction": "backward",
+            "accepted": True,
+            "reason": "annotation_sentences_kept",
+            "kept_sentences": len(kept),
+            "total_sentences": len(sentences),
+        }
+
+    def _passes_mi_gate(
+        self,
+        modality: str,
+        candidate_raw: Any,
+        candidate_annotation: str,
+        original_raw: Any,
+        original_annotation: str,
+    ) -> bool:
+        """Return whether the candidate pair keeps enough raw/annotation similarity.
+
+        The candidate passes when its similarity is at least ``mi_ratio`` times
+        the original pair's similarity, with the original floored at ``mi_eps``.
+        """
+        original_sim = max(
+            self._raw_annotation_similarity(original_raw, modality, original_annotation),
+            self.config.mi_eps,
+        )
+        candidate_sim = self._raw_annotation_similarity(candidate_raw, modality, candidate_annotation)
+        return candidate_sim >= self.config.mi_ratio * original_sim
+
+    def _raw_annotation_similarity(self, raw_value: Any, modality: str, annotation: str) -> float:
+        """Score *raw_value* against *annotation* using the backend.
+
+        A backend-provided ``raw_annotation_similarity`` callable takes
+        precedence; otherwise the call is dispatched by modality, with any
+        modality other than image, audio or video scored as text.
+        """
+        method = getattr(self.backend, "raw_annotation_similarity", None)
+        if callable(method):
+            return float(method(raw_value, modality, annotation))
+        if modality == "image":
+            return self.backend.image_text(raw_value, annotation)
+        if modality == "audio":
+            return self.backend.audio_text(raw_value, annotation)
+        if modality == "video":
+            return self.backend.video_text(raw_value, annotation)
+        return self.backend.text_text(raw_value, annotation, self.embedding_dim)
+
+    @staticmethod
+    def _raw_value_for_similarity(record: dict[str, Any], modality: str) -> Any:
+        """Pick the raw-side value that is fed to similarity scoring.
+
+        Text records use the first non-empty of ``sns_raw_text``, ``raw_text``
+        and ``raw_path``; every other modality uses ``raw_path``. An empty
+        string is returned when nothing is available.
+        """
+        if modality == "text":
+            return (
+                _text_or_none(record.get("sns_raw_text"))
+                or _text_or_none(record.get("raw_text"))
+                or _text_or_none(record.get("raw_path"))
+                or ""
+            )
+        return record.get("raw_path") or ""
+
+    def _annotation_components(self, annotation: str) -> list[str]:
+        """Return the query components for *annotation*.
+
+        With ``use_ann_components`` off the whole annotation is the only
+        component (none if it is blank). Otherwise extraction is delegated to
+        ``full_forward.annotation_components``; if that raises ``RuntimeError``
+        the error propagates only when ``require_forward_models`` is set, and
+        the whole annotation is used instead when it is not.
+        """
+        if not self.config.use_ann_components:
+            return [annotation] if annotation.strip() else []
+        try:
+            # Imported lazily, at the point of use.
+            from omnifuse_tutorial.sns.full_forward import annotation_components
+
+            return annotation_components(annotation, self.config)
+        except RuntimeError:
+            if self.config.require_forward_models:
+                raise
+            return [annotation] if annotation.strip() else []
+
+    def _manifest(
+        self,
+        original: dict[str, Any],
+        output: dict[str, Any],
+        enabled: bool,
+        accepted: bool,
+        reason: str,
+    ) -> dict[str, Any]:
+        """Build the manifest entry for one record.
+
+        Identity fields come from *original* (``pair_id``, ``pool`` and
+        ``modality`` must be present), the SNS results come from *output*, and
+        the thresholds in effect are copied from the config.
+        """
+        return {
+            "pair_id": original["pair_id"],
+            "pool": original["pool"],
+            "modality": original["modality"],
+            "enabled": enabled,
+            "direction": self.config.direction,
+            "accepted": accepted,
+            "reason": reason,
+            "original_raw_path": original.get("raw_path"),
+            "original_annotation": original.get("annotation"),
+            "sns_raw_text": output.get("sns_raw_text"),
+            "sns_annotation": output.get("sns_annotation"),
+            "thresholds": {
+                "mi_ratio": self.config.mi_ratio,
+                "mi_eps": self.config.mi_eps,
+                "tau_forward_text": self.config.tau_forward_text,
+                "tau_forward_image": self.config.tau_forward_image,
+                "tau_forward_video": self.config.tau_forward_video,
+                "tau_forward_audio": self.config.tau_forward_audio,
+                "tau_backward": self.config.tau_backward,
+            },
+        }
+
+
+def _sentences(text: str) -> list[str]:
+    """Split *text* with ``SENTENCE_RE`` and return the non-empty, stripped pieces."""
+    cleaned = _text_or_none(text) or ""
+    pieces = []
+    for fragment in SENTENCE_RE.split(cleaned):
+        fragment = fragment.strip()
+        if fragment:
+            pieces.append(fragment)
+    return pieces
+
+
+def _text_or_none(value: Any) -> str | None:
+    """Return *value* stripped when it is a non-blank string, otherwise ``None``."""
+    if not isinstance(value, str):
+        return None
+    stripped = value.strip()
+    return stripped if stripped else None
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/__init__.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/__init__.py
new file mode 100644
index 0000000000..3aa14fd77c
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""NeMo Curator stages for the Omni-Fuse pipeline."""
+
+from omnifuse_tutorial.stages.datablend import DatablendRankingStage
+from omnifuse_tutorial.stages.eee import EEEEmbeddingStage
+from omnifuse_tutorial.stages.projection import ProjectionTrainingStage
+from omnifuse_tutorial.stages.reader import PairManifestReaderStage
+from omnifuse_tutorial.stages.sns import SNSStage
+
+__all__ = [
+    "DatablendRankingStage",
+    "EEEEmbeddingStage",
+    "PairManifestReaderStage",
+    "ProjectionTrainingStage",
+    "SNSStage",
+]
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/datablend.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/datablend.py
new file mode 100644
index 0000000000..31e9d99cc1
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/datablend.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Datablend ranking/export stage."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from nemo_curator.stages.base import ProcessingStage
+from nemo_curator.stages.resources import Resources
+
+from omnifuse_tutorial.compat.curator import make_document_batch, records_from_task
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.data.io import write_jsonl
+from omnifuse_tutorial.datablend.ranker import DatablendRanker
+from omnifuse_tutorial.eee.backends import BackendFactory, backend_factory
+from omnifuse_tutorial.projection.trainer import ProjectionResult
+
+
+@dataclass
+class DatablendRankingStage(ProcessingStage[Any, Any]):
+    """Rank the task's records, export the ranked and top-k lists, and emit the top-k."""
+
+    config: ExperimentConfig | None = None
+    backend_factory_fn: BackendFactory = backend_factory
+    name: str = "DatablendRanking"
+    resources: Resources = field(default_factory=lambda: Resources(cpus=1.0))
+
+    def inputs(self) -> tuple[list[str], list[str]]:
+        """Declare the stage inputs: ``data`` and no further names."""
+        return ["data"], []
+
+    def outputs(self) -> tuple[list[str], list[str]]:
+        """Declare the stage outputs: ``data`` plus ``rank`` and ``score``."""
+        return ["data"], ["rank", "score"]
+
+    def process(self, task: Any) -> Any:
+        """Rank the task's records and return a new batch holding the selection.
+
+        Writes ``datablend_ranked.jsonl`` and ``datablend_topk.jsonl`` under
+        ``<run_dir>/datablend`` and records their paths and the selection size
+        in the outgoing metadata.
+
+        Raises:
+            ValueError: If no config is set, or the task metadata carries no
+                ``ProjectionResult`` under ``projection_result``.
+        """
+        if self.config is None:
+            raise ValueError("DatablendRankingStage requires config")
+        # Work on a copy so the incoming task's metadata is left untouched.
+        task_metadata = dict(getattr(task, "_metadata", {}) or {})
+        projection_result = task_metadata.get("projection_result")
+        if not isinstance(projection_result, ProjectionResult):
+            raise ValueError("DatablendRankingStage requires projection_result metadata")
+
+        pair_records = records_from_task(task)
+        backend = self.backend_factory_fn(self.config.eee, self.config.runtime)
+        ranker = DatablendRanker(self.config.datablend, backend)
+        ranked_records = ranker.rank(pair_records, projection_result)
+        top_records = ranker.select_top(ranked_records)
+
+        datablend_dir = self.config.run_dir / "datablend"
+        ranked_file = write_jsonl(datablend_dir / "datablend_ranked.jsonl", ranked_records)
+        topk_file = write_jsonl(datablend_dir / "datablend_topk.jsonl", top_records)
+        task_metadata["datablend_ranked_path"] = str(ranked_file)
+        task_metadata["datablend_topk_path"] = str(topk_file)
+        task_metadata["datablend_size"] = len(top_records)
+
+        return make_document_batch(
+            task_id=f"{task.task_id}_datablend",
+            dataset_name=task.dataset_name,
+            records=top_records,
+            metadata=task_metadata,
+            stage_perf=getattr(task, "_stage_perf", []),
+        )
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/eee.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/eee.py
new file mode 100644
index 0000000000..4ff189c5a8
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/eee.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Expert Embedding Engine stage."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from nemo_curator.stages.base import ProcessingStage
+from nemo_curator.stages.resources import Resources
+
+from omnifuse_tutorial.compat.curator import make_document_batch, records_from_task
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.data.io import write_json, write_npy
+from omnifuse_tutorial.eee.backends import BackendFactory, backend_factory
+from omnifuse_tutorial.eee.results import EmbeddingBundle
+
+
+@dataclass
+class EEEEmbeddingStage(ProcessingStage[Any, Any]):
+    """Embed every record with ``embed_raw`` and ``embed_annotation`` once per configured expert."""
+
+    config: ExperimentConfig | None = None
+    backend_factory_fn: BackendFactory = backend_factory
+    name: str = "EEEEmbedding"
+    resources: Resources = field(default_factory=lambda: Resources(cpus=1.0, gpus=1.0))
+
+    def inputs(self) -> tuple[list[str], list[str]]:
+        return ["data"], ["pair_id", "sns_annotation", "modality"]
+
+    def outputs(self) -> tuple[list[str], list[str]]:
+        return ["data"], []
+
+    def process(self, task: Any) -> Any:
+        """Build interleaved rows (raw at even, annotation at odd indices), write them, attach the bundle."""
+        cfg = self.config
+        if cfg is None:
+            raise ValueError("EEEEmbeddingStage requires config")
+        records = records_from_task(task)
+        backend = self.backend_factory_fn(cfg.eee, cfg.runtime)
+        experts = list(cfg.eee.experts)
+        embeddings: dict[str, list[list[float]]] = {}
+        for expert in experts:
+            interleaved: list[list[float]] = []
+            for record in records:
+                interleaved.extend((backend.embed_raw(record, expert), backend.embed_annotation(record, expert)))
+            embeddings[expert] = interleaved
+
+        bundle = EmbeddingBundle(
+            pair_ids=[record["pair_id"] for record in records],
+            modalities=[record["modality"] for record in records],
+            records=records,
+            experts=experts,
+            embeddings=embeddings,
+        )
+
+        output_dir = cfg.run_dir / "embeddings"
+        for expert, rows in embeddings.items():
+            stem = expert.replace("-", "_")
+            write_npy(output_dir / f"{stem}_interleaved.npy", rows)
+            write_npy(output_dir / f"{stem}_raw.npy", rows[0::2])
+            write_npy(output_dir / f"{stem}_annotation.npy", rows[1::2])
+        summary = {
+            "pair_ids": bundle.pair_ids,
+            "modalities": bundle.modalities,
+            "experts": bundle.experts,
+            "embedding_dim": bundle.embedding_dim,
+        }
+        metadata_path = write_json(output_dir / "metadata.json", summary)
+
+        metadata = dict(getattr(task, "_metadata", {}) or {})
+        metadata.update(embedding_bundle=bundle, embedding_metadata_path=str(metadata_path))
+        return make_document_batch(
+            task_id=f"{task.task_id}_eee",
+            dataset_name=task.dataset_name,
+            records=records,
+            metadata=metadata,
+            stage_perf=getattr(task, "_stage_perf", []),
+        )
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/projection.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/projection.py
new file mode 100644
index 0000000000..ecfe778275
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/projection.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Projection training stage."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from nemo_curator.stages.base import ProcessingStage
+from nemo_curator.stages.resources import Resources
+
+from omnifuse_tutorial.compat.curator import make_document_batch, records_from_task
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.data.io import write_json, write_npy
+from omnifuse_tutorial.eee.results import EmbeddingBundle
+from omnifuse_tutorial.projection.trainer import ProjectionTrainer
+
+
+@dataclass
+class ProjectionTrainingStage(ProcessingStage[Any, Any]):
+    """Train the projection on the task's ``EmbeddingBundle`` and write its arrays, model, loss and metrics."""
+
+    config: ExperimentConfig | None = None
+    name: str = "ProjectionTraining"
+    resources: Resources = field(default_factory=lambda: Resources(cpus=1.0))
+
+    def inputs(self) -> tuple[list[str], list[str]]:
+        return ["data"], []
+
+    def outputs(self) -> tuple[list[str], list[str]]:
+        return ["data"], []
+
+    def process(self, task: Any) -> Any:
+        """Run ``train_and_project`` on the ``embedding_bundle`` metadata entry; ``ValueError`` if absent."""
+        cfg = self.config
+        if cfg is None:
+            raise ValueError("ProjectionTrainingStage requires config")
+        metadata = dict(getattr(task, "_metadata", {}) or {})
+        bundle = metadata.get("embedding_bundle")
+        if not isinstance(bundle, EmbeddingBundle):
+            raise ValueError("ProjectionTrainingStage requires embedding_bundle metadata")
+
+        result = ProjectionTrainer(cfg.projection).train_and_project(bundle)
+        output_dir = cfg.run_dir / "projection"
+        projected_path = write_npy(output_dir / "projected_embeddings.npy", result.projected_raw)
+        annotations_path = write_npy(output_dir / "annotation_embeddings.npy", result.annotation_embeddings)
+        model_path = write_json(output_dir / "model.json", result.model)
+        loss_path = write_json(output_dir / "loss_history.json", {"loss": result.loss_history})
+        metrics_path = write_json(output_dir / "metrics.json", {"recall_at_10": result.recall_at_10})
+        metadata.update(
+            projection_result=result,
+            projection_model_path=str(model_path),
+            projection_loss_path=str(loss_path),
+            projection_metrics_path=str(metrics_path),
+            projected_embeddings_path=str(projected_path),
+            annotation_embeddings_path=str(annotations_path),
+        )
+        return make_document_batch(
+            task_id=f"{task.task_id}_projection",
+            dataset_name=task.dataset_name,
+            records=records_from_task(task),
+            metadata=metadata,
+            stage_perf=getattr(task, "_stage_perf", []),
+        )
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/reader.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/reader.py
new file mode 100644
index 0000000000..08696fcfb7
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/reader.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Reader stage for paired raw/annotation manifests."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from nemo_curator.stages.base import ProcessingStage
+from nemo_curator.stages.resources import Resources
+from nemo_curator.tasks import EmptyTask
+
+from omnifuse_tutorial.compat.curator import make_document_batch
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.data.loader import load_all_pools
+
+
+@dataclass
+class PairManifestReaderStage(ProcessingStage[Any, Any]):
+    """Load every configured data pool into one document batch named after the experiment id."""
+
+    config: ExperimentConfig | None = None
+    name: str = "PairManifestReader"
+    resources: Resources = field(default_factory=lambda: Resources(cpus=1.0))
+
+    def inputs(self) -> tuple[list[str], list[str]]:
+        return [], []
+
+    def outputs(self) -> tuple[list[str], list[str]]:
+        return ["data"], ["pair_id", "raw_path", "annotation", "modality"]
+
+    def process(self, task: EmptyTask) -> Any:
+        if self.config is None:
+            raise ValueError("PairManifestReaderStage requires config")
+        records = load_all_pools(self.config.data_pools)
+        experiment_id = self.config.experiment_id
+        metadata = {"experiment_id": experiment_id}
+        task_id = f"{experiment_id}_pairs"
+        return make_document_batch(task_id=task_id, dataset_name=experiment_id, records=records, metadata=metadata)
diff --git a/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/sns.py b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/sns.py
new file mode 100644
index 0000000000..9cad695dd8
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/omnifuse_tutorial/stages/sns.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SNS Curator stage."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any
+
+from nemo_curator.stages.base import ProcessingStage
+from nemo_curator.stages.resources import Resources
+
+from omnifuse_tutorial.compat.curator import make_document_batch, records_from_task
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.data.io import write_jsonl
+from omnifuse_tutorial.sns.backends import backend_factory
+from omnifuse_tutorial.sns.processor import SNSProcessor
+
+
+@dataclass
+class SNSStage(ProcessingStage[Any, Any]):
+    """Run ``SNSProcessor.process_record`` over every record and write the per-record manifest."""
+
+    config: ExperimentConfig | None = None
+    name: str = "SNS"
+    resources: Resources = field(default_factory=lambda: Resources(cpus=1.0, gpus=1.0))
+
+    def inputs(self) -> tuple[list[str], list[str]]:
+        return ["data"], ["pair_id", "annotation", "modality"]
+
+    def outputs(self) -> tuple[list[str], list[str]]:
+        return ["data"], ["sns_annotation", "sns_raw_text"]
+
+    def process(self, task: Any) -> Any:
+        """Process each record, write ``sns/manifest.jsonl`` and return a batch of the processed records."""
+        cfg = self.config
+        if cfg is None:
+            raise ValueError("SNSStage requires config")
+        # An unset media output directory defaults to a folder inside the run directory.
+        if cfg.sns.sns_output_dir is None:
+            cfg.sns.sns_output_dir = cfg.run_dir / "sns" / "media"
+        backend = backend_factory(cfg.sns, cfg.eee, cfg.runtime)
+        processor = SNSProcessor(cfg.sns, embedding_dim=cfg.eee.embedding_dim, backend=backend)
+        pairs = [(out, row) for out, row in map(processor.process_record, records_from_task(task))]
+
+        manifest_path = cfg.run_dir / "sns" / "manifest.jsonl"
+        write_jsonl(manifest_path, [row for _, row in pairs])
+        metadata = dict(getattr(task, "_metadata", {}) or {})
+        metadata["sns_manifest_path"] = str(manifest_path)
+        return make_document_batch(
+            task_id=f"{task.task_id}_sns",
+            dataset_name=task.dataset_name,
+            records=[out for out, _ in pairs],
+            metadata=metadata,
+            stage_perf=getattr(task, "_stage_perf", []),
+        )
diff --git a/tutorials/multimodal/omni-fuse-data-curation/pyproject.toml b/tutorials/multimodal/omni-fuse-data-curation/pyproject.toml
new file mode 100644
index 0000000000..305826a24a
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/pyproject.toml
@@ -0,0 +1,98 @@
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "omnifuse-curator-tutorial"
+version = "0.1.0"
+description = "Standalone NeMo Curator tutorial for Omni-Fuse multimodal data curation."
+readme = "README.md"
+requires-python = ">=3.10,<3.13"
+dependencies = [
+    "nemo-curator>=1.1.0,<2",
+    "numpy>=1.26",
+    "pandas>=2.0",
+    "pyyaml>=6.0",
+    "requests>=2.31",
+    "spacy>=3.8.14,<4",
+    "torch>=2.2",
+    "accelerate>=0.26",
+    "einops>=0.6",
+    "easydict>=1.13",
+    "huggingface_hub>=0.25",
+    "ffmpeg-python",
+    "future",
+    "fvcore",
+    "lighthouse @ git+https://github.com/line/lighthouse.git",
+    "librosa>=0.10,<0.12",
+    "msclap",
+    "nvidia-ml-py>=12",
+    "opencv-python>=4,<5",
+    "Pillow>=10,<12",
+    "qwen-vl-utils>=0.0.8",
+    "sentence-transformers>=2.2,<4",
+    "soundfile>=0.12",
+    "torchlibrosa",
+    "torchcodec",
+    "clip @ git+https://github.com/openai/CLIP.git",
+    "torchaudio>=2.2",
+    "torchvision>=0.17",
+    "transformers>=4.57,<5",
+    "en-core-web-sm",
+    "peft>=0.19.1",
+    "decord>=0.6.0",
+    "pytorchvideo>=0.1.5",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0",
+]
+
+[tool.uv]
+override-dependencies = [
+    "numpy>=1.26",
+    "transformers>=4.57,<5",
+]
+
+[tool.uv.sources]
+en-core-web-sm = { url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl" }
+
+[tool.setuptools]
+py-modules = []
+
+[tool.ruff]
+extend = "../../../pyproject.toml"
+
+[tool.ruff.lint]
+extend-ignore = [
+    "ANN401",  # Tutorial glue handles dynamic Curator, torch, and model objects.
+    "ARG002",
+    "ARG005",
+    "B904",
+    "B905",
+    "BLE001",
+    "C901",
+    "EM101",
+    "EM102",
+    "F401",
+    "N801",
+    "N812",
+    "NPY002",
+    "PLR0911",
+    "PLR0912",
+    "PLR0913",
+    "PLR0915",
+    "PLR2004",
+    "PLW0603",
+    "RUF012",
+    "RUF034",
+    "S110",
+    "SIM102",
+    "SIM108",
+    "SIM115",
+    "TC001",
+    "TC002",
+    "TRY004",
+    "TRY300",
+]
diff --git a/tutorials/multimodal/omni-fuse-data-curation/utils.py b/tutorials/multimodal/omni-fuse-data-curation/utils.py
new file mode 100644
index 0000000000..55dc0c7523
--- /dev/null
+++ b/tutorials/multimodal/omni-fuse-data-curation/utils.py
@@ -0,0 +1,270 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Shared helpers for the Omni-Fuse data curation tutorial scripts."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+
+from omnifuse_tutorial.compat.curator import (
+    make_curator_pipeline,
+    make_document_batch,
+    make_empty_task,
+    records_from_task,
+)
+from omnifuse_tutorial.config.loader import load_config
+from omnifuse_tutorial.config.models import ExperimentConfig
+from omnifuse_tutorial.data.io import read_jsonl, write_json, write_jsonl
+from omnifuse_tutorial.data.loader import load_all_pools
+from omnifuse_tutorial.eee.results import EmbeddingBundle
+from omnifuse_tutorial.projection.trainer import ProjectionResult
+from omnifuse_tutorial.stages import (
+    DatablendRankingStage,
+    EEEEmbeddingStage,
+    PairManifestReaderStage,
+    ProjectionTrainingStage,
+    SNSStage,
+)
+
+
+def config_parser(description: str) -> argparse.ArgumentParser:
+    cli = argparse.ArgumentParser(description=description)  # exposes a single required --config option
+    cli.add_argument("--config", required=True, help="Path to the Omni-Fuse tutorial YAML config")
+    return cli
+
+
+def load_tutorial_config(config_path: str | Path) -> ExperimentConfig:
+    experiment = load_config(config_path)
+    experiment.run_dir.mkdir(parents=True, exist_ok=True)  # create the run directory before writing into it
+    write_json(experiment.run_dir / "config.resolved.json", experiment.to_dict())  # snapshot of the config
+    return experiment
+
+
+def run_curator_step(name: str, stages: list[Any], initial_task: Any) -> Any:
+    """Run ``stages`` as one Curator pipeline seeded with ``initial_task`` and return the last output task.
+
+    Raises:
+        RuntimeError: If the pipeline returns no output tasks.
+    """
+    description = f"Omni-Fuse tutorial step: {name}"
+    pipeline = make_curator_pipeline(name=name, description=description, stages=stages)
+    results = pipeline.run(initial_tasks=[initial_task])
+    if results:
+        return results[-1]
+    raise RuntimeError(f"Curator pipeline {name} produced no output tasks")
+
+
+def run_reader(config: ExperimentConfig) -> Any:
+    """Run ``PairManifestReaderStage`` on an empty seed task and return the resulting task."""
+    step_name = f"{config.experiment_id}-0-read-pairs"
+    reader = PairManifestReaderStage(config=config)
+    seed = make_empty_task()
+    return run_curator_step(name=step_name, stages=[reader], initial_task=seed)
+
+
+def run_sns(config: ExperimentConfig) -> Any:
+    """Read the pairs, run ``SNSStage`` on them, and save the output records to ``sns/records.jsonl``."""
+    step_name = f"{config.experiment_id}-1-sns"
+    stage = SNSStage(config=config)
+    task = run_curator_step(name=step_name, stages=[stage], initial_task=run_reader(config))
+    records_path = config.run_dir / "sns" / "records.jsonl"
+    write_jsonl(records_path, records_from_task(task))
+    # Copy the task metadata before adding the records path, then rebind the copy onto the task.
+    metadata = dict(getattr(task, "_metadata", {}) or {})
+    metadata.update(sns_records_path=str(records_path))
+    task._metadata = metadata
+    return task
+
+
+def load_sns_task(config: ExperimentConfig) -> Any:
+    """Rebuild the SNS task from ``sns/records.jsonl``; raises ``FileNotFoundError`` when it is missing."""
+    sns_dir = config.run_dir / "sns"
+    records_path = sns_dir / "records.jsonl"
+    if not records_path.exists():
+        raise FileNotFoundError(f"Missing {records_path}. Run 1_sns.py first.")
+    records = read_jsonl(records_path)
+    experiment_id = config.experiment_id
+    metadata = {
+        "experiment_id": experiment_id,
+        "sns_records_path": str(records_path),
+        "sns_manifest_path": str(sns_dir / "manifest.jsonl"),
+    }
+    task_id = f"{experiment_id}_sns"
+    return make_document_batch(task_id=task_id, dataset_name=experiment_id, records=records, metadata=metadata)
+
+
+def run_eee(config: ExperimentConfig) -> Any:
+    """Run ``EEEEmbeddingStage`` on the saved SNS task and save its records to ``embeddings/records.jsonl``."""
+    step_name = f"{config.experiment_id}-2-embed"
+    stage = EEEEmbeddingStage(config=config)
+    task = run_curator_step(name=step_name, stages=[stage], initial_task=load_sns_task(config))
+    records_path = config.run_dir / "embeddings" / "records.jsonl"
+    write_jsonl(records_path, records_from_task(task))
+    # Copy the task metadata before adding the records path, then rebind the copy onto the task.
+    metadata = dict(getattr(task, "_metadata", {}) or {})
+    metadata.update(embedding_records_path=str(records_path))
+    task._metadata = metadata
+    return task
+
+
+def load_embedding_task(config: ExperimentConfig) -> Any:
+    """Rebuild the embedding task from ``embeddings/``; raises ``FileNotFoundError`` when records are missing."""
+    embeddings_dir = config.run_dir / "embeddings"
+    records_path = embeddings_dir / "records.jsonl"
+    if not records_path.exists():
+        raise FileNotFoundError(f"Missing {records_path}. Run 2_embed.py first.")
+    records = read_jsonl(records_path)
+    bundle = load_embedding_bundle(config, records)
+    experiment_id = config.experiment_id
+    metadata = {
+        "experiment_id": experiment_id,
+        "embedding_bundle": bundle,
+        "embedding_records_path": str(records_path),
+        "embedding_metadata_path": str(embeddings_dir / "metadata.json"),
+    }
+    task_id = f"{experiment_id}_embeddings"
+    return make_document_batch(task_id=task_id, dataset_name=experiment_id, records=records, metadata=metadata)
+
+
+def load_embedding_bundle(config: ExperimentConfig, records: list[dict[str, Any]] | None = None) -> EmbeddingBundle:
+    """Rebuild the ``EmbeddingBundle`` from ``embeddings/metadata.json`` and the per-expert interleaved arrays."""
+    embeddings_dir = config.run_dir / "embeddings"
+    metadata_path = embeddings_dir / "metadata.json"
+    if not metadata_path.exists():
+        raise FileNotFoundError(f"Missing {metadata_path}. Run 2_embed.py first.")
+    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
+    records = read_jsonl(embeddings_dir / "records.jsonl") if records is None else records
+    embeddings: dict[str, list[list[float]]] = {
+        expert: np.load(embeddings_dir / f"{expert.replace('-', '_')}_interleaved.npy").astype(float).tolist()
+        for expert in metadata["experts"]
+    }
+    return EmbeddingBundle(
+        pair_ids=list(metadata["pair_ids"]),
+        modalities=list(metadata["modalities"]),
+        records=records,
+        experts=list(metadata["experts"]),
+        embeddings=embeddings,
+    )
+
+
+def run_projection(config: ExperimentConfig) -> Any:
+    """Run ``ProjectionTrainingStage`` on the saved embeddings and save records to ``projection/records.jsonl``."""
+    step_name = f"{config.experiment_id}-3-project"
+    stage = ProjectionTrainingStage(config=config)
+    task = run_curator_step(name=step_name, stages=[stage], initial_task=load_embedding_task(config))
+    records_path = config.run_dir / "projection" / "records.jsonl"
+    write_jsonl(records_path, records_from_task(task))
+    # Copy the task metadata before adding the records path, then rebind the copy onto the task.
+    metadata = dict(getattr(task, "_metadata", {}) or {})
+    metadata.update(projection_records_path=str(records_path))
+    task._metadata = metadata
+    return task
+
+
+def load_projection_task(config: ExperimentConfig) -> Any:
+    """Rebuild the projection task from ``projection/``; raises ``FileNotFoundError`` when records are missing."""
+    projection_dir = config.run_dir / "projection"
+    records_path = projection_dir / "records.jsonl"
+    if not records_path.exists():
+        raise FileNotFoundError(f"Missing {records_path}. Run 3_project.py first.")
+    records = read_jsonl(records_path)
+    projection = load_projection_result(config)
+    experiment_id = config.experiment_id
+    metadata = {
+        "experiment_id": experiment_id,
+        "projection_result": projection,
+        "projected_embeddings_path": str(projection_dir / "projected_embeddings.npy"),
+        "annotation_embeddings_path": str(projection_dir / "annotation_embeddings.npy"),
+        "projection_model_path": str(projection_dir / "model.json"),
+        "projection_loss_path": str(projection_dir / "loss_history.json"),
+        "projection_metrics_path": str(projection_dir / "metrics.json"),
+    }
+    task_id = f"{experiment_id}_projection"
+    return make_document_batch(task_id=task_id, dataset_name=experiment_id, records=records, metadata=metadata)
+
+
+def load_projection_result(config: ExperimentConfig) -> ProjectionResult:
+    """Rebuild the ``ProjectionResult`` from the JSON and ``.npy`` files under ``<run_dir>/projection``."""
+    projection_dir = config.run_dir / "projection"
+    model = json.loads((projection_dir / "model.json").read_text(encoding="utf-8"))
+    loss_payload = json.loads((projection_dir / "loss_history.json").read_text(encoding="utf-8"))
+    metrics_payload = json.loads((projection_dir / "metrics.json").read_text(encoding="utf-8"))
+    projected = np.load(projection_dir / "projected_embeddings.npy").astype(float).tolist()
+    annotations = np.load(projection_dir / "annotation_embeddings.npy").astype(float).tolist()
+    if not isinstance(expert_weights := model.get("expert_weights"), dict):
+        experts = model.get("experts") or []  # no stored weights: fall back to equal weight per listed expert
+        expert_weights = dict.fromkeys(experts, 1.0 / len(experts)) if experts else {}
+    return ProjectionResult(
+        projected_raw=projected,
+        annotation_embeddings=annotations,
+        expert_weights={str(key): float(value) for key, value in expert_weights.items()},
+        loss_history=[float(value) for value in loss_payload.get("loss", [])],
+        recall_at_10={str(key): float(value) for key, value in metrics_payload.get("recall_at_10", {}).items()},
+        model=model,
+    )
+
+
+def run_datablend(config: ExperimentConfig) -> Any:
+    """Run ``DatablendRankingStage`` on the task rebuilt from the saved projection outputs."""
+    step_name = f"{config.experiment_id}-4-datablend"
+    stage = DatablendRankingStage(config=config)
+    initial = load_projection_task(config)
+    return run_curator_step(name=step_name, stages=[stage], initial_task=initial)
+
+
+def validate_inputs(config: ExperimentConfig) -> dict[str, Any]:
+    """Check that the API key and local assets needed by the configured backends exist, then summarise inputs.
+
+    Raises ``RuntimeError`` naming every missing key or asset.
+    """
+    records = load_all_pools(config.data_pools)
+    remote, on_device = {"hybrid", "api"}, {"hybrid", "local"}
+    effective_sns = config.eee.backend if config.sns.backend == "auto" else config.sns.backend
+    missing: list[str] = []
+    if (config.eee.backend in remote or effective_sns in remote) and not (
+        config.eee.nvidia_api_key or os.environ.get("NV_BUILD_API_KEY") or os.environ.get("NVIDIA_API_KEY")
+    ):
+        missing.append("NV_BUILD_API_KEY")
+    if config.eee.backend in on_device and "fusion" in config.eee.experts:
+        env_root = os.environ.get("LANGUAGEBIND_ROOT")
+        configured = bool(env_root) and Path(env_root).expanduser().exists()
+        bundled = Path(__file__).resolve().parent / "third_party" / "LanguageBind"
+        if not configured and not bundled.exists():
+            missing.append("LANGUAGEBIND_ROOT")
+    has_video = any(pool.modality == "video" for pool in config.data_pools)
+    uses_video_forward = has_video and config.sns.direction in {"forward", "bidirectional"}
+    if effective_sns in on_device and uses_video_forward and not config.sns.cg_detr_checkpoint.exists():
+        missing.append(str(config.sns.cg_detr_checkpoint))
+    if missing:
+        raise RuntimeError("Missing required API keys or local assets: " + ", ".join(missing))
+    return {
+        "experiment_id": config.experiment_id,
+        "run_dir": str(config.run_dir),
+        "records": len(records),
+        "modalities": sorted({record["modality"] for record in records}),
+        "sns_backend": config.sns.backend,
+        "eee_backend": config.eee.backend,
+        "experts": config.eee.experts,
+    }
+
+
+def print_outputs(payload: dict[str, Any]) -> None:
+    print(json.dumps(payload, indent=2, sort_keys=True))  # pretty-printed JSON with sorted keys on stdout