diff --git a/docs/assets/recipes/code_generation/infinibyte.py b/docs/assets/recipes/code_generation/infinibyte.py
new file mode 100644
index 000000000..56f727413
--- /dev/null
+++ b/docs/assets/recipes/code_generation/infinibyte.py
@@ -0,0 +1,521 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "data-designer",
+# "datasets",
+# "pandas",
+# ]
+# ///
+"""Nemotron Nano InfiniByte Recipe: Cross-Source Problem Generation
+
+Generate more diverse and complex training problems by cross-breeding two
+source datasets. Each record pairs a "problem A" (e.g. a coding problem) with
+a "problem B" (e.g. a math or science problem), then uses an LLM to create new
+problems that incorporate concepts from both sources through either obfuscation
+(adding plausible but irrelevant complexity) or complication (genuinely
+increasing difficulty).
+
+This recipe implements the InfiniByte pipeline used for Nemotron Nano
+post-training data. The key idea: rather than generating problems from scratch,
+cross-join two existing problem datasets, then augment problem A with concepts
+from problem B to produce novel, harder problems.
+
+Pipeline architecture:
+
+ ┌─────────────────────────────────────────────────────────────────────────┐
+ │ PRE-PROCESSING (outside Data Designer) │
+ │ Download 2 HuggingFace datasets, cross-join with random sampling, │
+ │ save as CSV seed file. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 1: SEED + SAMPLER │
+ │ Seed: cross-joined CSV (problem_a, problem_b pairs) │
+ │ Sampler: combination_type (obfuscation / complication) │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 2: CANDIDATE GENERATION (LLM Structured) │
+ │ Generate 2 candidate problems augmenting A with concepts from B. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 3: BEST PROBLEM SELECTION (LLM Structured) │
+ │ Select the best candidate based on adherence, difficulty, clarity. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 4: EVALUATION (LLM Structured) │
+ │ Score difficulty (1-3), clarity (1-3), adherence (1-3). │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 5: SOLUTION GENERATION (LLM Text) │
+ │ Solve the new problem. │
+ └────────────────────────────────────────────────────────────────────────┘
+
+Prerequisites:
+ - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
+ - Internet access for downloading HuggingFace datasets.
+
+Run:
+    # Basic usage (downloads OpenCodeReasoning + OpenMathReasoning, 5 records)
+ uv run infinibyte.py
+
+ # Customize dataset strategy and record count
+ uv run infinibyte.py --strategy ocr_omr --num-records 500 --limit 10000
+
+ # For help message and available options
+ uv run infinibyte.py --help
+"""
+
+from __future__ import annotations
+
+import hashlib
+import random
+import tempfile
+from pathlib import Path
+
+import pandas as pd
+from pydantic import BaseModel, Field
+
+import data_designer.config as dd
+from data_designer.interface import DataDesigner, DatasetCreationResults
+
+# =============================================================================
+# Pydantic models for structured LLM outputs
+# =============================================================================
+
+
# Structured-output schema for one generated problem.
# NOTE: field descriptions are serialized into the JSON schema sent to the
# LLM, so they are runtime behavior — do not edit them casually.
class NewProblem(BaseModel):
    # Which concepts from the second source problem were folded in.
    added_concepts: str = Field(
        ..., description="Specific new concepts incorporated into the original problem"
    )
    # Full text of the newly generated problem statement.
    problem: str = Field(
        ..., description="A new problem statement incorporating new concepts from the provided problems."
    )
+
+
# Container schema for Stage 2 (the prompt requests two candidates per call).
class NewProblemList(BaseModel):
    # default_factory=list lets a malformed generation degrade to zero candidates.
    problems: list[NewProblem] = Field(
        default_factory=list, description="List of new problems"
    )
+
+
# Schema for Stage 3: the selected candidate plus a short selection rationale.
class NewProblemWithReasoning(NewProblem):
    # Why this candidate was preferred over the others.
    reasoning: str = Field(
        ..., description="Concise explanation for selecting this specific new problem"
    )
+
+
# Schema for Stage 4: three integer scores on a 1-3 scale. The scoring
# rubrics live in the field descriptions, which the LLM sees via the schema.
class NewProblemEvals(BaseModel):
    difficulty: int = Field(
        ...,
        description=(
            "Evaluation of problem difficulty relative to original. "
            "3: Significantly more challenging. "
            "2: Moderately more difficult. "
            "1: Similar to or easier than the original."
        ),
    )
    clarity: int = Field(
        ...,
        description=(
            "Evaluation of clarity and coherence. "
            "3: Exceptionally clear and well-structured. "
            "2: Generally clear with minor issues. "
            "1: Unclear or poorly structured."
        ),
    )
    adherence: int = Field(
        ...,
        description=(
            "Evaluation of adherence to the combination goal. "
            "3: Excellent adherence. "
            "2: Adequate adherence. "
            "1: Poor adherence."
        ),
    )
+
+
+# =============================================================================
+# Dataset download and cross-join
+# =============================================================================
+
# Per-source download settings, consumed by fetch_hf_dataset_to_df:
#   name / config_name / split            -> load_dataset arguments
#   input_column                          -> column holding the problem text
#   output_column / output_id_column      -> column names in the result frame
#   deduplicate / max_records / random_seed -> stream-sampling controls
DATASET_CONFIGS = {
    "ocr": {
        "name": "nvidia/OpenCodeReasoning",
        "config_name": "split_0",
        "input_column": "input",
        "output_column": "problem_a",
        "output_id_column": "id_a",
        "deduplicate": True,
        "max_records": 500,
        "random_seed": 27,
    },
    "omr": {
        "name": "nvidia/OpenMathReasoning",
        "split": "tir",
        "input_column": "problem",
        "output_column": "problem_b",
        "output_id_column": "id_b",
        "deduplicate": True,
        "max_records": 500,
        "random_seed": 7,
    },
}

# Named (source A, source B) pairings selectable via --strategy.
STRATEGIES = {
    "ocr_omr": ("ocr", "omr"),
}
+
+
def fetch_hf_dataset_to_df(dataset_config: dict) -> pd.DataFrame:
    """Stream a HuggingFace dataset into a DataFrame of id/text rows.

    Streaming avoids downloading the full dataset; at most ``max_records``
    rows are retained, optionally deduplicated on the raw text.
    """
    from datasets import load_dataset

    dataset_name = dataset_config["name"]
    config_name = dataset_config.get("config_name")
    split = dataset_config.get("split")
    input_col = dataset_config.get("input_column", "text")
    output_col = dataset_config.get("output_column", input_col)
    output_id_col = dataset_config.get("output_id_column", "id")
    dedupe = dataset_config.get("deduplicate", False)
    max_records = dataset_config.get("max_records", 500)
    random_seed = dataset_config.get("random_seed", 42)

    print(f"\n=== Streaming {dataset_name} (config={config_name}, split={split or 'all'}) ===")

    # Only forward name/split to load_dataset when they are configured.
    load_kwargs: dict = {"path": dataset_name, "streaming": True}
    load_kwargs.update(
        {key: value for key, value in (("name", config_name), ("split", split)) if value}
    )

    ds = load_dataset(**load_kwargs)

    # Without an explicit split we get a DatasetDict; take its first split.
    if hasattr(ds, "keys"):
        split_name = next(iter(ds.keys()))
        print(f"  Using split: {split_name}")
        ds = ds[split_name]

    # Shuffle the stream so the collected prefix is a diverse sample.
    ds = ds.shuffle(seed=random_seed)

    rows = []
    seen: set[str] = set()
    scanned = 0
    for rec in ds:
        if input_col not in rec:
            continue
        text = rec[input_col]
        scanned += 1
        if dedupe:
            if text in seen:
                continue
            seen.add(text)
        # Fall back to a content hash when the source record has no usable id.
        rec_id = rec.get("id")
        if not rec_id:
            rec_id = hashlib.md5(text.encode("utf-8")).hexdigest()
        rows.append({output_id_col: rec_id, output_col: text})

        if len(rows) >= max_records:
            break

        if scanned % 10_000 == 0:
            print(f"  Scanned {scanned} records, collected {len(rows)}...")

    df = pd.DataFrame(rows)
    print(f"  Collected {len(df)} rows (scanned={scanned}, deduplicated={dedupe})")

    return df
+
+
def cross_join_with_limit(df1: pd.DataFrame, df2: pd.DataFrame, limit: int = 10_000) -> pd.DataFrame:
    """Sample up to ``limit`` random pairs from the cartesian product of two frames."""
    n1, n2 = len(df1), len(df2)
    total = n1 * n2
    actual_limit = min(limit, total)

    print(f"\nCross-joining {n1} x {n2} = {total} possible pairs, sampling {actual_limit}")

    # Each flat index k encodes one (row of df1, row of df2) pair.
    flat_indices = random.sample(range(total), actual_limit)
    pairs = [divmod(k, n2) for k in flat_indices]

    left = df1.iloc[[i for i, _ in pairs]].reset_index(drop=True)
    right = df2.iloc[[j for _, j in pairs]].reset_index(drop=True)

    return pd.concat([left, right], axis=1)
+
+
def prepare_seed_data(strategy: str = "ocr_omr", limit: int = 10_000) -> Path:
    """Build the cross-joined seed CSV for ``strategy`` and return its path."""
    key_a, key_b = STRATEGIES[strategy]

    # Download both sources, then randomly pair their rows.
    frames = [fetch_hf_dataset_to_df(DATASET_CONFIGS[key]) for key in (key_a, key_b)]
    joined = cross_join_with_limit(frames[0], frames[1], limit=limit)

    # Write to a fresh temp directory so repeated runs never collide.
    seed_path = Path(tempfile.mkdtemp()) / "infinibyte_seed.csv"
    joined.to_csv(seed_path, index=False)
    print(f"\nSeed data saved to: {seed_path} ({len(joined)} rows)")

    return seed_path
+
+
+# =============================================================================
+# LLM prompts
+# =============================================================================
+
# System prompt shared by the generation/selection/evaluation stages.
PROBLEM_SYSTEM_PROMPT = """\
You are an experienced competitive programmer, well versed in algorithms, \
data structures, mathematics, physics, chemistry, biology and other sciences. \
You excel in crafting problems that combine multiple concepts into a cohesive \
problem statement.
"""

# System prompt for Stage 5 (solution generation); enforces a single final
# Python code block.
SOLUTION_SYSTEM_PROMPT = """\
You are a helpful and harmless code assistant, well versed in competitive \
coding problems and STEM subjects. You should think step-by-step before \
responding to any instruction.

You must use python programming language when generating code.
You must use the python code block for just the final solution with the \
following format:
```python
# Your final solution goes here
```
"""

# Stage 2 prompt. Jinja branches on the sampled combination_type column:
# "obfuscation" adds irrelevant-but-plausible complexity, "complication"
# genuinely raises difficulty.
CANDIDATE_GENERATION_PROMPT = """\
### Problem A:
{{ problem_a }}

### Problem B:
{{ problem_b }}

Carefully examine problems A and B above. Then formulate TWO new problems by \
augmenting Problem A with concepts from Problem B.

DO NOT REPEAT PROBLEM B VERBATIM WHEN AUGMENTING. INCORPORATE JUST THE \
CONCEPTS FROM IT.

{% if combination_type == 'obfuscation' %}
Focus on obfuscation: Add concepts from Problem B to Problem A in a way that \
makes the new problem seem more complex, but doesn't actually change the \
solution. The added information should appear relevant but be effectively \
irrelevant to solving the core problem. The goal is to create a problem that \
appears more complicated than it actually is.

NEVER DISCLOSE THAT ADDED INFORMATION IS IRRELEVANT OR THAT IT DOESN'T \
AFFECT THE PROBLEM.
IF A QUESTION IS POSED IN PROBLEM A, THAT QUESTION MUST REMAIN THE SAME.
IF INPUT, OUTPUT, AND EXAMPLES ARE PRESENT IN PROBLEM A, INCLUDE THEM IN THE \
FINAL PROBLEM AS WELL.

{% elif combination_type == 'complication' %}
Focus on complication: Integrate concepts from Problem B into Problem A to \
genuinely increase the complexity. The solution should require understanding \
and applying elements from both problems. The new problem should be more \
challenging but still logically coherent and solvable.

GIVEN THAT IT'S A NEW PROBLEM, DO NOT INCLUDE EXEMPLARY INPUT AND OUTPUT FROM \
THE ORIGINAL PROBLEM.
{% endif %}

MAKE SURE TO INCORPORATE CONCEPTS ONLY FROM PROBLEM B.
Your augmented problem should be believable and appear as a natural, cohesive \
question without artificial divisions between the original elements.
A reader should not be able to easily identify which parts came from Problem A \
versus Problem B.

DO NOT USE WORDS "PROBLEM A" OR "PROBLEM B" IN YOUR RESPONSE. INSTEAD, \
PROVIDE A COMPLETE PROBLEM STATEMENT.
"""

# Stage 3 prompt: picks one of the Stage 2 candidates using the same
# adherence/difficulty/clarity criteria Stage 4 later scores.
BEST_SELECTION_PROMPT = """\
### Original problem:
{{ problem_a }}

Examine candidate problems below which were created with the goal of \
{% if combination_type == 'obfuscation' %}
adding information to make the original problem seem more complex, without \
actually changing the solution. The added information should be effectively \
irrelevant to solving the core problem.
{% elif combination_type == 'complication' %}
genuinely increasing the complexity of the original problem by incorporating \
new concepts that are logically coherent and solvable. Solving the new problem \
should require understanding and applying newly introduced concepts.
{% endif %}

### Candidate problems:
{{ problem_candidates }}

Select the BEST problem based on the following criteria:
1. Goal adherence (1-3): How well does the new problem adhere to the goal of \
{% if combination_type == 'obfuscation' %}
obfuscating the original problem without actually changing the solution
{% elif combination_type == 'complication' %}
increasing the complexity of the original problem by incorporating new \
concepts that are logically coherent and solvable
{% endif %}
2. Difficulty (1-3): How challenging is the problem to answer, compared to the \
original problem.
3. Clarity (1-3): Is the problem clearly formulated and coherent?
"""

# Stage 4 prompt: scores the selected problem (rubrics are carried by the
# NewProblemEvals field descriptions in the structured-output schema).
EVALUATION_PROMPT = """\
### Original problem:
{{ problem_a }}

### New problem:
{{ new_problem }}

The new problem was created with the goal of \
{% if combination_type == 'obfuscation' %}
adding information to make the original problem seem more complex, without \
actually changing the solution. The added information should be effectively \
irrelevant to solving the core problem.
{% elif combination_type == 'complication' %}
genuinely increasing the complexity of the original problem by incorporating \
new concepts that are logically coherent and solvable. Solving the new problem \
should require understanding and applying newly introduced concepts.
{% endif %}

## Instructions:
1. Carefully examine and compare the new problem to the original problem.
2. Evaluate the new problem on goal adherence, difficulty and clarity.
"""
+
+
+# =============================================================================
+# Pipeline builder
+# =============================================================================
+
+
def build_config(model_alias: str, seed_path: Path) -> dd.DataDesignerConfigBuilder:
    """Assemble the InfiniByte pipeline: seed + sampler + four LLM stages."""
    builder = dd.DataDesignerConfigBuilder()

    # Seed: cross-joined (problem_a, problem_b) pairs, shuffled on read.
    builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=str(seed_path)),
        sampling_strategy=dd.SamplingStrategy.SHUFFLE,
    )

    # Columns are declared in dependency order: each stage's Jinja prompt may
    # reference the outputs of earlier stages.
    pipeline_columns = [
        # Stage 1: sample the augmentation mode for each record.
        dd.SamplerColumnConfig(
            name="combination_type",
            sampler_type=dd.SamplerType.CATEGORY,
            params=dd.CategorySamplerParams(values=["obfuscation", "complication"]),
        ),
        # Stage 2: generate two candidate augmented problems.
        dd.LLMStructuredColumnConfig(
            name="problem_candidates",
            prompt=CANDIDATE_GENERATION_PROMPT,
            system_prompt=PROBLEM_SYSTEM_PROMPT,
            output_format=NewProblemList,
            model_alias=model_alias,
        ),
        # Stage 3: select the best candidate (with reasoning) ...
        dd.LLMStructuredColumnConfig(
            name="best_problem_json",
            prompt=BEST_SELECTION_PROMPT,
            system_prompt=PROBLEM_SYSTEM_PROMPT,
            output_format=NewProblemWithReasoning,
            model_alias=model_alias,
        ),
        # ... and expose its text as a plain column for the later stages.
        dd.ExpressionColumnConfig(
            name="new_problem",
            expr="{{ best_problem_json.problem }}",
        ),
        # Stage 4: score difficulty / clarity / adherence; the structured
        # fields remain addressable downstream (e.g. {{ evals.difficulty }}).
        dd.LLMStructuredColumnConfig(
            name="evals",
            prompt=EVALUATION_PROMPT,
            system_prompt=PROBLEM_SYSTEM_PROMPT,
            output_format=NewProblemEvals,
            model_alias=model_alias,
        ),
        # Stage 5: solve the selected problem.
        dd.LLMTextColumnConfig(
            name="solution",
            prompt="Solve the following problem: {{ new_problem }}",
            system_prompt=SOLUTION_SYSTEM_PROMPT,
            model_alias=model_alias,
        ),
    ]
    for column in pipeline_columns:
        builder.add_column(column)

    return builder
+
+
+# =============================================================================
+# Dataset creation
+# =============================================================================
+
+
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run the configured pipeline and return the creation results."""
    # A fresh DataDesigner per call keeps runs independent of each other.
    designer = DataDesigner(artifact_path=artifact_path)
    return designer.create(config_builder, num_records=num_records)
+
+
+if __name__ == "__main__":
+ from argparse import ArgumentParser
+
+ parser = ArgumentParser(description="Nemotron Nano InfiniByte Recipe")
+ parser.add_argument("--model-alias", type=str, default="nvidia-text")
+ parser.add_argument("--num-records", type=int, default=5)
+ parser.add_argument("--artifact-path", type=str, default=None)
+ parser.add_argument(
+ "--strategy",
+ type=str,
+ default="ocr_omr",
+ choices=list(STRATEGIES.keys()),
+ help="Cross-join strategy: which two datasets to combine (default: ocr_omr)",
+ )
+ parser.add_argument(
+ "--limit",
+ type=int,
+ default=10_000,
+ help="Maximum number of cross-joined pairs to sample (default: 10000)",
+ )
+ args = parser.parse_args()
+
+ print("=== Step 1: Preparing seed data ===")
+ seed_path = prepare_seed_data(strategy=args.strategy, limit=args.limit)
+
+ print("\n=== Step 2: Building pipeline and generating data ===")
+ config_builder = build_config(model_alias=args.model_alias, seed_path=seed_path)
+ results = create_dataset(
+ config_builder,
+ num_records=args.num_records,
+ artifact_path=args.artifact_path,
+ )
+
+ print(f"\nDataset saved to: {results.artifact_storage.final_dataset_path}")
+ results.load_analysis().to_report()
diff --git a/docs/assets/recipes/model_usability/prompt_sensitivity.py b/docs/assets/recipes/model_usability/prompt_sensitivity.py
new file mode 100644
index 000000000..4a4673e0e
--- /dev/null
+++ b/docs/assets/recipes/model_usability/prompt_sensitivity.py
@@ -0,0 +1,528 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "data-designer",
+# "pandas",
+# ]
+# ///
+"""Nemotron Nano Prompt Sensitivity Recipe: Regex-Verified Preamble Generation
+
+Generate diverse prompt preambles for RL training that pair natural-language
+instructions with regex-verified output format specifications. Each record
+contains a paraphrased preamble, a format instruction aligned to a regex
+pattern, and a composed user prompt -- all scored by four LLM judges.
+
+This recipe implements the preamble generation stage of the prompt sensitivity
+pipeline used for Nemotron Nano training. The key idea: seed the pipeline with
+10 regex-based answer formats (boxed, double-parens, angle brackets, XML tags,
+etc.), then use an LLM to paraphrase both the instruction preamble and the
+format specification while preserving the regex contract. Placement order
+variants control where the preamble, format instruction, and {problem}
+placeholder appear in the final prompt.
+
+Pipeline architecture:
+
+ ┌─────────────────────────────────────────────────────────────────────────┐
+ │ SEED DATA: 10 regex formats x 30 preambles │
+ │ Each row: format_key, output_regex, seed_preamble, │
+ │ seed_format_instruction │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 1: DIVERSITY SAMPLERS (7 columns) │
+ │ sentence_length, sentence_type, tone, strictness_level, │
+ │ verbosity_level, domain_context, preamble_format_order │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 2: PREAMBLE GENERATION (LLM) │
+ │ Paraphrase seed preamble respecting sampled controls. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 3: FORMAT INSTRUCTION GENERATION (LLM) │
+ │ Paraphrase format instruction preserving regex intent. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 4: USER PROMPT COMPOSITION (LLM) │
+ │ Assemble preamble + format instruction + {problem} placeholder. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 5: QUALITY SCORING (4 LLM judges) │
+ │ format_compliance (0-2), regex_alignment (0-1), │
+ │ order_coherence (0-1), preamble_quality (0-3) │
+ └────────────────────────────────────────────────────────────────────────┘
+
+Prerequisites:
+ - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
+
+Run:
+ uv run prompt_sensitivity.py
+
+ uv run prompt_sensitivity.py --num-records 200
+
+ uv run prompt_sensitivity.py --help
+"""
+
+from __future__ import annotations
+
+import tempfile
+from pathlib import Path
+
+import pandas as pd
+
+import data_designer.config as dd
+from data_designer.interface import DataDesigner, DatasetCreationResults
+
+# =============================================================================
+# Seed data: regex format templates + seed preambles
+# =============================================================================
+
# Regex-verified answer formats. ``output_regex`` must extract the final
# answer from a response that follows ``seed_format_instruction``; the pair
# forms a contract that the downstream paraphrasing stages must preserve.
FORMAT_TEMPLATES = [
    {
        "format_key": "fmt_00",
        # Matches a literal \boxed{...} and captures its contents (the
        # original pattern used a word boundary and a character class, so it
        # could never match).
        "output_regex": r"\\boxed\{(.*?)\}",
        "seed_format_instruction": "Make sure to put the answer (and only answer) inside \\boxed{}.",
    },
    {
        "format_key": "fmt_01",
        "output_regex": r"\(\((.*?)\)\)",
        "seed_format_instruction": "Your final answer (and only the answer) should be enclosed within double parentheses.",
    },
    {
        "format_key": "fmt_02",
        "output_regex": r"Answer is\s*\[([A-Za-z])\]",
        "seed_format_instruction": "Wrap answer in square brackets at the end: 'Answer is [X]'.",
    },
    {
        "format_key": "fmt_03",
        "output_regex": r"\(Answer:\s*([A-Za-z])\)",
        "seed_format_instruction": "Conclude with (Answer: X), where X is the final answer.",
    },
    {
        "format_key": "fmt_04",
        "output_regex": r"Final Answer:\s*\|\|(.*?)\|\|",
        "seed_format_instruction": "End your response with Final Answer: ||X||, where X is the final answer (and only the answer).",
    },
    {
        "format_key": "fmt_05",
        # Closing bracket now matches the instruction's "[Answer: X]" form
        # (the original pattern ended with an unmatched "\)").
        "output_regex": r"\[Answer:\s*([A-Za-z])\]",
        "seed_format_instruction": "Remember to end with [Answer: X], where X is the final answer.",
    },
    {
        "format_key": "fmt_06",
        "output_regex": r"<<(.*?)>>",
        "seed_format_instruction": "Put your final answer in double angle brackets.",
    },
    {
        "format_key": "fmt_07",
        "output_regex": r"\s*\*\*(.*?)\*\*",
        "seed_format_instruction": "Give the answer at the end in this format -> **X**, where X is final answer.",
    },
    {
        "format_key": "fmt_08",
        # XML-style tag format; the literal tags appear to have been stripped
        # from both the regex and the instruction in the original text.
        "output_regex": r"<answer>(.*?)</answer>",
        "seed_format_instruction": "Wrap your final answer in XML-style tags like this: <answer>X</answer>.",
    },
    {
        "format_key": "fmt_09",
        # Same \boxed{...} contract as fmt_00, with a stricter instruction.
        "output_regex": r"\\boxed\{(.*?)\}",
        "seed_format_instruction": "The final answer must be in \\boxed{} format. It's crucial to follow this format.",
    },
]
+
# 30 seed instruction preambles; Stage 2 paraphrases one per record under the
# sampled style controls. Crossed with the 10 formats -> 300 seed rows.
SEED_PREAMBLES = [
    "Solve the following problem step by step.",
    "Provide a comprehensive solution to the problem below.",
    "Think carefully and solve the following.",
    "Analyze the problem and provide your answer.",
    "Work through the following problem methodically.",
    "Read the problem carefully and provide a detailed solution.",
    "Consider all aspects of the problem before answering.",
    "Break down the problem into steps and solve it.",
    "Explain your reasoning as you solve the following.",
    "Solve the problem, then provide the answer.",
    "Provide your reasoning for the answer and state the final result.",
    "Think step-by-step before giving your final answer.",
    "Carefully analyze the following and provide your solution.",
    "Show your work as you solve the following problem.",
    "Reason through the problem below and give your answer.",
    "Solve the following. Show your reasoning.",
    "Provide a clear and complete solution.",
    "Answer the following question with full explanation.",
    "Walk through the solution step by step.",
    "Read the following problem and solve it completely.",
    "Give a thorough answer to the problem below.",
    "Determine the answer to the following problem.",
    "Present your solution with supporting reasoning.",
    "Evaluate the following and provide your answer.",
    "Think about the problem below and respond with your solution.",
    "Provide a well-reasoned answer to the following.",
    "Solve this problem. Explain each step.",
    "Work out the answer to the following question.",
    "Address the problem below with a complete solution.",
    "Respond to the following with a clear answer.",
]
+
+# =============================================================================
+# Placement order variants
+# =============================================================================
+
# Placement variants for the composed user prompt: P = preamble,
# F = format instruction, {problem} = literal placeholder for the question.
# "PF" variants ask the composer to merge P and F into a single sentence.
PLACEMENT_ORDERS = [
    "P + F + {problem}",
    "F + P + {problem}",
    "P + {problem} + F",
    "F + {problem} + P",
    "{problem} + P + F",
    "{problem} + F + P",
    "PF + {problem}",
    "{problem} + PF",
]
+
+# =============================================================================
+# LLM prompts
+# =============================================================================
+
# Stage 2 prompt: paraphrase the seed preamble under the sampled controls.
PREAMBLE_GEN_PROMPT = """\
You are rewriting a seed preamble for an open-ended question prompt.

Seed preamble: {{ seed_preamble }}

Constraints:
- Sentence length: {{ sentence_length }}
- Sentence type: {{ sentence_type }}
- Tone: {{ tone }}
- Strictness: {{ strictness_level }}
- Verbosity: {{ verbosity_level }}
- Domain: {{ domain_context }}

Instructions:
- Paraphrase the seed preamble (do NOT copy it verbatim).
- Produce a concise instruction line for generic open-ended questions.
- Keep it neutral and generic; do NOT include output formatting requirements.
- Respect the constraints above (length, type, tone, strictness, verbosity).
- Output ONLY the rewritten preamble, nothing else.
"""

# Stage 3 prompt: paraphrase the format instruction while keeping it
# consistent with the output_regex contract.
FORMAT_INSTRUCTION_GEN_PROMPT = """\
You are rewriting a format instruction that tells the user how to present their final answer.

Seed format instruction: {{ seed_format_instruction }}
Output regex pattern: {{ output_regex }}

Constraints:
- Sentence length: {{ sentence_length }}
- Tone: {{ tone }}

Instructions:
- Paraphrase the seed format instruction while preserving its intent.
- The instruction must unambiguously specify how the final answer should be formatted.
- The answer must be required at the end of the response.
- Do NOT refer to the type of answer (sentence, paragraph, math expression).
- Respect the sentence length and tone constraints.
- Output ONLY the rewritten format instruction, nothing else.
"""

# Stage 4 prompt: assemble preamble, format instruction, and the {problem}
# placeholder in the sampled placement order.
USER_PROMPT_COMPOSITION_PROMPT = """\
Compose a final user prompt from the following parts.

Preamble (P): {{ preamble }}
Format instruction (F): {{ format_instruction }}
Placement order: {{ preamble_format_order }}

Instructions:
- Concatenate the parts in the order specified by "Placement order".
- Use {problem} as a literal placeholder for the question text.
- For "PF" or "FP" merged orders, combine P and F into a single natural sentence.
- Ensure {problem} has newlines before and after it for readability.
- Preserve the exact text of P and F; do NOT abbreviate or add new content.
- Output ONLY the composed user prompt, nothing else.
"""
+
+# =============================================================================
+# Judge rubrics
+# =============================================================================
+
# Judge rubrics: each judge column scores one dd.Score; the option keys are
# the allowed score values and feed directly into the judge's schema.

# 0-2: does the instruction force the answer format and end-of-response placement?
FORMAT_COMPLIANCE_SCORES = [
    dd.Score(
        name="Format Compliance",
        description="Does the format instruction unambiguously enforce the intended output format and require the answer at end of response?",
        options={
            "2": "Explicit, unambiguous, requires ending with answer in specified format.",
            "1": "Mentions format but leaves room for trailing text after the answer.",
            "0": "Ambiguous, doesn't mention format, or specifies an alternative format.",
        },
    ),
]

# 0-1: does the paraphrased instruction still match the regex contract?
REGEX_ALIGNMENT_SCORES = [
    dd.Score(
        name="Regex Alignment",
        description="Does the format instruction semantically and structurally align with the output_regex pattern?",
        options={
            "1": "Instruction matches the regex pattern intent.",
            "0": "Instruction conflicts with or deviates from the regex intent.",
        },
    ),
]

# 0-1: is the composed prompt coherent in the sampled placement order?
ORDER_COHERENCE_SCORES = [
    dd.Score(
        name="Order Coherence",
        description="Is the composed user prompt coherent with respect to the ordering of preamble, format instruction, and {problem} placeholder?",
        options={
            "1": "Makes sense given the part ordering.",
            "0": "Confusing or contradictory ordering.",
        },
    ),
]

# 0-3: overall preamble quality against the sampled style controls.
PREAMBLE_QUALITY_SCORES = [
    dd.Score(
        name="Preamble Quality",
        description="Assess the preamble for clarity, concision, generic tone, and adherence to the sampled controls.",
        options={
            "3": "Clear, concise, generic, adheres to all controls.",
            "2": "Good with minor issues in tone or length.",
            "1": "Fair with noticeable issues.",
            "0": "Poor, unclear, or conflicts with controls.",
        },
    ),
]
+
+# =============================================================================
+# Judge prompts
+# =============================================================================
+
# Judge prompts: each shows the judge only the columns it needs; the scoring
# rubric itself comes from the corresponding *_SCORES definition.

FORMAT_COMPLIANCE_JUDGE_PROMPT = """\
Evaluate the format instruction for compliance.

Format instruction: {{ format_instruction }}
Output regex: {{ output_regex }}
Seed format instruction: {{ seed_format_instruction }}
"""

REGEX_ALIGNMENT_JUDGE_PROMPT = """\
Evaluate whether the format instruction aligns with the regex pattern.

Format instruction: {{ format_instruction }}
Output regex: {{ output_regex }}
"""

ORDER_COHERENCE_JUDGE_PROMPT = """\
Evaluate whether the composed user prompt is coherent given the placement order.

User prompt: {{ user_prompt }}
Preamble: {{ preamble }}
Format instruction: {{ format_instruction }}
Placement order: {{ preamble_format_order }}
"""

PREAMBLE_QUALITY_JUDGE_PROMPT = """\
Evaluate the preamble for quality.

Preamble: {{ preamble }}
Seed preamble: {{ seed_preamble }}
Sentence length: {{ sentence_length }}
Sentence type: {{ sentence_type }}
Tone: {{ tone }}
Strictness: {{ strictness_level }}
Verbosity: {{ verbosity_level }}
"""
+
+
+# =============================================================================
+# Seed data builder
+# =============================================================================
+
+
def build_seed_dataframe() -> pd.DataFrame:
    """Return one seed row per (format template, preamble) pair — the full cross product."""
    records = [
        {
            "format_key": fmt["format_key"],
            "output_regex": fmt["output_regex"],
            "seed_format_instruction": fmt["seed_format_instruction"],
            "seed_preamble": preamble,
        }
        for fmt in FORMAT_TEMPLATES
        for preamble in SEED_PREAMBLES
    ]
    return pd.DataFrame(records)
+
+
+# =============================================================================
+# Pipeline builder
+# =============================================================================
+
+
def build_config(model_alias: str) -> tuple[dd.DataDesignerConfigBuilder, Path]:
    """Assemble the prompt-sensitivity pipeline config.

    Returns the config builder together with the path of the temporary
    seed CSV it wrote (formats x preambles cross product).
    """
    builder = dd.DataDesignerConfigBuilder()

    # ── Seed data ────────────────────────────────────────────────────────
    # NOTE(review): the temp dir from mkdtemp() is never cleaned up here —
    # presumably acceptable for a one-shot recipe; confirm if reused.
    seed_frame = build_seed_dataframe()
    seed_path = Path(tempfile.mkdtemp()) / "prompt_sensitivity_seed.csv"
    seed_frame.to_csv(seed_path, index=False)

    builder.with_seed_dataset(
        dd.LocalFileSeedSource(path=str(seed_path)),
        sampling_strategy=dd.SamplingStrategy.SHUFFLE,
    )

    # ── Stage 1: Diversity samplers ──────────────────────────────────────
    # One categorical column per diversity control; order preserved.
    category_columns = [
        ("sentence_length", ["short", "medium"]),
        ("sentence_type", ["imperative", "declarative", "interrogative"]),
        ("tone", ["formal", "neutral", "concise", "informal", "strict"]),
        ("strictness_level", ["low", "medium", "high"]),
        ("verbosity_level", ["concise", "standard"]),
        ("domain_context", ["general"]),
        ("preamble_format_order", PLACEMENT_ORDERS),
    ]
    for column_name, values in category_columns:
        builder.add_column(
            dd.SamplerColumnConfig(
                name=column_name,
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=values),
            )
        )

    # ── Stages 2-4: preamble, format instruction, composed user prompt ───
    text_columns = [
        ("preamble", PREAMBLE_GEN_PROMPT),
        ("format_instruction", FORMAT_INSTRUCTION_GEN_PROMPT),
        ("user_prompt", USER_PROMPT_COMPOSITION_PROMPT),
    ]
    for column_name, prompt in text_columns:
        builder.add_column(
            dd.LLMTextColumnConfig(
                name=column_name,
                prompt=prompt,
                model_alias=model_alias,
            )
        )

    # ── Stage 5: Quality scoring (four independent LLM judges) ───────────
    judge_columns = [
        ("format_compliance_result", FORMAT_COMPLIANCE_JUDGE_PROMPT, FORMAT_COMPLIANCE_SCORES),
        ("regex_alignment_result", REGEX_ALIGNMENT_JUDGE_PROMPT, REGEX_ALIGNMENT_SCORES),
        ("order_coherence_result", ORDER_COHERENCE_JUDGE_PROMPT, ORDER_COHERENCE_SCORES),
        ("preamble_quality_result", PREAMBLE_QUALITY_JUDGE_PROMPT, PREAMBLE_QUALITY_SCORES),
    ]
    for column_name, prompt, scores in judge_columns:
        builder.add_column(
            dd.LLMJudgeColumnConfig(
                name=column_name,
                prompt=prompt,
                scores=scores,
                model_alias=model_alias,
            )
        )

    return builder, seed_path
+
+
+# =============================================================================
+# Dataset creation
+# =============================================================================
+
+
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run Data Designer on the given config and return the creation results."""
    designer = DataDesigner(artifact_path=artifact_path)
    return designer.create(config_builder, num_records=num_records)
+
+
+if __name__ == "__main__":
+ from argparse import ArgumentParser
+
+ parser = ArgumentParser(description="Nemotron Nano Prompt Sensitivity Recipe")
+ parser.add_argument("--model-alias", type=str, default="nvidia-text")
+ parser.add_argument("--num-records", type=int, default=10)
+ parser.add_argument("--artifact-path", type=str, default=None)
+ args = parser.parse_args()
+
+ config_builder, _seed_path = build_config(model_alias=args.model_alias)
+ results = create_dataset(
+ config_builder,
+ num_records=args.num_records,
+ artifact_path=args.artifact_path,
+ )
+
+ print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
+ results.load_analysis().to_report()
diff --git a/docs/assets/recipes/model_usability/structured_data.py b/docs/assets/recipes/model_usability/structured_data.py
new file mode 100644
index 000000000..d0a87737a
--- /dev/null
+++ b/docs/assets/recipes/model_usability/structured_data.py
@@ -0,0 +1,434 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+# "data-designer",
+# ]
+# ///
+"""Nemotron Nano Structured Data Recipe: Multi-Format Schema Generation
+
+Generate synthetic structured data across multiple output formats (JSON, YAML,
+XML, Markdown) with controlled schema complexity, conversational grounding,
+and best-of-3 candidate generation.
+
+This recipe implements the pipeline used to produce structured-data SFT records
+for Nemotron Nano training. Each record contains a generated schema, a natural
+user request, grounding Q&A conversation pairs, and three candidate structured
+outputs that conform to the schema.
+
+Pipeline architecture:
+
+ ┌─────────────────────────────────────────────────────────────────────────┐
+ │ STAGE 1: SAMPLING (DIVERSITY CONTROLS) │
+ │ │
+ │ Format Controls Schema Controls Conversation Controls │
+ │ ├─ output_format ├─ schema_rigidity ├─ num_turns │
+ │ └─ topic (conditional) ├─ schema_fields_count ├─ tone │
+ │ ├─ schema_complexity └─ detail_level │
+ │ └─ nesting_depth │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 2: SCHEMA GENERATION (LLM) │
+ │ Generates format-specific schema from sampled controls + topic. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 3: USER PROMPT GENERATION (LLM) │
+ │ Natural-language request matching tone and detail level. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 4: CONVERSATION PAIRS (LLM) │
+ │ Q&A pairs covering schema facts for grounding. │
+ ├────────────────────────────────────────────────────────────────────────┤
+ │ STAGE 5: STRUCTURED OUTPUT (LLM, best-of-3) │
+ │ Three candidate structured outputs conforming to the schema. │
+ └────────────────────────────────────────────────────────────────────────┘
+
+Prerequisites:
+ - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases.
+
+Run:
+ uv run structured_data.py
+
+ uv run structured_data.py --num-records 100 --output-format json
+
+ uv run structured_data.py --help
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import data_designer.config as dd
+from data_designer.interface import DataDesigner, DatasetCreationResults
+
+# =============================================================================
+# Topics: representative subset of categories and subtopics
+# =============================================================================
+
# Category -> subtopics. Keys feed the `topic_category` CATEGORY sampler and
# the full mapping feeds the `topic_subtopic` SUBCATEGORY sampler in
# build_config(), so every sampled subtopic is consistent with its category.
TOPICS: dict[str, list[str]] = {
    "Leisure Activities": [
        "Outdoor Recreation",
        "Board Games",
        "DIY Crafts",
        "Photography",
        "Gardening",
    ],
    "Daily Life": [
        "Morning Routines",
        "Grocery Shopping",
        "Commuting",
        "Household Chores",
        "Meal Planning",
    ],
    "Education and Learning": [
        "Online Courses",
        "Study Techniques",
        "Language Learning",
        "STEM Education",
        "Library Systems",
    ],
    "Technology and Gadgets": [
        "Smartphones",
        "Smart Home Devices",
        "Wearable Tech",
        "Cloud Computing",
        "Cybersecurity Basics",
    ],
    "Health and Wellness": [
        "Nutrition Planning",
        "Mental Health",
        "Exercise Routines",
        "Sleep Hygiene",
        "Preventive Care",
    ],
    "Finance and Money": [
        "Personal Budgeting",
        "Investment Basics",
        "Tax Preparation",
        "Credit Management",
        "Retirement Planning",
    ],
    "Food and Cooking": [
        "Baking Techniques",
        "Meal Prep",
        "International Cuisines",
        "Dietary Restrictions",
        "Kitchen Equipment",
    ],
    "Travel and Transportation": [
        "Trip Planning",
        "Public Transit",
        "Road Trips",
        "Travel Insurance",
        "Packing Strategies",
    ],
    "Arts and Culture": [
        "Music Theory",
        "Film Analysis",
        "Theater Production",
        "Contemporary Art",
        "Creative Writing",
    ],
    "Work and Careers": [
        "Resume Building",
        "Interview Preparation",
        "Remote Work",
        "Project Management",
        "Career Transitions",
    ],
}
+
+# =============================================================================
+# Prompts
+# =============================================================================
+
# Stage 2 prompt: generate a format-specific schema from the sampled
# controls (rigidity, field count, complexity, nesting depth) and topic.
SCHEMA_GENERATION_PROMPT = """\
Create a schema for a structured object response in the format {{ output_format }}.

Controls:
- Rigidity: {{ schema_rigidity }}
- Top-level properties: {{ schema_fields_count }}
- Complexity: {{ schema_complexity }}
- Nesting depth: {{ nesting_depth }}
- Topic: {{ topic_category }} / {{ topic_subtopic }}

Instructions:
- Output only an object with keys: "name", "schema", and "strict", formatted as \
{{ output_format }}.
- "name" must be appropriate with the Topic: {{ topic_category }} / {{ topic_subtopic }}
- "schema" should be a valid structured schema as specified in {{ output_format }}.
- Use {{ schema_fields_count }} top-level properties, relevant to the topic.
- Include at least one boolean and, if appropriate, one enum.
- All top-level properties must be listed in "required".
- Set "additionalProperties": false at every object level.
- If {{ schema_complexity }} is "complex", make the schema deeply nested: at least two \
levels of nested objects, with at least one object nested three levels deep. Keep nesting \
relevant to the topic.
- If "simple", keep nesting minimal or flat.
- "strict" must be true.

Formatting by output_format:
- "json": Output a valid JSON object, no code fences or comments.
- "yaml": Output a valid YAML object, no code fences or comments.
- "xml": Output a valid XML document with root "root" and child elements "name", "schema", \
and "strict". "schema" can be a string or nested XML.
- "markdown": Output a Markdown code block with the JSON object, using triple backticks and \
"json" as the language, no extra text.

Output only the object in the specified format. No explanations or extra text.
"""

# Stage 3 prompt: a natural-language user request that would lead to filling
# in the schema, matching the sampled tone and detail level (schema hidden).
USER_PROMPT_GENERATION = """\
You are a human user asking an AI assistant to produce a structured output. Write a natural, \
concise request that would lead to filling in a schema about {{ topic_category }} / \
{{ topic_subtopic }}.

The request should:
- Sound like something a real person would type or say
- Describe what data they want without exposing the schema itself
- Mention the desired output format: {{ output_format }}
- Match the tone: {{ tone }} and detail level: {{ detail_level }}

Do not include the schema, code fences, or technical formatting. Just the user request.
"""

# Stage 4 prompt: grounding Q&A pairs that make the schema's facts
# unambiguous; output is a Python list of [question, answer] pairs, not JSON.
CONVERSATION_PROMPT = """\
Write a short Q&A conversation about the following topic. Follow the selected JSON Schema \
fields as the underlying facts to cover, but DO NOT output JSON here.

Topic context:
- Category: {{ topic_category }}
- Subtopic: {{ topic_subtopic }}

Constraints:
- Number of Q&A pairs: {{ num_turns }}
- Tone: {{ tone }}
- Detail level: {{ detail_level }}

Write alternating question/answer pairs that make these facts unambiguous for the chosen \
schema: {{ structured_schema }}
Return only a Python list of [question, answer] pairs (no extra text).
"""
+
# Stage 5 prompt: render a schema-conforming instance in the target format,
# grounded only in the generated conversation answers.
#
# Fix: the XML rule previously read `an XML document with root .` — the root
# element name had been dropped (likely eaten as markup). Restored to "root",
# consistent with SCHEMA_GENERATION_PROMPT's XML rule.
STRUCTURED_OUTPUT_PROMPT = """\
You will produce a {{ output_format }} instance that conforms strictly to the following \
schema (no extra keys).

Schema:
{{ structured_schema }}

You are given a Python list of [question, answer] pairs:
{{ conversation_pairs }}

Instructions:
- Derive values only from the answers given.
- Render ONLY the {{ output_format }} instance, with no commentary.
- Formatting rules:
  - If output_format is "json", output a single JSON object (no code fences).
  - If output_format is "yaml", output a YAML mapping (no code fences).
  - If output_format is "xml", output an XML document with root "root".
  - If output_format is "markdown", output a fenced code block with ```json.
- Ensure the content validates against the schema when parsed back to JSON.
"""
+
+# =============================================================================
+# Supported output formats
+# =============================================================================
+
+OUTPUT_FORMATS = ["json", "yaml", "xml", "markdown"]
+
+
+# =============================================================================
+# Pipeline builder
+# =============================================================================
+
+
def build_config(
    model_alias: str,
    output_format: str | None = None,
) -> dd.DataDesignerConfigBuilder:
    """Assemble the five-stage structured-data pipeline config.

    Args:
        model_alias: Model alias used by every LLM column.
        output_format: When given, pin generation to that single format;
            otherwise sample from all OUTPUT_FORMATS.
    """
    builder = dd.DataDesignerConfigBuilder()

    # ── Stage 1: Sampling ────────────────────────────────────────────────

    # Per-record identifier, e.g. "SD-<short uppercase uuid>".
    builder.add_column(
        dd.SamplerColumnConfig(
            name="record_id",
            sampler_type=dd.SamplerType.UUID,
            params=dd.UUIDSamplerParams(prefix="SD-", short_form=True, uppercase=True),
        )
    )

    def add_category(column_name: str, values: list[str]) -> None:
        # One categorical diversity control per column.
        builder.add_column(
            dd.SamplerColumnConfig(
                name=column_name,
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=values),
            )
        )

    add_category("output_format", [output_format] if output_format else OUTPUT_FORMATS)
    add_category("topic_category", list(TOPICS))

    # Subtopic is drawn conditionally on the sampled topic_category.
    builder.add_column(
        dd.SamplerColumnConfig(
            name="topic_subtopic",
            sampler_type=dd.SamplerType.SUBCATEGORY,
            params=dd.SubcategorySamplerParams(
                category="topic_category",
                values=TOPICS,
            ),
        )
    )

    add_category("schema_rigidity", ["strict", "moderate"])
    add_category("schema_fields_count", [str(n) for n in range(3, 13)])
    add_category("schema_complexity", ["simple", "complex"])
    add_category("nesting_depth", [str(n) for n in range(1, 5)])
    add_category("num_turns", [str(n) for n in range(2, 9)])
    add_category("tone", ["neutral", "enthusiastic", "factual"])
    add_category("detail_level", ["brief", "standard", "detailed", "super verbose"])

    # ── Stages 2-4: schema, user prompt, conversation pairs ──────────────
    for column_name, prompt in (
        ("structured_schema", SCHEMA_GENERATION_PROMPT),
        ("user_prompt", USER_PROMPT_GENERATION),
        ("conversation_pairs", CONVERSATION_PROMPT),
    ):
        builder.add_column(
            dd.LLMTextColumnConfig(
                name=column_name,
                prompt=prompt,
                model_alias=model_alias,
            )
        )

    # ── Stage 5: Best-of-3 structured output candidates ──────────────────
    for candidate_index in range(3):
        builder.add_column(
            dd.LLMTextColumnConfig(
                name=f"structured_output_{candidate_index}",
                prompt=STRUCTURED_OUTPUT_PROMPT,
                model_alias=model_alias,
            )
        )

    return builder
+
+
+# =============================================================================
+# Dataset creation
+# =============================================================================
+
+
def create_dataset(
    config_builder: dd.DataDesignerConfigBuilder,
    num_records: int,
    artifact_path: Path | str | None = None,
) -> DatasetCreationResults:
    """Run Data Designer on the given config and return the creation results."""
    designer = DataDesigner(artifact_path=artifact_path)
    return designer.create(config_builder, num_records=num_records)
+
+
+if __name__ == "__main__":
+ from argparse import ArgumentParser
+
+ parser = ArgumentParser(description="Nemotron Nano Structured Data Recipe")
+ parser.add_argument("--model-alias", type=str, default="nvidia-text")
+ parser.add_argument("--num-records", type=int, default=5)
+ parser.add_argument("--artifact-path", type=str, default=None)
+ parser.add_argument(
+ "--output-format",
+ type=str,
+ default=None,
+ choices=OUTPUT_FORMATS,
+ help="Generate for a single output format (default: all formats)",
+ )
+ args = parser.parse_args()
+
+ config_builder = build_config(
+ model_alias=args.model_alias,
+ output_format=args.output_format,
+ )
+ results = create_dataset(
+ config_builder,
+ num_records=args.num_records,
+ artifact_path=args.artifact_path,
+ )
+
+ print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}")
+ results.load_analysis().to_report()
diff --git a/docs/recipes/cards.md b/docs/recipes/cards.md
index ef2752936..5f85fc27b 100644
--- a/docs/recipes/cards.md
+++ b/docs/recipes/cards.md
@@ -172,6 +172,60 @@ Each recipe is a self-contained example that can be run independently.
[:material-book-open-page-variant: View Recipe](mcp_and_tooluse/search_agent.md){ .md-button }
[Download Code :octicons-download-24:](../assets/recipes/mcp_and_tooluse/search_agent.py){ .md-button download="search_agent.py" }
+- :material-code-json:{ .lg .middle } **Nemotron Nano Structured Data**
+
+ Generate multi-format structured data (JSON, YAML, XML, Markdown) with controlled schema complexity, conversational grounding, and best-of-3 candidate generation.
+
+ ---
+
+ **Demonstrates:**
+
+ - Multi-format structured output generation
+ - Conditional topic sampling with SubcategorySamplerParams
+ - Multi-stage LLM pipeline (schema, prompt, conversation, output)
+ - Best-of-3 candidate generation
+
+ ---
+
+ [:material-book-open-page-variant: View Recipe](model_usability/structured_data.md){ .md-button }
+ [Download Code :octicons-download-24:](../assets/recipes/model_usability/structured_data.py){ .md-button download="structured_data.py" }
+
+- :material-format-list-text:{ .lg .middle } **Nemotron Nano Prompt Sensitivity**
+
+ Generate diverse prompt preambles for RL training with regex-verified output format specifications, LLM paraphrasing, placement order variants, and four LLM judges.
+
+ ---
+
+ **Demonstrates:**
+
+ - Seed dataset with format templates and preambles
+ - LLM paraphrasing with diversity controls
+ - Regex-aligned format instruction generation
+ - 4 LLM judges (format compliance, regex alignment, order coherence, preamble quality)
+
+ ---
+
+ [:material-book-open-page-variant: View Recipe](model_usability/prompt_sensitivity.md){ .md-button }
+ [Download Code :octicons-download-24:](../assets/recipes/model_usability/prompt_sensitivity.py){ .md-button download="prompt_sensitivity.py" }
+
+- :material-infinity:{ .lg .middle } **Nemotron Nano InfiniByte**
+
+ Generate more diverse and complex problems by cross-breeding two HuggingFace datasets, augmenting problems through obfuscation or complication with structured LLM evaluation.
+
+ ---
+
+ **Demonstrates:**
+
+ - Cross-source dataset seeding from HuggingFace
+ - LLMStructuredColumnConfig with Pydantic models
+ - Multi-stage structured generation (candidates, selection, evaluation)
+ - Expression columns for score extraction
+
+ ---
+
+ [:material-book-open-page-variant: View Recipe](code_generation/infinibyte.md){ .md-button }
+ [Download Code :octicons-download-24:](../assets/recipes/code_generation/infinibyte.py){ .md-button download="infinibyte.py" }
+
- :material-file-document-multiple:{ .lg .middle } **Markdown Section Seed Reader**
Define a custom `FileSystemSeedReader` inline and turn Markdown files into one seed row per heading section.
diff --git a/docs/recipes/code_generation/infinibyte.md b/docs/recipes/code_generation/infinibyte.md
new file mode 100644
index 000000000..662757bc9
--- /dev/null
+++ b/docs/recipes/code_generation/infinibyte.md
@@ -0,0 +1,9 @@
+# Nemotron Nano InfiniByte
+
+Generate more diverse and complex training problems by cross-breeding two source datasets. Cross-joins coding and math/science problems, then uses an LLM to create augmented problems through obfuscation or complication, with structured evaluation and solution generation. Used for Nemotron Nano post-training data.
+
+[Download Code :octicons-download-24:](../../assets/recipes/code_generation/infinibyte.py){ .md-button download="infinibyte.py" }
+
+```python
+--8<-- "assets/recipes/code_generation/infinibyte.py"
+```
diff --git a/docs/recipes/model_usability/prompt_sensitivity.md b/docs/recipes/model_usability/prompt_sensitivity.md
new file mode 100644
index 000000000..92db61353
--- /dev/null
+++ b/docs/recipes/model_usability/prompt_sensitivity.md
@@ -0,0 +1,9 @@
+# Nemotron Nano Prompt Sensitivity
+
+Generate diverse prompt preambles for RL training that pair natural-language instructions with regex-verified output format specifications. Uses seed format templates, LLM paraphrasing, placement order variants, and four LLM judges to produce high-quality, format-compliant preambles. Used for Nemotron Nano prompt sensitivity training.
+
+[Download Code :octicons-download-24:](../../assets/recipes/model_usability/prompt_sensitivity.py){ .md-button download="prompt_sensitivity.py" }
+
+```python
+--8<-- "assets/recipes/model_usability/prompt_sensitivity.py"
+```
diff --git a/docs/recipes/model_usability/structured_data.md b/docs/recipes/model_usability/structured_data.md
new file mode 100644
index 000000000..aace7f064
--- /dev/null
+++ b/docs/recipes/model_usability/structured_data.md
@@ -0,0 +1,9 @@
+# Nemotron Nano Structured Data
+
+Generate multi-format structured data (JSON, YAML, XML, Markdown) with controlled schema complexity, conversational grounding, and best-of-3 candidate generation. Used for Nemotron Nano structured-output SFT training.
+
+[Download Code :octicons-download-24:](../../assets/recipes/model_usability/structured_data.py){ .md-button download="structured_data.py" }
+
+```python
+--8<-- "assets/recipes/model_usability/structured_data.py"
+```
diff --git a/mkdocs.yml b/mkdocs.yml
index 464491d4f..4ace5081f 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -46,6 +46,10 @@ nav:
- Text to Python: recipes/code_generation/text_to_python.md
- Text to SQL: recipes/code_generation/text_to_sql.md
- "Nemotron Super Text to SQL": recipes/code_generation/enterprise_text_to_sql.md
+ - "Nemotron Nano InfiniByte": recipes/code_generation/infinibyte.md
+ - Model Usability:
+ - "Nemotron Nano Structured Data": recipes/model_usability/structured_data.md
+ - "Nemotron Nano Prompt Sensitivity": recipes/model_usability/prompt_sensitivity.md
- Plugin Development:
- Markdown Section Seed Reader Plugin: recipes/plugin_development/markdown_seed_reader.md
- QA and Chat: