diff --git a/docs/assets/recipes/code_generation/infinibyte.py b/docs/assets/recipes/code_generation/infinibyte.py new file mode 100644 index 000000000..56f727413 --- /dev/null +++ b/docs/assets/recipes/code_generation/infinibyte.py @@ -0,0 +1,521 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "data-designer", +# "datasets", +# "pandas", +# ] +# /// +"""Nemotron Nano InfiniByte Recipe: Cross-Source Problem Generation + +Generate more diverse and complex training problems by cross-breeding two +source datasets. Each record pairs a "problem A" (e.g. a coding problem) with +a "problem B" (e.g. a math or science problem), then uses an LLM to create new +problems that incorporate concepts from both sources through either obfuscation +(adding plausible but irrelevant complexity) or complication (genuinely +increasing difficulty). + +This recipe implements the InfiniByte pipeline used for Nemotron Nano +post-training data. The key idea: rather than generating problems from scratch, +cross-join two existing problem datasets, then augment problem A with concepts +from problem B to produce novel, harder problems. + +Pipeline architecture: + + ┌─────────────────────────────────────────────────────────────────────────┐ + │ PRE-PROCESSING (outside Data Designer) │ + │ Download 2 HuggingFace datasets, cross-join with random sampling, │ + │ save as CSV seed file. │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 1: SEED + SAMPLER │ + │ Seed: cross-joined CSV (problem_a, problem_b pairs) │ + │ Sampler: combination_type (obfuscation / complication) │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 2: CANDIDATE GENERATION (LLM Structured) │ + │ Generate 2 candidate problems augmenting A with concepts from B. 
│ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 3: BEST PROBLEM SELECTION (LLM Structured) │ + │ Select the best candidate based on adherence, difficulty, clarity. │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 4: EVALUATION (LLM Structured) │ + │ Score difficulty (1-3), clarity (1-3), adherence (1-3). │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 5: SOLUTION GENERATION (LLM Text) │ + │ Solve the new problem. │ + └────────────────────────────────────────────────────────────────────────┘ + +Prerequisites: + - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases. + - Internet access for downloading HuggingFace datasets. + +Run: + # Basic usage (downloads OpenCodeReasoning + OpenMathReasoning, 100 records) + uv run infinibyte.py + + # Customize dataset strategy and record count + uv run infinibyte.py --strategy ocr_omr --num-records 500 --limit 10000 + + # For help message and available options + uv run infinibyte.py --help +""" + +from __future__ import annotations + +import hashlib +import random +import tempfile +from pathlib import Path + +import pandas as pd +from pydantic import BaseModel, Field + +import data_designer.config as dd +from data_designer.interface import DataDesigner, DatasetCreationResults + +# ============================================================================= +# Pydantic models for structured LLM outputs +# ============================================================================= + + +class NewProblem(BaseModel): + added_concepts: str = Field( + ..., description="Specific new concepts incorporated into the original problem" + ) + problem: str = Field( + ..., description="A new problem statement incorporating new concepts from the provided problems." 
+ ) + + +class NewProblemList(BaseModel): + problems: list[NewProblem] = Field( + default_factory=list, description="List of new problems" + ) + + +class NewProblemWithReasoning(NewProblem): + reasoning: str = Field( + ..., description="Concise explanation for selecting this specific new problem" + ) + + +class NewProblemEvals(BaseModel): + difficulty: int = Field( + ..., + description=( + "Evaluation of problem difficulty relative to original. " + "3: Significantly more challenging. " + "2: Moderately more difficult. " + "1: Similar to or easier than the original." + ), + ) + clarity: int = Field( + ..., + description=( + "Evaluation of clarity and coherence. " + "3: Exceptionally clear and well-structured. " + "2: Generally clear with minor issues. " + "1: Unclear or poorly structured." + ), + ) + adherence: int = Field( + ..., + description=( + "Evaluation of adherence to the combination goal. " + "3: Excellent adherence. " + "2: Adequate adherence. " + "1: Poor adherence." + ), + ) + + +# ============================================================================= +# Dataset download and cross-join +# ============================================================================= + +DATASET_CONFIGS = { + "ocr": { + "name": "nvidia/OpenCodeReasoning", + "config_name": "split_0", + "input_column": "input", + "output_column": "problem_a", + "output_id_column": "id_a", + "deduplicate": True, + "max_records": 500, + "random_seed": 27, + }, + "omr": { + "name": "nvidia/OpenMathReasoning", + "split": "tir", + "input_column": "problem", + "output_column": "problem_b", + "output_id_column": "id_b", + "deduplicate": True, + "max_records": 500, + "random_seed": 7, + }, +} + +STRATEGIES = { + "ocr_omr": ("ocr", "omr"), +} + + +def fetch_hf_dataset_to_df(dataset_config: dict) -> pd.DataFrame: + """Load a HuggingFace dataset via streaming into a DataFrame with id and text columns. 
+ + Uses HF streaming to avoid downloading the full dataset, collecting up to + ``max_records`` deduplicated rows. + """ + from datasets import load_dataset + + dataset_name = dataset_config["name"] + config_name = dataset_config.get("config_name") + split = dataset_config.get("split") + input_col = dataset_config.get("input_column", "text") + output_col = dataset_config.get("output_column", input_col) + output_id_col = dataset_config.get("output_id_column", "id") + dedupe = dataset_config.get("deduplicate", False) + max_records = dataset_config.get("max_records", 500) + random_seed = dataset_config.get("random_seed", 42) + + print(f"\n=== Streaming {dataset_name} (config={config_name}, split={split or 'all'}) ===") + + load_kwargs: dict = {"path": dataset_name, "streaming": True} + if config_name: + load_kwargs["name"] = config_name + if split: + load_kwargs["split"] = split + + ds = load_dataset(**load_kwargs) + + # For DatasetDict (no split specified), use the first available split + if hasattr(ds, "keys"): + split_name = list(ds.keys())[0] + print(f" Using split: {split_name}") + ds = ds[split_name] + + # Shuffle the stream for diversity, then collect up to max_records + ds = ds.shuffle(seed=random_seed) + + rows = [] + seen: set[str] = set() + scanned = 0 + for rec in ds: + if input_col not in rec: + continue + text = rec[input_col] + scanned += 1 + if dedupe: + if text in seen: + continue + seen.add(text) + rec_id = rec.get("id") or hashlib.md5(text.encode("utf-8")).hexdigest() + rows.append({output_id_col: rec_id, output_col: text}) + + if len(rows) >= max_records: + break + + if scanned % 10_000 == 0: + print(f" Scanned {scanned} records, collected {len(rows)}...") + + df = pd.DataFrame(rows) + print(f" Collected {len(df)} rows (scanned={scanned}, deduplicated={dedupe})") + + return df + + +def cross_join_with_limit(df1: pd.DataFrame, df2: pd.DataFrame, limit: int = 10_000) -> pd.DataFrame: + """Randomly sample pairs from the cartesian product of two 
DataFrames.""" + n1, n2 = len(df1), len(df2) + total = n1 * n2 + actual_limit = min(limit, total) + + print(f"\nCross-joining {n1} x {n2} = {total} possible pairs, sampling {actual_limit}") + + flat_indices = random.sample(range(total), actual_limit) + idx1 = [k // n2 for k in flat_indices] + idx2 = [k % n2 for k in flat_indices] + + sub1 = df1.iloc[idx1].reset_index(drop=True) + sub2 = df2.iloc[idx2].reset_index(drop=True) + + return pd.concat([sub1, sub2], axis=1) + + +def prepare_seed_data(strategy: str = "ocr_omr", limit: int = 10_000) -> Path: + """Download datasets, cross-join, and save as CSV. Returns the CSV path.""" + ds_a_key, ds_b_key = STRATEGIES[strategy] + + df_a = fetch_hf_dataset_to_df(DATASET_CONFIGS[ds_a_key]) + df_b = fetch_hf_dataset_to_df(DATASET_CONFIGS[ds_b_key]) + + cross_joined = cross_join_with_limit(df_a, df_b, limit=limit) + + seed_path = Path(tempfile.mkdtemp()) / "infinibyte_seed.csv" + cross_joined.to_csv(seed_path, index=False) + print(f"\nSeed data saved to: {seed_path} ({len(cross_joined)} rows)") + + return seed_path + + +# ============================================================================= +# LLM prompts +# ============================================================================= + +PROBLEM_SYSTEM_PROMPT = """\ +You are an experienced competitive programmer, well versed in algorithms, \ +data structures, mathematics, physics, chemistry, biology and other sciences. \ +You excel in crafting problems that combine multiple concepts into a cohesive \ +problem statement. +""" + +SOLUTION_SYSTEM_PROMPT = """\ +You are a helpful and harmless code assistant, well versed in competitive \ +coding problems and STEM subjects. You should think step-by-step before \ +responding to any instruction. + +You must use python programming language when generating code. 
+You must use the python code block for just the final solution with the \ +following format: +```python +# Your final solution goes here +``` +""" + +CANDIDATE_GENERATION_PROMPT = """\ +### Problem A: +{{ problem_a }} + +### Problem B: +{{ problem_b }} + +Carefully examine problems A and B above. Then formulate TWO new problems by \ +augmenting Problem A with concepts from Problem B. + +DO NOT REPEAT PROBLEM B VERBATIM WHEN AUGMENTING. INCORPORATE JUST THE \ +CONCEPTS FROM IT. + +{% if combination_type == 'obfuscation' %} +Focus on obfuscation: Add concepts from Problem B to Problem A in a way that \ +makes the new problem seem more complex, but doesn't actually change the \ +solution. The added information should appear relevant but be effectively \ +irrelevant to solving the core problem. The goal is to create a problem that \ +appears more complicated than it actually is. + +NEVER DISCLOSE THAT ADDED INFORMATION IS IRRELEVANT OR THAT IT DOESN'T \ +AFFECT THE PROBLEM. +IF A QUESTION IS POSED IN PROBLEM A, THAT QUESTION MUST REMAIN THE SAME. +IF INPUT, OUTPUT, AND EXAMPLES ARE PRESENT IN PROBLEM A, INCLUDE THEM IN THE \ +FINAL PROBLEM AS WELL. + +{% elif combination_type == 'complication' %} +Focus on complication: Integrate concepts from Problem B into Problem A to \ +genuinely increase the complexity. The solution should require understanding \ +and applying elements from both problems. The new problem should be more \ +challenging but still logically coherent and solvable. + +GIVEN THAT IT'S A NEW PROBLEM, DO NOT INCLUDE EXEMPLARY INPUT AND OUTPUT FROM \ +THE ORIGINAL PROBLEM. +{% endif %} + +MAKE SURE TO INCORPORATE CONCEPTS ONLY FROM PROBLEM B. +Your augmented problem should be believable and appear as a natural, cohesive \ +question without artificial divisions between the original elements. +A reader should not be able to easily identify which parts came from Problem A \ +versus Problem B. + +DO NOT USE WORDS "PROBLEM A" OR "PROBLEM B" IN YOUR RESPONSE. 
INSTEAD, \ +PROVIDE A COMPLETE PROBLEM STATEMENT. +""" + +BEST_SELECTION_PROMPT = """\ +### Original problem: +{{ problem_a }} + +Examine candidate problems below which were created with the goal of \ +{% if combination_type == 'obfuscation' %} +adding information to make the original problem seem more complex, without \ +actually changing the solution. The added information should be effectively \ +irrelevant to solving the core problem. +{% elif combination_type == 'complication' %} +genuinely increasing the complexity of the original problem by incorporating \ +new concepts that are logically coherent and solvable. Solving the new problem \ +should require understanding and applying newly introduced concepts. +{% endif %} + +### Candidate problems: +{{ problem_candidates }} + +Select the BEST problem based on the following criteria: +1. Goal adherence (1-3): How well does the new problem adhere to the goal of \ +{% if combination_type == 'obfuscation' %} +obfuscating the original problem without actually changing the solution +{% elif combination_type == 'complication' %} +increasing the complexity of the original problem by incorporating new \ +concepts that are logically coherent and solvable +{% endif %} +2. Difficulty (1-3): How challenging is the problem to answer, compared to the \ +original problem. +3. Clarity (1-3): Is the problem clearly formulated and coherent? +""" + +EVALUATION_PROMPT = """\ +### Original problem: +{{ problem_a }} + +### New problem: +{{ new_problem }} + +The new problem was created with the goal of \ +{% if combination_type == 'obfuscation' %} +adding information to make the original problem seem more complex, without \ +actually changing the solution. The added information should be effectively \ +irrelevant to solving the core problem. +{% elif combination_type == 'complication' %} +genuinely increasing the complexity of the original problem by incorporating \ +new concepts that are logically coherent and solvable. 
Solving the new problem \ +should require understanding and applying newly introduced concepts. +{% endif %} + +## Instructions: +1. Carefully examine and compare the new problem to the original problem. +2. Evaluate the new problem on goal adherence, difficulty and clarity. +""" + + +# ============================================================================= +# Pipeline builder +# ============================================================================= + + +def build_config(model_alias: str, seed_path: Path) -> dd.DataDesignerConfigBuilder: + config_builder = dd.DataDesignerConfigBuilder() + + # ── Seed data ──────────────────────────────────────────────────────── + + config_builder.with_seed_dataset( + dd.LocalFileSeedSource(path=str(seed_path)), + sampling_strategy=dd.SamplingStrategy.SHUFFLE, + ) + + # ── Stage 1: Combination type sampler ──────────────────────────────── + + config_builder.add_column( + dd.SamplerColumnConfig( + name="combination_type", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["obfuscation", "complication"]), + ) + ) + + # ── Stage 2: Candidate problem generation ──────────────────────────── + + config_builder.add_column( + dd.LLMStructuredColumnConfig( + name="problem_candidates", + prompt=CANDIDATE_GENERATION_PROMPT, + system_prompt=PROBLEM_SYSTEM_PROMPT, + output_format=NewProblemList, + model_alias=model_alias, + ) + ) + + # ── Stage 3: Best problem selection ────────────────────────────────── + + config_builder.add_column( + dd.LLMStructuredColumnConfig( + name="best_problem_json", + prompt=BEST_SELECTION_PROMPT, + system_prompt=PROBLEM_SYSTEM_PROMPT, + output_format=NewProblemWithReasoning, + model_alias=model_alias, + ) + ) + + config_builder.add_column( + dd.ExpressionColumnConfig( + name="new_problem", + expr="{{ best_problem_json.problem }}", + ) + ) + + # ── Stage 4: Evaluation ────────────────────────────────────────────── + + config_builder.add_column( + 
dd.LLMStructuredColumnConfig( + name="evals", + prompt=EVALUATION_PROMPT, + system_prompt=PROBLEM_SYSTEM_PROMPT, + output_format=NewProblemEvals, + model_alias=model_alias, + ) + ) + + # ── Stage 5: Solution generation ───────────────────────────────────── + # NOTE: The evals column above already contains difficulty, clarity, and + # adherence scores as structured fields (e.g. {{ evals.difficulty }}). + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="solution", + prompt="Solve the following problem: {{ new_problem }}", + system_prompt=SOLUTION_SYSTEM_PROMPT, + model_alias=model_alias, + ) + ) + + return config_builder + + +# ============================================================================= +# Dataset creation +# ============================================================================= + + +def create_dataset( + config_builder: dd.DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser(description="Nemotron Nano InfiniByte Recipe") + parser.add_argument("--model-alias", type=str, default="nvidia-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + parser.add_argument( + "--strategy", + type=str, + default="ocr_omr", + choices=list(STRATEGIES.keys()), + help="Cross-join strategy: which two datasets to combine (default: ocr_omr)", + ) + parser.add_argument( + "--limit", + type=int, + default=10_000, + help="Maximum number of cross-joined pairs to sample (default: 10000)", + ) + args = parser.parse_args() + + print("=== Step 1: Preparing seed data ===") + seed_path = prepare_seed_data(strategy=args.strategy, limit=args.limit) + + print("\n=== Step 2: 
Building pipeline and generating data ===") + config_builder = build_config(model_alias=args.model_alias, seed_path=seed_path) + results = create_dataset( + config_builder, + num_records=args.num_records, + artifact_path=args.artifact_path, + ) + + print(f"\nDataset saved to: {results.artifact_storage.final_dataset_path}") + results.load_analysis().to_report() diff --git a/docs/assets/recipes/model_usability/prompt_sensitivity.py b/docs/assets/recipes/model_usability/prompt_sensitivity.py new file mode 100644 index 000000000..4a4673e0e --- /dev/null +++ b/docs/assets/recipes/model_usability/prompt_sensitivity.py @@ -0,0 +1,528 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "data-designer", +# "pandas", +# ] +# /// +"""Nemotron Nano Prompt Sensitivity Recipe: Regex-Verified Preamble Generation + +Generate diverse prompt preambles for RL training that pair natural-language +instructions with regex-verified output format specifications. Each record +contains a paraphrased preamble, a format instruction aligned to a regex +pattern, and a composed user prompt -- all scored by four LLM judges. + +This recipe implements the preamble generation stage of the prompt sensitivity +pipeline used for Nemotron Nano training. The key idea: seed the pipeline with +10 regex-based answer formats (boxed, double-parens, angle brackets, XML tags, +etc.), then use an LLM to paraphrase both the instruction preamble and the +format specification while preserving the regex contract. Placement order +variants control where the preamble, format instruction, and {problem} +placeholder appear in the final prompt. 
+ +Pipeline architecture: + + ┌─────────────────────────────────────────────────────────────────────────┐ + │ SEED DATA: 10 regex formats x 30 preambles │ + │ Each row: format_key, output_regex, seed_preamble, │ + │ seed_format_instruction │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 1: DIVERSITY SAMPLERS (7 columns) │ + │ sentence_length, sentence_type, tone, strictness_level, │ + │ verbosity_level, domain_context, preamble_format_order │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 2: PREAMBLE GENERATION (LLM) │ + │ Paraphrase seed preamble respecting sampled controls. │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 3: FORMAT INSTRUCTION GENERATION (LLM) │ + │ Paraphrase format instruction preserving regex intent. │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 4: USER PROMPT COMPOSITION (LLM) │ + │ Assemble preamble + format instruction + {problem} placeholder. │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 5: QUALITY SCORING (4 LLM judges) │ + │ format_compliance (0-2), regex_alignment (0-1), │ + │ order_coherence (0-1), preamble_quality (0-3) │ + └────────────────────────────────────────────────────────────────────────┘ + +Prerequisites: + - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases. 
+ +Run: + uv run prompt_sensitivity.py + + uv run prompt_sensitivity.py --num-records 200 + + uv run prompt_sensitivity.py --help +""" + +from __future__ import annotations + +import tempfile +from pathlib import Path + +import pandas as pd + +import data_designer.config as dd +from data_designer.interface import DataDesigner, DatasetCreationResults + +# ============================================================================= +# Seed data: regex format templates + seed preambles +# ============================================================================= + +FORMAT_TEMPLATES = [ + { + "format_key": "fmt_00", + "output_regex": r"\boxed{([.*?])}", + "seed_format_instruction": "Make sure to put the answer (and only answer) inside \\boxed{}.", + }, + { + "format_key": "fmt_01", + "output_regex": r"\(\((.*?)\)\)", + "seed_format_instruction": "Your final answer (and only the answer) should be enclosed within double parentheses.", + }, + { + "format_key": "fmt_02", + "output_regex": r"Answer is\s*\[([A-Za-z])\]", + "seed_format_instruction": "Wrap answer in square brackets at the end: 'Answer is [X]'.", + }, + { + "format_key": "fmt_03", + "output_regex": r"\(Answer:\s*([A-Za-z])\)", + "seed_format_instruction": "Conclude with (Answer: X), where X is the final answer.", + }, + { + "format_key": "fmt_04", + "output_regex": r"Final Answer:\s*\|\|(.*?)\|\|", + "seed_format_instruction": "End your response with Final Answer: ||X||, where X is the final answer (and only the answer).", + }, + { + "format_key": "fmt_05", + "output_regex": r"\[Answer:\s*([A-Za-z])\)", + "seed_format_instruction": "Remember to end with [Answer: X], where X is the final answer.", + }, + { + "format_key": "fmt_06", + "output_regex": r"<<(.*?)>>", + "seed_format_instruction": "Put your final answer in double angle brackets.", + }, + { + "format_key": "fmt_07", + "output_regex": r"\s*\*\*(.*?)\*\*", + "seed_format_instruction": "Give the answer at the end in this format -> **X**, where X is 
final answer.", + }, + { + "format_key": "fmt_08", + "output_regex": r"\s*([.*?])\s*", + "seed_format_instruction": "Wrap your final answer in XML-style tags like this: X.", + }, + { + "format_key": "fmt_09", + "output_regex": r"\boxed{([.*?])}", + "seed_format_instruction": "The final answer must be in \\boxed{} format. It's crucial to follow this format.", + }, +] + +SEED_PREAMBLES = [ + "Solve the following problem step by step.", + "Provide a comprehensive solution to the problem below.", + "Think carefully and solve the following.", + "Analyze the problem and provide your answer.", + "Work through the following problem methodically.", + "Read the problem carefully and provide a detailed solution.", + "Consider all aspects of the problem before answering.", + "Break down the problem into steps and solve it.", + "Explain your reasoning as you solve the following.", + "Solve the problem, then provide the answer.", + "Provide your reasoning for the answer and state the final result.", + "Think step-by-step before giving your final answer.", + "Carefully analyze the following and provide your solution.", + "Show your work as you solve the following problem.", + "Reason through the problem below and give your answer.", + "Solve the following. Show your reasoning.", + "Provide a clear and complete solution.", + "Answer the following question with full explanation.", + "Walk through the solution step by step.", + "Read the following problem and solve it completely.", + "Give a thorough answer to the problem below.", + "Determine the answer to the following problem.", + "Present your solution with supporting reasoning.", + "Evaluate the following and provide your answer.", + "Think about the problem below and respond with your solution.", + "Provide a well-reasoned answer to the following.", + "Solve this problem. 
Explain each step.", + "Work out the answer to the following question.", + "Address the problem below with a complete solution.", + "Respond to the following with a clear answer.", +] + +# ============================================================================= +# Placement order variants +# ============================================================================= + +PLACEMENT_ORDERS = [ + "P + F + {problem}", + "F + P + {problem}", + "P + {problem} + F", + "F + {problem} + P", + "{problem} + P + F", + "{problem} + F + P", + "PF + {problem}", + "{problem} + PF", +] + +# ============================================================================= +# LLM prompts +# ============================================================================= + +PREAMBLE_GEN_PROMPT = """\ +You are rewriting a seed preamble for an open-ended question prompt. + +Seed preamble: {{ seed_preamble }} + +Constraints: +- Sentence length: {{ sentence_length }} +- Sentence type: {{ sentence_type }} +- Tone: {{ tone }} +- Strictness: {{ strictness_level }} +- Verbosity: {{ verbosity_level }} +- Domain: {{ domain_context }} + +Instructions: +- Paraphrase the seed preamble (do NOT copy it verbatim). +- Produce a concise instruction line for generic open-ended questions. +- Keep it neutral and generic; do NOT include output formatting requirements. +- Respect the constraints above (length, type, tone, strictness, verbosity). +- Output ONLY the rewritten preamble, nothing else. +""" + +FORMAT_INSTRUCTION_GEN_PROMPT = """\ +You are rewriting a format instruction that tells the user how to present their final answer. + +Seed format instruction: {{ seed_format_instruction }} +Output regex pattern: {{ output_regex }} + +Constraints: +- Sentence length: {{ sentence_length }} +- Tone: {{ tone }} + +Instructions: +- Paraphrase the seed format instruction while preserving its intent. +- The instruction must unambiguously specify how the final answer should be formatted. 
+- The answer must be required at the end of the response. +- Do NOT refer to the type of answer (sentence, paragraph, math expression). +- Respect the sentence length and tone constraints. +- Output ONLY the rewritten format instruction, nothing else. +""" + +USER_PROMPT_COMPOSITION_PROMPT = """\ +Compose a final user prompt from the following parts. + +Preamble (P): {{ preamble }} +Format instruction (F): {{ format_instruction }} +Placement order: {{ preamble_format_order }} + +Instructions: +- Concatenate the parts in the order specified by "Placement order". +- Use {problem} as a literal placeholder for the question text. +- For "PF" or "FP" merged orders, combine P and F into a single natural sentence. +- Ensure {problem} has newlines before and after it for readability. +- Preserve the exact text of P and F; do NOT abbreviate or add new content. +- Output ONLY the composed user prompt, nothing else. +""" + +# ============================================================================= +# Judge rubrics +# ============================================================================= + +FORMAT_COMPLIANCE_SCORES = [ + dd.Score( + name="Format Compliance", + description="Does the format instruction unambiguously enforce the intended output format and require the answer at end of response?", + options={ + "2": "Explicit, unambiguous, requires ending with answer in specified format.", + "1": "Mentions format but leaves room for trailing text after the answer.", + "0": "Ambiguous, doesn't mention format, or specifies an alternative format.", + }, + ), +] + +REGEX_ALIGNMENT_SCORES = [ + dd.Score( + name="Regex Alignment", + description="Does the format instruction semantically and structurally align with the output_regex pattern?", + options={ + "1": "Instruction matches the regex pattern intent.", + "0": "Instruction conflicts with or deviates from the regex intent.", + }, + ), +] + +ORDER_COHERENCE_SCORES = [ + dd.Score( + name="Order Coherence", + description="Is 
the composed user prompt coherent with respect to the ordering of preamble, format instruction, and {problem} placeholder?", + options={ + "1": "Makes sense given the part ordering.", + "0": "Confusing or contradictory ordering.", + }, + ), +] + +PREAMBLE_QUALITY_SCORES = [ + dd.Score( + name="Preamble Quality", + description="Assess the preamble for clarity, concision, generic tone, and adherence to the sampled controls.", + options={ + "3": "Clear, concise, generic, adheres to all controls.", + "2": "Good with minor issues in tone or length.", + "1": "Fair with noticeable issues.", + "0": "Poor, unclear, or conflicts with controls.", + }, + ), +] + +# ============================================================================= +# Judge prompts +# ============================================================================= + +FORMAT_COMPLIANCE_JUDGE_PROMPT = """\ +Evaluate the format instruction for compliance. + +Format instruction: {{ format_instruction }} +Output regex: {{ output_regex }} +Seed format instruction: {{ seed_format_instruction }} +""" + +REGEX_ALIGNMENT_JUDGE_PROMPT = """\ +Evaluate whether the format instruction aligns with the regex pattern. + +Format instruction: {{ format_instruction }} +Output regex: {{ output_regex }} +""" + +ORDER_COHERENCE_JUDGE_PROMPT = """\ +Evaluate whether the composed user prompt is coherent given the placement order. + +User prompt: {{ user_prompt }} +Preamble: {{ preamble }} +Format instruction: {{ format_instruction }} +Placement order: {{ preamble_format_order }} +""" + +PREAMBLE_QUALITY_JUDGE_PROMPT = """\ +Evaluate the preamble for quality. 
+ +Preamble: {{ preamble }} +Seed preamble: {{ seed_preamble }} +Sentence length: {{ sentence_length }} +Sentence type: {{ sentence_type }} +Tone: {{ tone }} +Strictness: {{ strictness_level }} +Verbosity: {{ verbosity_level }} +""" + + +# ============================================================================= +# Seed data builder +# ============================================================================= + + +def build_seed_dataframe() -> pd.DataFrame: + """Build the seed DataFrame as the cross product of formats x preambles.""" + rows = [] + for fmt in FORMAT_TEMPLATES: + for preamble in SEED_PREAMBLES: + rows.append( + { + "format_key": fmt["format_key"], + "output_regex": fmt["output_regex"], + "seed_format_instruction": fmt["seed_format_instruction"], + "seed_preamble": preamble, + } + ) + return pd.DataFrame(rows) + + +# ============================================================================= +# Pipeline builder +# ============================================================================= + + +def build_config(model_alias: str) -> tuple[dd.DataDesignerConfigBuilder, Path]: + config_builder = dd.DataDesignerConfigBuilder() + + # ── Seed data ──────────────────────────────────────────────────────── + seed_df = build_seed_dataframe() + seed_path = Path(tempfile.mkdtemp()) / "prompt_sensitivity_seed.csv" + seed_df.to_csv(seed_path, index=False) + + config_builder.with_seed_dataset( + dd.LocalFileSeedSource(path=str(seed_path)), + sampling_strategy=dd.SamplingStrategy.SHUFFLE, + ) + + # ── Stage 1: Diversity samplers ────────────────────────────────────── + + config_builder.add_column( + dd.SamplerColumnConfig( + name="sentence_length", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["short", "medium"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="sentence_type", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["imperative", "declarative", 
"interrogative"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="tone", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["formal", "neutral", "concise", "informal", "strict"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="strictness_level", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["low", "medium", "high"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="verbosity_level", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["concise", "standard"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="domain_context", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["general"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="preamble_format_order", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=PLACEMENT_ORDERS), + ) + ) + + # ── Stage 2: Preamble generation ───────────────────────────────────── + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="preamble", + prompt=PREAMBLE_GEN_PROMPT, + model_alias=model_alias, + ) + ) + + # ── Stage 3: Format instruction generation ─────────────────────────── + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="format_instruction", + prompt=FORMAT_INSTRUCTION_GEN_PROMPT, + model_alias=model_alias, + ) + ) + + # ── Stage 4: User prompt composition ───────────────────────────────── + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="user_prompt", + prompt=USER_PROMPT_COMPOSITION_PROMPT, + model_alias=model_alias, + ) + ) + + # ── Stage 5: Quality scoring ───────────────────────────────────────── + + config_builder.add_column( + dd.LLMJudgeColumnConfig( + name="format_compliance_result", + prompt=FORMAT_COMPLIANCE_JUDGE_PROMPT, + scores=FORMAT_COMPLIANCE_SCORES, + model_alias=model_alias, + ) + ) + + 
config_builder.add_column( + dd.LLMJudgeColumnConfig( + name="regex_alignment_result", + prompt=REGEX_ALIGNMENT_JUDGE_PROMPT, + scores=REGEX_ALIGNMENT_SCORES, + model_alias=model_alias, + ) + ) + + config_builder.add_column( + dd.LLMJudgeColumnConfig( + name="order_coherence_result", + prompt=ORDER_COHERENCE_JUDGE_PROMPT, + scores=ORDER_COHERENCE_SCORES, + model_alias=model_alias, + ) + ) + + config_builder.add_column( + dd.LLMJudgeColumnConfig( + name="preamble_quality_result", + prompt=PREAMBLE_QUALITY_JUDGE_PROMPT, + scores=PREAMBLE_QUALITY_SCORES, + model_alias=model_alias, + ) + ) + + return config_builder, seed_path + + +# ============================================================================= +# Dataset creation +# ============================================================================= + + +def create_dataset( + config_builder: dd.DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser(description="Nemotron Nano Prompt Sensitivity Recipe") + parser.add_argument("--model-alias", type=str, default="nvidia-text") + parser.add_argument("--num-records", type=int, default=10) + parser.add_argument("--artifact-path", type=str, default=None) + args = parser.parse_args() + + config_builder, _seed_path = build_config(model_alias=args.model_alias) + results = create_dataset( + config_builder, + num_records=args.num_records, + artifact_path=args.artifact_path, + ) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + results.load_analysis().to_report() diff --git a/docs/assets/recipes/model_usability/structured_data.py b/docs/assets/recipes/model_usability/structured_data.py new file mode 100644 index 000000000..d0a87737a --- 
/dev/null +++ b/docs/assets/recipes/model_usability/structured_data.py @@ -0,0 +1,434 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "data-designer", +# ] +# /// +"""Nemotron Nano Structured Data Recipe: Multi-Format Schema Generation + +Generate synthetic structured data across multiple output formats (JSON, YAML, +XML, Markdown) with controlled schema complexity, conversational grounding, +and best-of-3 candidate generation. + +This recipe implements the pipeline used to produce structured-data SFT records +for Nemotron Nano training. Each record contains a generated schema, a natural +user request, grounding Q&A conversation pairs, and three candidate structured +outputs that conform to the schema. + +Pipeline architecture: + + ┌─────────────────────────────────────────────────────────────────────────┐ + │ STAGE 1: SAMPLING (DIVERSITY CONTROLS) │ + │ │ + │ Format Controls Schema Controls Conversation Controls │ + │ ├─ output_format ├─ schema_rigidity ├─ num_turns │ + │ └─ topic (conditional) ├─ schema_fields_count ├─ tone │ + │ ├─ schema_complexity └─ detail_level │ + │ └─ nesting_depth │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 2: SCHEMA GENERATION (LLM) │ + │ Generates format-specific schema from sampled controls + topic. │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 3: USER PROMPT GENERATION (LLM) │ + │ Natural-language request matching tone and detail level. │ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 4: CONVERSATION PAIRS (LLM) │ + │ Q&A pairs covering schema facts for grounding. 
│ + ├────────────────────────────────────────────────────────────────────────┤ + │ STAGE 5: STRUCTURED OUTPUT (LLM, best-of-3) │ + │ Three candidate structured outputs conforming to the schema. │ + └────────────────────────────────────────────────────────────────────────┘ + +Prerequisites: + - NVIDIA_API_KEY environment variable for NVIDIA provider model aliases. + +Run: + uv run structured_data.py + + uv run structured_data.py --num-records 100 --output-format json + + uv run structured_data.py --help +""" + +from __future__ import annotations + +from pathlib import Path + +import data_designer.config as dd +from data_designer.interface import DataDesigner, DatasetCreationResults + +# ============================================================================= +# Topics: representative subset of categories and subtopics +# ============================================================================= + +TOPICS: dict[str, list[str]] = { + "Leisure Activities": [ + "Outdoor Recreation", + "Board Games", + "DIY Crafts", + "Photography", + "Gardening", + ], + "Daily Life": [ + "Morning Routines", + "Grocery Shopping", + "Commuting", + "Household Chores", + "Meal Planning", + ], + "Education and Learning": [ + "Online Courses", + "Study Techniques", + "Language Learning", + "STEM Education", + "Library Systems", + ], + "Technology and Gadgets": [ + "Smartphones", + "Smart Home Devices", + "Wearable Tech", + "Cloud Computing", + "Cybersecurity Basics", + ], + "Health and Wellness": [ + "Nutrition Planning", + "Mental Health", + "Exercise Routines", + "Sleep Hygiene", + "Preventive Care", + ], + "Finance and Money": [ + "Personal Budgeting", + "Investment Basics", + "Tax Preparation", + "Credit Management", + "Retirement Planning", + ], + "Food and Cooking": [ + "Baking Techniques", + "Meal Prep", + "International Cuisines", + "Dietary Restrictions", + "Kitchen Equipment", + ], + "Travel and Transportation": [ + "Trip Planning", + "Public Transit", + "Road Trips", + 
"Travel Insurance", + "Packing Strategies", + ], + "Arts and Culture": [ + "Music Theory", + "Film Analysis", + "Theater Production", + "Contemporary Art", + "Creative Writing", + ], + "Work and Careers": [ + "Resume Building", + "Interview Preparation", + "Remote Work", + "Project Management", + "Career Transitions", + ], +} + +# ============================================================================= +# Prompts +# ============================================================================= + +SCHEMA_GENERATION_PROMPT = """\ +Create a schema for a structured object response in the format {{ output_format }}. + +Controls: +- Rigidity: {{ schema_rigidity }} +- Top-level properties: {{ schema_fields_count }} +- Complexity: {{ schema_complexity }} +- Nesting depth: {{ nesting_depth }} +- Topic: {{ topic_category }} / {{ topic_subtopic }} + +Instructions: +- Output only an object with keys: "name", "schema", and "strict", formatted as \ +{{ output_format }}. +- "name" must be appropriate with the Topic: {{ topic_category }} / {{ topic_subtopic }} +- "schema" should be a valid structured schema as specified in {{ output_format }}. +- Use {{ schema_fields_count }} top-level properties, relevant to the topic. +- Include at least one boolean and, if appropriate, one enum. +- All top-level properties must be listed in "required". +- Set "additionalProperties": false at every object level. +- If {{ schema_complexity }} is "complex", make the schema deeply nested: at least two \ +levels of nested objects, with at least one object nested three levels deep. Keep nesting \ +relevant to the topic. +- If "simple", keep nesting minimal or flat. +- "strict" must be true. + +Formatting by output_format: +- "json": Output a valid JSON object, no code fences or comments. +- "yaml": Output a valid YAML object, no code fences or comments. +- "xml": Output a valid XML document with root "root" and child elements "name", "schema", \ +and "strict". 
"schema" can be a string or nested XML. +- "markdown": Output a Markdown code block with the JSON object, using triple backticks and \ +"json" as the language, no extra text. + +Output only the object in the specified format. No explanations or extra text. +""" + +USER_PROMPT_GENERATION = """\ +You are a human user asking an AI assistant to produce a structured output. Write a natural, \ +concise request that would lead to filling in a schema about {{ topic_category }} / \ +{{ topic_subtopic }}. + +The request should: +- Sound like something a real person would type or say +- Describe what data they want without exposing the schema itself +- Mention the desired output format: {{ output_format }} +- Match the tone: {{ tone }} and detail level: {{ detail_level }} + +Do not include the schema, code fences, or technical formatting. Just the user request. +""" + +CONVERSATION_PROMPT = """\ +Write a short Q&A conversation about the following topic. Follow the selected JSON Schema \ +fields as the underlying facts to cover, but DO NOT output JSON here. + +Topic context: +- Category: {{ topic_category }} +- Subtopic: {{ topic_subtopic }} + +Constraints: +- Number of Q&A pairs: {{ num_turns }} +- Tone: {{ tone }} +- Detail level: {{ detail_level }} + +Write alternating question/answer pairs that make these facts unambiguous for the chosen \ +schema: {{ structured_schema }} +Return only a Python list of [question, answer] pairs (no extra text). +""" + +STRUCTURED_OUTPUT_PROMPT = """\ +You will produce a {{ output_format }} instance that conforms strictly to the following \ +schema (no extra keys). + +Schema: +{{ structured_schema }} + +You are given a Python list of [question, answer] pairs: +{{ conversation_pairs }} + +Instructions: +- Derive values only from the answers given. +- Render ONLY the {{ output_format }} instance, with no commentary. +- Formatting rules: + - If output_format is "json", output a single JSON object (no code fences). 
+  - If output_format is "yaml", output a YAML mapping (no code fences).
+  - If output_format is "xml", output an XML document with root "root".
+  - If output_format is "markdown", output a fenced code block with ```json.
+- Ensure the content validates against the schema when parsed back to JSON.
+"""
+
+# =============================================================================
+# Supported output formats
+# =============================================================================
+
+OUTPUT_FORMATS = ["json", "yaml", "xml", "markdown"]
+
+
+# =============================================================================
+# Pipeline builder
+# =============================================================================
+
+
+def build_config(
+    model_alias: str,
+    output_format: str | None = None,
+) -> dd.DataDesignerConfigBuilder:
+    config_builder = dd.DataDesignerConfigBuilder()
+
+    # ── Stage 1: Sampling ────────────────────────────────────────────────
+
+    config_builder.add_column(
+        dd.SamplerColumnConfig(
+            name="record_id",
+            sampler_type=dd.SamplerType.UUID,
+            params=dd.UUIDSamplerParams(prefix="SD-", short_form=True, uppercase=True),
+        )
+    )
+
+    formats = [output_format] if output_format else OUTPUT_FORMATS
+    config_builder.add_column(
+        dd.SamplerColumnConfig(
+            name="output_format",
+            sampler_type=dd.SamplerType.CATEGORY,
+            params=dd.CategorySamplerParams(values=formats),
+        )
+    )
+
+    config_builder.add_column(
+        dd.SamplerColumnConfig(
+            name="topic_category",
+            sampler_type=dd.SamplerType.CATEGORY,
+            params=dd.CategorySamplerParams(values=list(TOPICS.keys())),
+        )
+    )
+
+    config_builder.add_column(
+        dd.SamplerColumnConfig(
+            name="topic_subtopic",
+            sampler_type=dd.SamplerType.SUBCATEGORY,
+            params=dd.SubcategorySamplerParams(
+                category="topic_category",
+                values=TOPICS,
+            ),
+        )
+    )
+
+    config_builder.add_column(
+        dd.SamplerColumnConfig(
+            name="schema_rigidity",
+            sampler_type=dd.SamplerType.CATEGORY,
+            
params=dd.CategorySamplerParams(values=["strict", "moderate"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="schema_fields_count", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["3", "4", "5", "6", "7", "8", "9", "10", "11", "12"], + ), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="schema_complexity", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["simple", "complex"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="nesting_depth", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["1", "2", "3", "4"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="num_turns", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["2", "3", "4", "5", "6", "7", "8"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="tone", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams(values=["neutral", "enthusiastic", "factual"]), + ) + ) + + config_builder.add_column( + dd.SamplerColumnConfig( + name="detail_level", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["brief", "standard", "detailed", "super verbose"], + ), + ) + ) + + # ── Stage 2: Schema generation ─────────────────────────────────────── + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="structured_schema", + prompt=SCHEMA_GENERATION_PROMPT, + model_alias=model_alias, + ) + ) + + # ── Stage 3: User prompt generation ────────────────────────────────── + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="user_prompt", + prompt=USER_PROMPT_GENERATION, + model_alias=model_alias, + ) + ) + + # ── Stage 4: Conversation pairs ────────────────────────────────────── + + config_builder.add_column( + dd.LLMTextColumnConfig( + name="conversation_pairs", + prompt=CONVERSATION_PROMPT, + 
model_alias=model_alias, + ) + ) + + # ── Stage 5: Best-of-3 structured output ───────────────────────────── + + for i in range(3): + config_builder.add_column( + dd.LLMTextColumnConfig( + name=f"structured_output_{i}", + prompt=STRUCTURED_OUTPUT_PROMPT, + model_alias=model_alias, + ) + ) + + return config_builder + + +# ============================================================================= +# Dataset creation +# ============================================================================= + + +def create_dataset( + config_builder: dd.DataDesignerConfigBuilder, + num_records: int, + artifact_path: Path | str | None = None, +) -> DatasetCreationResults: + data_designer = DataDesigner(artifact_path=artifact_path) + results = data_designer.create(config_builder, num_records=num_records) + return results + + +if __name__ == "__main__": + from argparse import ArgumentParser + + parser = ArgumentParser(description="Nemotron Nano Structured Data Recipe") + parser.add_argument("--model-alias", type=str, default="nvidia-text") + parser.add_argument("--num-records", type=int, default=5) + parser.add_argument("--artifact-path", type=str, default=None) + parser.add_argument( + "--output-format", + type=str, + default=None, + choices=OUTPUT_FORMATS, + help="Generate for a single output format (default: all formats)", + ) + args = parser.parse_args() + + config_builder = build_config( + model_alias=args.model_alias, + output_format=args.output_format, + ) + results = create_dataset( + config_builder, + num_records=args.num_records, + artifact_path=args.artifact_path, + ) + + print(f"Dataset saved to: {results.artifact_storage.final_dataset_path}") + results.load_analysis().to_report() diff --git a/docs/recipes/cards.md b/docs/recipes/cards.md index ef2752936..5f85fc27b 100644 --- a/docs/recipes/cards.md +++ b/docs/recipes/cards.md @@ -172,6 +172,60 @@ Each recipe is a self-contained example that can be run independently. 
[:material-book-open-page-variant: View Recipe](mcp_and_tooluse/search_agent.md){ .md-button } [Download Code :octicons-download-24:](../assets/recipes/mcp_and_tooluse/search_agent.py){ .md-button download="search_agent.py" } +- :material-code-json:{ .lg .middle } **Nemotron Nano Structured Data** + + Generate multi-format structured data (JSON, YAML, XML, Markdown) with controlled schema complexity, conversational grounding, and best-of-3 candidate generation. + + --- + + **Demonstrates:** + + - Multi-format structured output generation + - Conditional topic sampling with SubcategorySamplerParams + - Multi-stage LLM pipeline (schema, prompt, conversation, output) + - Best-of-3 candidate generation + + --- + + [:material-book-open-page-variant: View Recipe](model_usability/structured_data.md){ .md-button } + [Download Code :octicons-download-24:](../assets/recipes/model_usability/structured_data.py){ .md-button download="structured_data.py" } + +- :material-format-list-text:{ .lg .middle } **Nemotron Nano Prompt Sensitivity** + + Generate diverse prompt preambles for RL training with regex-verified output format specifications, LLM paraphrasing, placement order variants, and four LLM judges. + + --- + + **Demonstrates:** + + - Seed dataset with format templates and preambles + - LLM paraphrasing with diversity controls + - Regex-aligned format instruction generation + - 4 LLM judges (format compliance, regex alignment, order coherence, preamble quality) + + --- + + [:material-book-open-page-variant: View Recipe](model_usability/prompt_sensitivity.md){ .md-button } + [Download Code :octicons-download-24:](../assets/recipes/model_usability/prompt_sensitivity.py){ .md-button download="prompt_sensitivity.py" } + +- :material-infinity:{ .lg .middle } **Nemotron Nano InfiniByte** + + Generate more diverse and complex problems by cross-breeding two HuggingFace datasets, augmenting problems through obfuscation or complication with structured LLM evaluation. 
+ + --- + + **Demonstrates:** + + - Cross-source dataset seeding from HuggingFace + - LLMStructuredColumnConfig with Pydantic models + - Multi-stage structured generation (candidates, selection, evaluation) + - Expression columns for score extraction + + --- + + [:material-book-open-page-variant: View Recipe](code_generation/infinibyte.md){ .md-button } + [Download Code :octicons-download-24:](../assets/recipes/code_generation/infinibyte.py){ .md-button download="infinibyte.py" } + - :material-file-document-multiple:{ .lg .middle } **Markdown Section Seed Reader** Define a custom `FileSystemSeedReader` inline and turn Markdown files into one seed row per heading section. diff --git a/docs/recipes/code_generation/infinibyte.md b/docs/recipes/code_generation/infinibyte.md new file mode 100644 index 000000000..662757bc9 --- /dev/null +++ b/docs/recipes/code_generation/infinibyte.md @@ -0,0 +1,9 @@ +# Nemotron Nano InfiniByte + +Generate more diverse and complex training problems by cross-breeding two source datasets. Cross-joins coding and math/science problems, then uses an LLM to create augmented problems through obfuscation or complication, with structured evaluation and solution generation. Used for Nemotron Nano post-training data. + +[Download Code :octicons-download-24:](../../assets/recipes/code_generation/infinibyte.py){ .md-button download="infinibyte.py" } + +```python +--8<-- "assets/recipes/code_generation/infinibyte.py" +``` diff --git a/docs/recipes/model_usability/prompt_sensitivity.md b/docs/recipes/model_usability/prompt_sensitivity.md new file mode 100644 index 000000000..92db61353 --- /dev/null +++ b/docs/recipes/model_usability/prompt_sensitivity.md @@ -0,0 +1,9 @@ +# Nemotron Nano Prompt Sensitivity + +Generate diverse prompt preambles for RL training that pair natural-language instructions with regex-verified output format specifications. 
Uses seed format templates, LLM paraphrasing, placement order variants, and four LLM judges to produce high-quality, format-compliant preambles. Used for Nemotron Nano prompt sensitivity training. + +[Download Code :octicons-download-24:](../../assets/recipes/model_usability/prompt_sensitivity.py){ .md-button download="prompt_sensitivity.py" } + +```python +--8<-- "assets/recipes/model_usability/prompt_sensitivity.py" +``` diff --git a/docs/recipes/model_usability/structured_data.md b/docs/recipes/model_usability/structured_data.md new file mode 100644 index 000000000..aace7f064 --- /dev/null +++ b/docs/recipes/model_usability/structured_data.md @@ -0,0 +1,9 @@ +# Nemotron Nano Structured Data + +Generate multi-format structured data (JSON, YAML, XML, Markdown) with controlled schema complexity, conversational grounding, and best-of-3 candidate generation. Used for Nemotron Nano structured-output SFT training. + +[Download Code :octicons-download-24:](../../assets/recipes/model_usability/structured_data.py){ .md-button download="structured_data.py" } + +```python +--8<-- "assets/recipes/model_usability/structured_data.py" +``` diff --git a/mkdocs.yml b/mkdocs.yml index 464491d4f..4ace5081f 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -46,6 +46,10 @@ nav: - Text to Python: recipes/code_generation/text_to_python.md - Text to SQL: recipes/code_generation/text_to_sql.md - "Nemotron Super Text to SQL": recipes/code_generation/enterprise_text_to_sql.md + - "Nemotron Nano InfiniByte": recipes/code_generation/infinibyte.md + - Model Usability: + - "Nemotron Nano Structured Data": recipes/model_usability/structured_data.md + - "Nemotron Nano Prompt Sensitivity": recipes/model_usability/prompt_sensitivity.md - Plugin Development: - Markdown Section Seed Reader Plugin: recipes/plugin_development/markdown_seed_reader.md - QA and Chat: