fix(compiler): bound per-concept prompt size to prevent blowup (#39) (#47)

ethanj · web-flow · commit a4f68181ff73 · 2026-04-28T18:49:05.000-07:00
diff --git a/src/compiler/index.ts b/src/compiler/index.ts
@@ -44,6 +44,7 @@ import {
 import { markOrphaned, orphanUnownedFrozenPages } from "./orphan.js";
 import { resolveLinks } from "./resolver.js";
 import { generateIndex } from "./indexgen.js";
+import { buildBudgetedCombinedContent, type SourceSlice } from "./prompt-budget.js";
 import { addObsidianMeta, generateMOC } from "./obsidian.js";
 import { updateEmbeddings } from "../utils/embeddings.js";
 import { writeCandidate } from "./candidates.js";
@@ -462,12 +463,19 @@ export function reconcileConceptMetadata(
  * contributing material rather than just the last source processed.
  * Metadata is reconciled across all contributing concepts via
  * reconcileConceptMetadata so contradictions from later sources are not lost.
+ *
+ * Combined content is then run through {@link buildBudgetedCombinedContent}
+ * so popular concepts that appear in many overlapping sources do not blow
+ * past the LLM provider's context window (issue #39). When the raw total
+ * fits the budget, the output is byte-identical to the previous unbudgeted
+ * concatenation.
  */
 function mergeExtractions(
   extractions: ExtractionResult[],
   frozenSlugs: Set<string>,
 ): MergedConcept[] {
   const bySlug = new Map<string, MergedConcept>();
+  const slicesBySlug = new Map<string, SourceSlice[]>();
 
   for (const result of extractions) {
     if (result.concepts.length === 0) continue;
@@ -480,18 +488,30 @@ function mergeExtractions(
       if (existing) {
         existing.concept = reconcileConceptMetadata(existing.concept, concept);
         existing.sourceFiles.push(result.sourceFile);
-        existing.combinedContent += `\n\n--- SOURCE: ${result.sourceFile} ---\n\n${result.sourceContent}`;
       } else {
         bySlug.set(slug, {
           slug,
           concept,
           sourceFiles: [result.sourceFile],
-          combinedContent: `--- SOURCE: ${result.sourceFile} ---\n\n${result.sourceContent}`,
+          combinedContent: "",
         });
+        slicesBySlug.set(slug, []);
       }
+      slicesBySlug.get(slug)!.push({
+        file: result.sourceFile,
+        content: result.sourceContent,
+      });
     }
   }
 
+  for (const merged of bySlug.values()) {
+    const slices = slicesBySlug.get(merged.slug) ?? [];
+    merged.combinedContent = buildBudgetedCombinedContent(
+      merged.concept.concept,
+      slices,
+    );
+  }
+
   return Array.from(bySlug.values());
 }
 
diff --git a/src/compiler/prompt-budget.ts b/src/compiler/prompt-budget.ts
@@ -0,0 +1,105 @@
+/**
+ * Per-concept prompt-budget enforcement (issue #39).
+ *
+ * When the same concept is extracted from many overlapping sources, the
+ * page-generation prompt would otherwise concatenate every full source
+ * — linear in source count — and reliably blow past the LLM provider's
+ * context window. This module clips each contributing source's slice to
+ * a fair share of a configurable total budget and emits a single warning
+ * when truncation kicks in.
+ *
+ * The fix is deliberately defensive (equal-share truncation) rather than
+ * smart (semantic ranking / summarisation). It prevents crashes while a
+ * deeper retrieval-driven solution is designed.
+ */
+
+import * as output from "../utils/output.js";
+import {
+  DEFAULT_PROMPT_BUDGET_CHARS,
+  PROMPT_BUDGET_ENV_VAR,
+} from "../utils/constants.js";
+
+/**
+ * Marker appended to a source slice's content when that slice was truncated
+ * to fit the budget, so the cut is visible in the combined prompt text.
+ */
+const TRUNCATION_MARKER = "\n\n[…truncated for prompt budget — see #39…]";
+
+/** A single source's contribution to the combined per-concept content. */
+export interface SourceSlice {
+  /**
+   * Source filename (e.g. "ml-paper.md"); rendered inside the
+   * `--- SOURCE: <file> ---` section header of the prompt.
+   */
+  file: string;
+  /**
+   * Raw extracted source content, before any budgeting. Its `length` is what
+   * is summed and compared against the prompt budget.
+   */
+  content: string;
+}
+
+/**
+ * Resolve the active prompt-budget character cap.
+ *
+ * Reads the env var named by `PROMPT_BUDGET_ENV_VAR` and falls back to
+ * `DEFAULT_PROMPT_BUDGET_CHARS` when it is unset, empty, or anything other
+ * than a plain positive decimal integer (surrounding whitespace is allowed).
+ *
+ * Validation is deliberately strict: `Number.parseInt` stops at the first
+ * non-digit, so "1e6" would parse as 1 and "200k" as 200 — a typo would then
+ * silently truncate every prompt to almost nothing instead of being ignored.
+ *
+ * @returns The budget in characters; either the validated override or the default.
+ */
+export function resolvePromptBudgetChars(): number {
+  const raw = process.env[PROMPT_BUDGET_ENV_VAR]?.trim();
+  // Digits only: rejects signs, decimals, exponents, and unit suffixes.
+  if (!raw || !/^\d+$/.test(raw)) return DEFAULT_PROMPT_BUDGET_CHARS;
+  const parsed = Number(raw);
+  // Guards "0" and digit strings too large to be represented exactly.
+  if (!Number.isSafeInteger(parsed) || parsed <= 0) return DEFAULT_PROMPT_BUDGET_CHARS;
+  return parsed;
+}
+
+/**
+ * Combine per-source slices into the single content blob the LLM prompt
+ * receives, applying an equal-share budget when the raw total would exceed
+ * the configured ceiling. When no truncation is needed the output is
+ * byte-identical to the previous unbudgeted concatenation, so existing
+ * compile output is unchanged for typical workloads.
+ *
+ * Only `content` lengths count toward the budget; the `--- SOURCE: ---`
+ * headers and the truncation marker are added on top, so the returned
+ * string can be somewhat longer than the budget itself.
+ *
+ * A cut is never placed between the two halves of a UTF-16 surrogate pair:
+ * a lone surrogate is not well-formed Unicode and can make the serialised
+ * request invalid, so the cut moves back one code unit in that case.
+ *
+ * @param concept - Human-readable concept title (used in the warning only).
+ * @param slices - One entry per contributing source, in arrival order.
+ * @returns The combined content string suitable for buildPagePrompt.
+ */
+export function buildBudgetedCombinedContent(
+  concept: string,
+  slices: SourceSlice[],
+): string {
+  const budget = resolvePromptBudgetChars();
+  const totalRaw = slices.reduce((sum, s) => sum + s.content.length, 0);
+
+  // Fast path. An empty slice list has totalRaw === 0 and returns here, so
+  // the division below never sees zero sources.
+  if (totalRaw <= budget) {
+    return formatSlices(slices);
+  }
+
+  // Equal share per source; the floor of 1 keeps a tiny budget from
+  // collapsing every slice to nothing.
+  const perSource = Math.max(1, Math.floor(budget / slices.length));
+  warnTruncation(concept, totalRaw, slices.length, perSource, budget);
+
+  const trimmed = slices.map((s) => {
+    // Sources already within their share pass through untouched.
+    if (s.content.length <= perSource) return s;
+    const cut = splitsSurrogatePair(s.content, perSource) ? perSource - 1 : perSource;
+    return { ...s, content: s.content.slice(0, cut) + TRUNCATION_MARKER };
+  });
+  return formatSlices(trimmed);
+}
+
+/** True when cutting `text` at `index` would separate a high surrogate from its low surrogate. */
+function splitsSurrogatePair(text: string, index: number): boolean {
+  const before = text.charCodeAt(index - 1);
+  const after = text.charCodeAt(index);
+  return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
+}
+
+/**
+ * Serialise slices into the prompt body: each slice becomes a
+ * `--- SOURCE: <file> ---` header, a blank line, then its content, with a
+ * blank line between consecutive sections.
+ */
+function formatSlices(slices: SourceSlice[]): string {
+  const sections: string[] = [];
+  for (const { file, content } of slices) {
+    sections.push(`--- SOURCE: ${file} ---\n\n${content}`);
+  }
+  return sections.join("\n\n");
+}
+
+/**
+ * Report, as one warning line, that a concept's combined sources exceeded
+ * the prompt budget and are being clipped, and name the env var that raises
+ * the limit.
+ *
+ * @param concept - Concept title quoted in the message.
+ * @param totalRaw - Combined length of all source contents before clipping.
+ * @param sourceCount - Number of contributing sources.
+ * @param perSource - Character share each source is clipped to.
+ * @param budget - The budget that was exceeded.
+ */
+function warnTruncation(
+  concept: string,
+  totalRaw: number,
+  sourceCount: number,
+  perSource: number,
+  budget: number,
+): void {
+  const rawSize = totalRaw.toLocaleString();
+  const cap = budget.toLocaleString();
+  const share = perSource.toLocaleString();
+  const message = [
+    `Combined source content for "${concept}" (${rawSize} chars across `,
+    `${sourceCount} sources) exceeds the ${cap}-char prompt budget; `,
+    `truncating each source to ~${share} chars. `,
+    `Raise via ${PROMPT_BUDGET_ENV_VAR} when running against larger-context models.`,
+  ].join("");
+  output.status("!", output.warn(message));
+}
diff --git a/src/utils/constants.ts b/src/utils/constants.ts
@@ -9,6 +9,25 @@ export const MAX_SOURCE_CHARS = 100_000;
 /** Minimum source content length to ingest without a warning. */
 export const MIN_SOURCE_CHARS = 50;
 
+/**
+ * Default character budget for the combined source content sent to the LLM
+ * during page generation for a single concept (issue #39).
+ *
+ * Caps the per-prompt content at ~200,000 chars (~50k tokens). When the
+ * combined raw size of the sources contributing to a concept exceeds this
+ * budget, each source is clipped to an equal share of it (budget divided by
+ * the number of sources; sources already within their share are left
+ * intact) so the prompt fits the model's context window. Without this cap,
+ * popular concepts that appear in many overlapping documents reliably blow
+ * past the LLM provider's context limit and the compile crashes.
+ *
+ * Override via the LLMWIKI_PROMPT_BUDGET_CHARS env var when running against
+ * larger-context (raise) or smaller-context (lower) models.
+ */
+export const DEFAULT_PROMPT_BUDGET_CHARS = 200_000;
+
+/**
+ * Name of the environment variable that overrides
+ * DEFAULT_PROMPT_BUDGET_CHARS at runtime.
+ */
+export const PROMPT_BUDGET_ENV_VAR = "LLMWIKI_PROMPT_BUDGET_CHARS";
+
 /** Number of most relevant wiki pages to load for query context. */
 export const QUERY_PAGE_LIMIT = 5;
 
diff --git a/test/fixtures/aimock-helper.ts b/test/fixtures/aimock-helper.ts
@@ -95,6 +95,37 @@ export function mockOpenAIEnv(
   };
 }
 
+/**
+ * Scan aimock's recorded requests in order and return the system prompt of
+ * the first request whose user message satisfies `predicate`; return null
+ * when none does.
+ *
+ * Shared by aimock-backed CLI tests that assert "the LLM saw <X> in the
+ * system prompt". aimock normalises Anthropic's top-level `system` field
+ * into a `{role: "system", content: ...}` entry in `body.messages`, so each
+ * request is inspected for both roles and matched on its user message.
+ *
+ * Only string `content` values are considered. If a request carries several
+ * messages with the same role, the last one wins; a missing role is treated
+ * as the empty string.
+ */
+export function findSystemPromptByUserMessage(
+  handle: MockClaudeHandle,
+  predicate: (userMessage: string) => boolean,
+): string | null {
+  const recorded = handle.mock.getRequests() as Array<{ body?: unknown }>;
+  for (const { body: rawBody } of recorded) {
+    const body = rawBody as { messages?: unknown } | undefined;
+    const messages = body?.messages;
+    // Requests without a messages array cannot be matched.
+    if (!Array.isArray(messages)) continue;
+
+    const latest = { system: "", user: "" };
+    for (const { role, content } of messages as Array<{ role?: unknown; content?: unknown }>) {
+      if (typeof content !== "string") continue;
+      if (role === "system") latest.system = content;
+      else if (role === "user") latest.user = content;
+    }
+
+    if (predicate(latest.user)) return latest.system;
+  }
+  return null;
+}
+
 /** Live state managed by {@link useAimockLifecycle}. */
 export interface AimockLifecycle {
   /** Currently-running mock, or null between tests. Set by `start()`. */
diff --git a/test/output-language-query-integration.test.ts b/test/output-language-query-integration.test.ts
@@ -14,6 +14,7 @@ import { describe, it, expect } from "vitest";
 import { mkdir, writeFile } from "fs/promises";
 import path from "path";
 import {
+  findSystemPromptByUserMessage,
   mockClaudeEnv,
   useAimockLifecycle,
   type MockClaudeHandle,
@@ -66,25 +67,14 @@ function stubQueryResponses(handle: MockClaudeHandle): void {
   handle.mock.onMessage(/.*/, { content: ANSWER_TEXT });
 }
 
-/** Pull the system prompt for the answer-generation request out of aimock's recording. */
+/**
+ * Pull the system prompt for the answer-generation request out of aimock's
+ * recording. The answer-generation request includes "Relevant wiki pages:"
+ * in the user message; the page-selection request includes "Wiki Index:"
+ * instead, so the predicate disambiguates the two.
+ */
 function findAnswerSystemPrompt(handle: MockClaudeHandle): string | null {
-  const requests = handle.mock.getRequests() as Array<{ body?: unknown }>;
-  for (const req of requests) {
-    const body = req.body as { messages?: unknown } | undefined;
-    if (!Array.isArray(body?.messages)) continue;
-    let systemPrompt = "";
-    let userPrompt = "";
-    for (const msg of body.messages as Array<{ role?: unknown; content?: unknown }>) {
-      if (msg.role === "system" && typeof msg.content === "string") systemPrompt = msg.content;
-      if (msg.role === "user" && typeof msg.content === "string") userPrompt = msg.content;
-    }
-    // The answer-generation request includes "Relevant wiki pages:" in the user
-    // message; the page-selection request includes "Wiki Index:" instead.
-    if (userPrompt.includes("Relevant wiki pages:")) {
-      return systemPrompt;
-    }
-  }
-  return null;
+  return findSystemPromptByUserMessage(handle, (u) => u.includes("Relevant wiki pages:"));
 }
 
 describe("query --lang CLI integration (#37 query path)", () => {
diff --git a/test/prompt-blowup-integration.test.ts b/test/prompt-blowup-integration.test.ts
diff --git a/test/prompt-budget.test.ts b/test/prompt-budget.test.ts