Skip to content

Commit a4f6818

Browse files
authored
fix(compiler): bound per-concept prompt size to prevent blowup (#39) (#47)
1 parent 0f06c4f commit a4f6818

7 files changed

Lines changed: 454 additions & 20 deletions

File tree

src/compiler/index.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import {
4444
import { markOrphaned, orphanUnownedFrozenPages } from "./orphan.js";
4545
import { resolveLinks } from "./resolver.js";
4646
import { generateIndex } from "./indexgen.js";
47+
import { buildBudgetedCombinedContent, type SourceSlice } from "./prompt-budget.js";
4748
import { addObsidianMeta, generateMOC } from "./obsidian.js";
4849
import { updateEmbeddings } from "../utils/embeddings.js";
4950
import { writeCandidate } from "./candidates.js";
@@ -462,12 +463,19 @@ export function reconcileConceptMetadata(
462463
* contributing material rather than just the last source processed.
463464
* Metadata is reconciled across all contributing concepts via
464465
* reconcileConceptMetadata so contradictions from later sources are not lost.
466+
*
467+
* Combined content is then run through {@link buildBudgetedCombinedContent}
468+
* so popular concepts that appear in many overlapping sources do not blow
469+
* past the LLM provider's context window (issue #39). When the raw total
470+
* fits the budget, the output is byte-identical to the previous unbudgeted
471+
* concatenation.
465472
*/
466473
function mergeExtractions(
467474
extractions: ExtractionResult[],
468475
frozenSlugs: Set<string>,
469476
): MergedConcept[] {
470477
const bySlug = new Map<string, MergedConcept>();
478+
const slicesBySlug = new Map<string, SourceSlice[]>();
471479

472480
for (const result of extractions) {
473481
if (result.concepts.length === 0) continue;
@@ -480,18 +488,30 @@ function mergeExtractions(
480488
if (existing) {
481489
existing.concept = reconcileConceptMetadata(existing.concept, concept);
482490
existing.sourceFiles.push(result.sourceFile);
483-
existing.combinedContent += `\n\n--- SOURCE: ${result.sourceFile} ---\n\n${result.sourceContent}`;
484491
} else {
485492
bySlug.set(slug, {
486493
slug,
487494
concept,
488495
sourceFiles: [result.sourceFile],
489-
combinedContent: `--- SOURCE: ${result.sourceFile} ---\n\n${result.sourceContent}`,
496+
combinedContent: "",
490497
});
498+
slicesBySlug.set(slug, []);
491499
}
500+
slicesBySlug.get(slug)!.push({
501+
file: result.sourceFile,
502+
content: result.sourceContent,
503+
});
492504
}
493505
}
494506

507+
for (const merged of bySlug.values()) {
508+
const slices = slicesBySlug.get(merged.slug) ?? [];
509+
merged.combinedContent = buildBudgetedCombinedContent(
510+
merged.concept.concept,
511+
slices,
512+
);
513+
}
514+
495515
return Array.from(bySlug.values());
496516
}
497517

src/compiler/prompt-budget.ts

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/**
2+
* Per-concept prompt-budget enforcement (issue #39).
3+
*
4+
* When the same concept is extracted from many overlapping sources, the
5+
* page-generation prompt would otherwise concatenate every full source
6+
* — linear in source count — and reliably blow past the LLM provider's
7+
* context window. This module clips each contributing source's slice to
8+
* a fair share of a configurable total budget and emits a single warning
9+
* when truncation kicks in.
10+
*
11+
* The fix is deliberately defensive (equal-share truncation) rather than
12+
* smart (semantic ranking / summarisation). It prevents crashes while a
13+
* deeper retrieval-driven solution is designed.
14+
*/
15+
16+
import * as output from "../utils/output.js";
17+
import {
18+
DEFAULT_PROMPT_BUDGET_CHARS,
19+
PROMPT_BUDGET_ENV_VAR,
20+
} from "../utils/constants.js";
21+
22+
/**
 * Suffix attached to a source's content after that content has been clipped
 * to fit the prompt budget.
 */
const TRUNCATION_MARKER = "\n\n[…truncated for prompt budget — see #39…]";

/**
 * One source document's contribution to a concept's combined prompt content.
 */
export interface SourceSlice {
  /** Name of the source file (for example "ml-paper.md"); rendered in the section header of the prompt. */
  file: string;
  /** Source text as extracted, prior to any budget clipping. */
  content: string;
}
32+
33+
/**
 * Resolve the active prompt-budget character cap.
 *
 * Reads the `LLMWIKI_PROMPT_BUDGET_CHARS` env var and uses it only when the
 * whole value (after trimming surrounding whitespace) is a positive decimal
 * integer. Anything else — unset, empty, zero, negative, fractional, or
 * containing stray characters such as `200k`, `200_000` or `1e6` — falls
 * back to `DEFAULT_PROMPT_BUDGET_CHARS`, so a typo can't accidentally
 * truncate every prompt to (almost) nothing.
 *
 * @returns The budget, in characters, for one concept's combined source content.
 */
export function resolvePromptBudgetChars(): number {
  const raw = process.env[PROMPT_BUDGET_ENV_VAR]?.trim();
  if (!raw) return DEFAULT_PROMPT_BUDGET_CHARS;

  // Validate the entire string up front. Number.parseInt would happily accept
  // a numeric prefix ("200k" -> 200, "1e6" -> 1), turning a typo into a tiny
  // budget instead of ignoring it.
  if (!/^\+?\d+$/.test(raw)) return DEFAULT_PROMPT_BUDGET_CHARS;

  const parsed = Number(raw);
  // Absurdly long digit strings overflow to Infinity; "0" is not a usable cap.
  if (!Number.isFinite(parsed) || parsed <= 0) return DEFAULT_PROMPT_BUDGET_CHARS;
  return parsed;
}
47+
48+
/**
49+
* Combine per-source slices into the single content blob the LLM prompt
50+
* receives, applying a fair-share budget when the raw total would exceed
51+
* the configured ceiling. When no truncation is needed the output is
52+
* byte-identical to the previous unbudgeted concatenation, so existing
53+
* compile output is unchanged for typical workloads.
54+
*
55+
* @param concept - Human-readable concept title (used in the warning only).
56+
* @param slices - One entry per contributing source, in arrival order.
57+
* @returns The combined content string suitable for buildPagePrompt.
58+
*/
59+
export function buildBudgetedCombinedContent(
60+
concept: string,
61+
slices: SourceSlice[],
62+
): string {
63+
const budget = resolvePromptBudgetChars();
64+
const totalRaw = slices.reduce((sum, s) => sum + s.content.length, 0);
65+
66+
if (totalRaw <= budget) {
67+
return formatSlices(slices);
68+
}
69+
70+
const perSource = Math.max(1, Math.floor(budget / slices.length));
71+
warnTruncation(concept, totalRaw, slices.length, perSource, budget);
72+
73+
const trimmed = slices.map((s) =>
74+
s.content.length > perSource
75+
? { ...s, content: s.content.slice(0, perSource) + TRUNCATION_MARKER }
76+
: s,
77+
);
78+
return formatSlices(trimmed);
79+
}
80+
81+
/**
 * Serialise slices into prompt text: each slice becomes a
 * `--- SOURCE: <file> ---` header, a blank line, then its content, and
 * consecutive sections are separated by a blank line.
 */
function formatSlices(slices: SourceSlice[]): string {
  const sections: string[] = [];
  for (const { file, content } of slices) {
    sections.push(`--- SOURCE: ${file} ---\n\n${content}`);
  }
  return sections.join("\n\n");
}
87+
88+
/**
 * Report that a concept's combined source content exceeded the prompt budget.
 *
 * Builds one message naming the concept, the raw total, the number of
 * sources, the budget and the per-source cap, then passes it through
 * `output.warn` and hands the result to `output.status` with a "!" marker.
 *
 * @param concept - Concept title quoted in the message.
 * @param totalRaw - Summed length of all source contents before clipping.
 * @param sourceCount - Number of contributing sources.
 * @param perSource - Character cap applied to each source.
 * @param budget - Total character budget that was exceeded.
 */
function warnTruncation(
  concept: string,
  totalRaw: number,
  sourceCount: number,
  perSource: number,
  budget: number,
): void {
  const message = [
    `Combined source content for "${concept}" (${totalRaw.toLocaleString()} chars across `,
    `${sourceCount} sources) exceeds the ${budget.toLocaleString()}-char prompt budget; `,
    `truncating each source to ~${perSource.toLocaleString()} chars. `,
    `Raise via ${PROMPT_BUDGET_ENV_VAR} when running against larger-context models.`,
  ].join("");

  output.status("!", output.warn(message));
}

src/utils/constants.ts

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,25 @@ export const MAX_SOURCE_CHARS = 100_000;
99
/** Minimum source content length to ingest without a warning. */
1010
export const MIN_SOURCE_CHARS = 50;
1111

12+
/**
13+
* Default character budget for the combined source content sent to the LLM
14+
* during page generation for a single concept (issue #39).
15+
*
16+
* Caps the per-prompt content at ~200,000 chars (~50k tokens). When the
17+
* sources contributing to a concept have a combined raw size that
18+
* exceeds this budget, each source's slice is clipped to an equal share so
19+
* the prompt fits the model's context window. Without this cap, popular
20+
* concepts that appear in many overlapping documents reliably blow past
21+
* the LLM provider's context limit and the compile crashes.
22+
*
23+
* Override via the LLMWIKI_PROMPT_BUDGET_CHARS env var when running against
24+
* larger-context (raise) or smaller-context (lower) models.
25+
*/
26+
export const DEFAULT_PROMPT_BUDGET_CHARS = 200_000;
27+
28+
/** Env var that overrides DEFAULT_PROMPT_BUDGET_CHARS at runtime. */
29+
export const PROMPT_BUDGET_ENV_VAR = "LLMWIKI_PROMPT_BUDGET_CHARS";
30+
1231
/** Number of most relevant wiki pages to load for query context. */
1332
export const QUERY_PAGE_LIMIT = 5;
1433

test/fixtures/aimock-helper.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,37 @@ export function mockOpenAIEnv(
9595
};
9696
}
9797

98+
/**
99+
* Walk aimock's recorded requests and return the system-prompt content
100+
* from the first request whose user-message content satisfies the
101+
* predicate. Returns null when no matching request is found.
102+
*
103+
* Centralised because every aimock-backed CLI test that wants to assert
104+
* "the LLM saw <X> in the system prompt" has to slice the same way:
105+
* aimock normalises Anthropic's top-level `system` field into a
106+
* `{role: "system", content: ...}` message in `body.messages`, so the
107+
* walker has to inspect both system and user messages per request and
108+
* disambiguate by user-message content.
109+
*/
110+
export function findSystemPromptByUserMessage(
111+
handle: MockClaudeHandle,
112+
predicate: (userMessage: string) => boolean,
113+
): string | null {
114+
const requests = handle.mock.getRequests() as Array<{ body?: unknown }>;
115+
for (const req of requests) {
116+
const body = req.body as { messages?: unknown } | undefined;
117+
if (!Array.isArray(body?.messages)) continue;
118+
let systemPrompt = "";
119+
let userPrompt = "";
120+
for (const msg of body.messages as Array<{ role?: unknown; content?: unknown }>) {
121+
if (msg.role === "system" && typeof msg.content === "string") systemPrompt = msg.content;
122+
if (msg.role === "user" && typeof msg.content === "string") userPrompt = msg.content;
123+
}
124+
if (predicate(userPrompt)) return systemPrompt;
125+
}
126+
return null;
127+
}
128+
98129
/** Live state managed by {@link useAimockLifecycle}. */
99130
export interface AimockLifecycle {
100131
/** Currently-running mock, or null between tests. Set by `start()`. */

test/output-language-query-integration.test.ts

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import { describe, it, expect } from "vitest";
1414
import { mkdir, writeFile } from "fs/promises";
1515
import path from "path";
1616
import {
17+
findSystemPromptByUserMessage,
1718
mockClaudeEnv,
1819
useAimockLifecycle,
1920
type MockClaudeHandle,
@@ -66,25 +67,14 @@ function stubQueryResponses(handle: MockClaudeHandle): void {
6667
handle.mock.onMessage(/.*/, { content: ANSWER_TEXT });
6768
}
6869

69-
/** Pull the system prompt for the answer-generation request out of aimock's recording. */
70+
/**
71+
* Pull the system prompt for the answer-generation request out of aimock's
72+
* recording. The answer-generation request includes "Relevant wiki pages:"
73+
* in the user message; the page-selection request includes "Wiki Index:"
74+
* instead, so the predicate disambiguates the two.
75+
*/
7076
function findAnswerSystemPrompt(handle: MockClaudeHandle): string | null {
71-
const requests = handle.mock.getRequests() as Array<{ body?: unknown }>;
72-
for (const req of requests) {
73-
const body = req.body as { messages?: unknown } | undefined;
74-
if (!Array.isArray(body?.messages)) continue;
75-
let systemPrompt = "";
76-
let userPrompt = "";
77-
for (const msg of body.messages as Array<{ role?: unknown; content?: unknown }>) {
78-
if (msg.role === "system" && typeof msg.content === "string") systemPrompt = msg.content;
79-
if (msg.role === "user" && typeof msg.content === "string") userPrompt = msg.content;
80-
}
81-
// The answer-generation request includes "Relevant wiki pages:" in the user
82-
// message; the page-selection request includes "Wiki Index:" instead.
83-
if (userPrompt.includes("Relevant wiki pages:")) {
84-
return systemPrompt;
85-
}
86-
}
87-
return null;
77+
return findSystemPromptByUserMessage(handle, (u) => u.includes("Relevant wiki pages:"));
8878
}
8979

9080
describe("query --lang CLI integration (#37 query path)", () => {

0 commit comments

Comments
 (0)