atomicmemory · ethanj · May 1, 2026 · May 1, 2026
diff --git a/README.md b/README.md
@@ -229,17 +229,16 @@ confidence: 0.82           # 0–1, LLM-reported confidence in the synthesized p
 provenanceState: merged    # extracted | merged | inferred | ambiguous
 contradictedBy:
   - slug: probabilistic-reasoning
-inferredParagraphs: 1      # paragraphs the LLM marked as inferred (vs cited)
 ---
 ```
 
-When multiple sources merge into one slug, metadata is reconciled: `min` confidence, `provenanceState = 'merged'`, union of `contradictedBy` (deduped by slug), `max` `inferredParagraphs`.
+When multiple sources merge into one slug, metadata is reconciled: `min` confidence, `provenanceState = 'merged'`, union of `contradictedBy` (deduped by slug).
 
 `llmwiki lint` adds three rules that surface this metadata:
 
 - `low-confidence` — flags pages with `confidence` below a threshold
 - `contradicted-page` — flags pages with non-empty `contradictedBy`
-- `excess-inferred-paragraphs` — flags pages with too many inferred paragraphs without citations
+- `excess-inferred-paragraphs` — flags pages whose body has too many uncited prose paragraphs (counted directly from the rendered text; the body is the single source of truth, and no frontmatter field is consulted)
 
 ## Claim-level provenance
 

diff --git a/src/compiler/index.ts b/src/compiler/index.ts
@@ -422,7 +422,9 @@ interface MergedConcept {
  * - confidence: min (most pessimistic value wins)
  * - provenanceState: always 'merged' once two sources are involved
  * - contradictedBy: union by slug (deduplicating on slug identity)
- * - inferredParagraphs: max (any source claiming inference wins)
+ *
+ * `inferredParagraphs` is no longer reconciled — it is derived from the
+ * rendered page body at lint time, not from extraction metadata.
  */
 export function reconcileConceptMetadata(
   existing: ExtractedConcept,
@@ -451,13 +453,6 @@ export function reconcileConceptMetadata(
   }
   reconciled.contradictedBy = refs.length > 0 ? refs : undefined;
 
-  // Max inferredParagraphs — any source flagging inference raises the count.
-  if (typeof incoming.inferredParagraphs === "number") {
-    reconciled.inferredParagraphs = typeof existing.inferredParagraphs === "number"
-      ? Math.max(existing.inferredParagraphs, incoming.inferredParagraphs)
-      : incoming.inferredParagraphs;
-  }
-
   return reconciled;
 }
 

diff --git a/src/compiler/prompts.ts b/src/compiler/prompts.ts
@@ -88,11 +88,6 @@ export const CONCEPT_EXTRACTION_TOOL = {
               },
               description: "Slugs of other concepts whose evidence contradicts this one.",
             },
-            inferred_paragraphs: {
-              type: "integer",
-              description:
-                "Estimated number of paragraphs in the page that will be inferred rather than directly cited.",
-            },
           },
           required: ["concept", "summary", "is_new"],
         },
@@ -134,8 +129,6 @@ export function buildExtractionPrompt(
     "    or 'ambiguous' if the source is contradictory or unclear.",
     "  - contradicted_by: slugs of other concepts (in this batch or the index)",
     "    whose evidence conflicts with this one.",
-    "  - inferred_paragraphs: estimated number of paragraphs in the resulting",
-    "    page that will be inferred rather than directly citable.",
     indexSection,
     "\n\n--- SOURCE DOCUMENT ---\n\n",
     sourceContent,
@@ -206,7 +199,6 @@ interface RawConcept {
   confidence?: unknown;
   provenance_state?: unknown;
   contradicted_by?: unknown;
-  inferred_paragraphs?: unknown;
 }
 
 /** True if the raw concept has the required string/boolean fields. */
@@ -248,10 +240,6 @@ function mapRawConcept(c: RawConcept): ExtractedConcept {
     confidence: typeof c.confidence === "number" ? c.confidence : undefined,
     provenanceState: provenance,
     contradictedBy: coerceContradictedBy(c.contradicted_by),
-    inferredParagraphs: typeof c.inferred_paragraphs === "number" &&
-      Number.isInteger(c.inferred_paragraphs) && c.inferred_paragraphs >= 0
-      ? c.inferred_paragraphs
-      : undefined,
   };
 }
 

diff --git a/src/compiler/provenance.ts b/src/compiler/provenance.ts
@@ -30,9 +30,6 @@ export function addProvenanceMeta(
   if (concept.contradictedBy && concept.contradictedBy.length > 0) {
     fields.contradictedBy = concept.contradictedBy;
   }
-  if (typeof concept.inferredParagraphs === "number") {
-    fields.inferredParagraphs = concept.inferredParagraphs;
-  }
 }
 
 /**

diff --git a/src/linter/rules.ts b/src/linter/rules.ts
@@ -288,18 +288,22 @@ export async function checkContradictedPages(root: string): Promise<LintResult[]
 }
 
 /**
- * Flag pages with too many inferred paragraphs unsupported by direct citations.
- * Uses the metadata-reported count when present and falls back to counting
- * uncited prose paragraphs in the body.
+ * Flag pages with too many inferred paragraphs unsupported by direct
+ * citations. The count is always derived from the rendered page body:
+ * the body is the single source of truth and no metadata field is
+ * consulted. Earlier versions trusted an LLM-estimated
+ * `inferredParagraphs` frontmatter field, but that estimate was made
+ * before the page existed and routinely disagreed with what the model
+ * actually produced. Counting uncited prose paragraphs in the rendered
+ * body matches what a reviewer sees and stays correct after hand-edits.
  */
 export async function checkInferredWithoutCitations(root: string): Promise<LintResult[]> {
   const pages = await collectAllPages(root);
   const results: LintResult[] = [];
 
   for (const page of pages) {
-    const { meta, body } = parseFrontmatter(page.content);
-    const provenance = parseProvenanceMetadata(meta);
-    const inferred = provenance.inferredParagraphs ?? countUncitedProseParagraphs(body);
+    const { body } = parseFrontmatter(page.content);
+    const inferred = countUncitedProseParagraphs(body);
     if (inferred <= MAX_INFERRED_PARAGRAPHS_WITHOUT_CITATIONS) continue;
     results.push({
       rule: "excess-inferred-paragraphs",
@@ -312,8 +316,15 @@ export async function checkInferredWithoutCitations(root: string): Promise<LintR
   return results;
 }
 
-/** Match a paragraph that looks like prose (not a heading, list, or code block). */
-const PROSE_PARAGRAPH_LEAD = /^[A-Za-z]/;
+/**
+ * Match text whose first character is a letter, i.e. a paragraph that
+ * looks like prose (not a heading, list, or code block). Uses the
+ * Unicode `Letter` general category (`\p{L}`) so non-ASCII pages
+ * generated via `--lang Chinese`, `--lang Japanese`, etc. (#46) are
+ * still detected — the previous `[A-Za-z]` form silently dropped CJK,
+ * Cyrillic, Greek, and Arabic prose, leaving
+ * `excess-inferred-paragraphs` blind on those pages.
+const PROSE_PARAGRAPH_LEAD = /^\p{L}/u;
 
 /** Count prose paragraphs in a body that lack a ^[citation] marker. */
 function countUncitedProseParagraphs(body: string): number {

diff --git a/src/utils/markdown.ts b/src/utils/markdown.ts
@@ -268,16 +268,15 @@ function parseContradictedBy(raw: unknown): ContradictionRef[] | undefined {
   return refs.length > 0 ? refs : undefined;
 }
 
-/** Parse the inferred paragraph count, requiring a non-negative integer. */
-function parseInferredParagraphs(raw: unknown): number | undefined {
-  if (typeof raw !== "number" || !Number.isInteger(raw) || raw < 0) return undefined;
-  return raw;
-}
-
 /**
  * Extract provenance metadata fields from a parsed frontmatter record.
  * Defensively handles missing or malformed values so existing pages without
  * the new fields continue to parse correctly.
+ *
+ * Note: legacy pages may still carry an `inferredParagraphs` frontmatter
+ * field from earlier compiles. It is intentionally not parsed here:
+ * the lint rule derives the count from the rendered body instead, so
+ * any such leftover field is ignored.
  * @param meta - Raw frontmatter object as returned by parseFrontmatter.
  * @returns Typed provenance metadata with only the fields that were present.
  */
@@ -288,7 +287,6 @@ export function parseProvenanceMetadata(
     confidence: parseConfidence(meta.confidence),
     provenanceState: parseProvenanceState(meta.provenanceState),
     contradictedBy: parseContradictedBy(meta.contradictedBy),
-    inferredParagraphs: parseInferredParagraphs(meta.inferredParagraphs),
   };
 }
 

diff --git a/src/utils/types.ts b/src/utils/types.ts
@@ -25,16 +25,22 @@ export interface ContradictionRef {
 
 /**
  * Provenance metadata shared between extraction-time concept records and
- * page-frontmatter records. Both surfaces carry the same four optional
- * fields — confidence, lifecycle state, contradictions, and inferred
- * paragraph count — so a single shared shape keeps the two ends of the
- * pipeline from drifting apart as new fields are added.
+ * page-frontmatter records. Both surfaces carry the same three optional
+ * fields — confidence, lifecycle state, and contradictions — so a single
+ * shared shape keeps the two ends of the pipeline from drifting apart as
+ * new fields are added.
  *
  * Extended by {@link ExtractedConcept} and {@link WikiFrontmatter} via
  * `interface … extends ProvenanceMetadata`, so the JSON shapes
  * serialised on disk and over the LLM tool boundary stay byte-identical
  * to the previous flat layout (TypeScript erases the indirection at
  * compile time).
+ *
+ * `inferredParagraphs` used to live here too, but it was an unreliable
+ * extraction-time guess about the future page body. The count is now
+ * derived from the rendered body at lint time (see
+ * `checkInferredWithoutCitations`): the body is the single source of
+ * truth, and no metadata field is involved.
  */
 export interface ProvenanceMetadata {
   /** Numeric confidence in 0..1 — overall confidence in the content. */
@@ -43,8 +49,6 @@ export interface ProvenanceMetadata {
   provenanceState?: ProvenanceState;
   /** Slugs of other concepts/pages whose evidence contradicts this one. */
   contradictedBy?: ContradictionRef[];
-  /** Number of paragraphs that are inferred rather than directly extracted. */
-  inferredParagraphs?: number;
 }
 
 /** A single concept extracted from a source by the LLM. */

diff --git a/test/compile-claim-provenance.test.ts b/test/compile-claim-provenance.test.ts
@@ -35,7 +35,6 @@ function buildExtractionResponse(): string {
         confidence: 0.9,
         provenance_state: "extracted",
         contradicted_by: [],
-        inferred_paragraphs: 0,
       },
     ],
   });

diff --git a/test/compile-provenance.test.ts b/test/compile-provenance.test.ts
@@ -35,7 +35,6 @@ function buildExtractionResponse(): string {
         confidence: 0.3,
         provenance_state: "inferred",
         contradicted_by: [{ slug: "other", reason: "conflicting evidence" }],
-        inferred_paragraphs: 2,
       },
     ],
   });
@@ -69,7 +68,6 @@ describe("compile-path provenance metadata", () => {
     expect(provenance.contradictedBy).toEqual([
       { slug: "other", reason: "conflicting evidence" },
     ]);
-    expect(provenance.inferredParagraphs).toBe(2);
   });
 
   it("emits a contradiction warning to console during compilation", async () => {

diff --git a/test/confidence-metadata-integration.test.ts b/test/confidence-metadata-integration.test.ts
@@ -142,15 +142,24 @@ describe("confidence metadata — CLI lint integration", () => {
   // excess-inferred-paragraphs rule
   // -------------------------------------------------------------------------
 
-  it("reports excess-inferred-paragraphs when inferredParagraphs > 2 with no citations", async () => {
+  it("reports excess-inferred-paragraphs when the body has too many uncited prose paragraphs", async () => {
+    // Each uncited prose paragraph contributes to the count; five exceeds
+    // the max of two. Body is the only signal — the lint rule no longer
+    // reads any frontmatter inferredParagraphs field.
+    const body = [
+      "First uncited prose paragraph.",
+      "Second uncited prose paragraph.",
+      "Third uncited prose paragraph.",
+      "Fourth uncited prose paragraph.",
+      "Fifth uncited prose paragraph.",
+    ].join("\n\n");
     const content = buildPageContent(
-      { title: "Inferred Concept", summary: "Mostly inferred.", inferredParagraphs: "5" },
-      "This page declares five inferred paragraphs, exceeding the maximum of two.",
+      { title: "Inferred Concept", summary: "Mostly inferred." },
+      body,
     );
     const root = await createWikiFixture("inferred", content);
     try {
       const { stdout } = await runLint(root);
-      // Assert on the message count and threshold text.
       expect(stdout).toContain("5 inferred paragraphs");
       expect(stdout).toContain("max 2");
     } finally {
@@ -200,19 +209,26 @@ describe("confidence metadata — CLI lint integration", () => {
   // -------------------------------------------------------------------------
 
   it("surfaces all three new rule messages when a page violates all constraints", async () => {
-    // Use raw YAML array syntax for contradictedBy alongside other scalar fields.
+    // The inferred-paragraphs rule now derives its count from the body —
+    // include enough uncited prose paragraphs to trigger it alongside
+    // the low-confidence and contradiction signals.
+    const body = [
+      "This page deliberately violates all three new lint rules.",
+      "Second uncited prose paragraph here.",
+      "Third uncited prose paragraph here.",
+      "Fourth uncited prose paragraph here.",
+    ].join("\n\n");
     const content = [
       "---",
       ...sharedFrontmatterLines(),
       "title: All Flags Concept",
       "summary: Triggers every new rule.",
       "confidence: 0.1",
-      "inferredParagraphs: 4",
       "contradictedBy:",
       "  - slug: rival-page",
       "---",
       "",
-      "This page deliberately violates all three new lint rules.",
+      body,
     ].join("\n");
     const root = await createWikiFixture("all-flags", content);
     try {