From 77331c50bc3de8e99b49bd88dbc18509d9f83c21 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sun, 15 Mar 2026 00:19:03 -0400 Subject: [PATCH 1/7] Add 'rendering-strategy', 'tabbed-content-serialization', 'section-header-quality' --- README.md | 27 +- package-lock.json | 135 +++- package.json | 1 + .../section-header-quality.ts | 295 +++++++- .../tabbed-content-serialization.ts | 229 +++++- src/checks/index.ts | 1 + .../page-size/content-start-position.ts | 12 +- src/checks/page-size/page-size-html.ts | 14 +- src/checks/page-size/rendering-strategy.ts | 182 +++++ src/helpers/detect-rendering.ts | 115 +++ src/helpers/detect-tabs.ts | 354 ++++++++++ src/helpers/fetch-page.ts | 23 + src/helpers/index.ts | 5 + src/runner.ts | 1 + src/types.ts | 9 + test/integration/check-pipeline.test.ts | 52 ++ test/integration/cli.test.ts | 2 +- test/unit/checks/rendering-strategy.test.ts | 251 +++++++ .../checks/section-header-quality.test.ts | 594 ++++++++++++++++ .../tabbed-content-serialization.test.ts | 385 +++++++++++ test/unit/helpers/detect-rendering.test.ts | 165 +++++ test/unit/helpers/detect-tabs.test.ts | 654 ++++++++++++++++++ test/unit/helpers/fetch-page.test.ts | 102 +++ test/unit/runner.test.ts | 2 +- 24 files changed, 3562 insertions(+), 48 deletions(-) create mode 100644 src/checks/page-size/rendering-strategy.ts create mode 100644 src/helpers/detect-rendering.ts create mode 100644 src/helpers/detect-tabs.ts create mode 100644 src/helpers/fetch-page.ts create mode 100644 test/unit/checks/rendering-strategy.test.ts create mode 100644 test/unit/checks/section-header-quality.test.ts create mode 100644 test/unit/checks/tabbed-content-serialization.test.ts create mode 100644 test/unit/helpers/detect-rendering.test.ts create mode 100644 test/unit/helpers/detect-tabs.test.ts create mode 100644 test/unit/helpers/fetch-page.test.ts diff --git a/README.md b/README.md index 38d6f71..33297f2 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Test your documentation site 
against the [Agent-Friendly Documentation Spec](htt Agents don't use docs like humans. They hit truncation limits, get walls of CSS instead of content, can't follow cross-host redirects, and don't know about quality-of-life improvements like `llms.txt` or `.md` docs pages that would make life swell. Maybe this is because the industry has lacked guidance - until now. -afdocs runs 21 checks across 8 categories to evaluate how well your docs serve agent consumers. 16 are fully implemented; the rest return `skip` until completed. +afdocs runs 22 checks across 8 categories to evaluate how well your docs serve agent consumers. 19 are fully implemented; the rest return `skip` until completed. > **Status: Early development (0.x)** > This project is under active development. Check IDs, CLI flags, and output formats may change between minor versions. Feel free to try it out, but don't build automation against specific output until 1.0. @@ -43,7 +43,7 @@ Authentication ✓ auth-gate-detection: All 50 sampled pages are publicly accessible Summary - 9 passed, 3 failed, 9 skipped (21 total) + 9 passed, 3 failed, 10 skipped (22 total) ``` ## Install @@ -144,7 +144,7 @@ describe('agent-friendliness', () => { ## Checks -21 checks across 8 categories. Checks marked with \* are not yet implemented and return `skip`. +22 checks across 8 categories. Checks marked with \* are not yet implemented and return `skip`. 
### Category 1: llms.txt @@ -165,19 +165,20 @@ describe('agent-friendliness', () => { ### Category 3: Page Size and Truncation Risk -| Check | Description | -| ------------------------ | ------------------------------------------------ | -| `page-size-markdown` | Character count when served as markdown | -| `page-size-html` | Character count of HTML and post-conversion size | -| `content-start-position` | How far into the response actual content begins | +| Check | Description | +| ------------------------ | --------------------------------------------------------------- | +| `rendering-strategy` | Whether pages contain server-rendered content or are SPA shells | +| `page-size-markdown` | Character count when served as markdown | +| `page-size-html` | Character count of HTML and post-conversion size | +| `content-start-position` | How far into the response actual content begins | ### Category 4: Content Structure -| Check | Description | -| --------------------------------- | -------------------------------------------------- | -| `tabbed-content-serialization` \* | Whether tabbed content creates oversized output | -| `section-header-quality` \* | Whether headers in tabbed sections include context | -| `markdown-code-fence-validity` | Whether markdown has unclosed code fences | +| Check | Description | +| ------------------------------ | -------------------------------------------------- | +| `tabbed-content-serialization` | Whether tabbed content creates oversized output | +| `section-header-quality` | Whether headers in tabbed sections include context | +| `markdown-code-fence-validity` | Whether markdown has unclosed code fences | ### Category 5: URL Stability and Redirects diff --git a/package-lock.json b/package-lock.json index 7cbc6aa..d780a0d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "chalk": "^5.4.1", "commander": "^13.1.0", + "node-html-parser": "^7.1.0", "turndown": "^7.2.2", "yaml": "^2.7.0" }, @@ -32,7 
+33,7 @@ "vitest": "^4.0.18" }, "engines": { - "node": ">=20" + "node": ">=22" } }, "node_modules/@babel/helper-string-parser": { @@ -1807,6 +1808,12 @@ "node": "20 || >=22" } }, + "node_modules/boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==", + "license": "ISC" + }, "node_modules/brace-expansion": { "version": "5.0.2", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.2.tgz", @@ -2042,6 +2049,34 @@ "node": ">= 8" } }, + "node_modules/css-select": { + "version": "5.2.2", + "resolved": "https://registry.npmjs.org/css-select/-/css-select-5.2.2.tgz", + "integrity": "sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0", + "css-what": "^6.1.0", + "domhandler": "^5.0.2", + "domutils": "^3.0.1", + "nth-check": "^2.0.1" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, + "node_modules/css-what": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/css-what/-/css-what-6.2.2.tgz", + "integrity": "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA==", + "license": "BSD-2-Clause", + "engines": { + "node": ">= 6" + }, + "funding": { + "url": "https://github.com/sponsors/fb55" + } + }, "node_modules/debug": { "version": "4.4.3", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", @@ -2067,6 +2102,61 @@ "dev": true, "license": "MIT" }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "license": "MIT", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": 
"^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "license": "BSD-2-Clause" + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "license": "BSD-2-Clause", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "license": "BSD-2-Clause", + "dependencies": { + "dom-serializer": "^2.0.0", + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, "node_modules/emoji-regex": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", @@ -2074,6 +2164,18 @@ "dev": true, "license": "MIT" }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "license": "BSD-2-Clause", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, 
"node_modules/environment": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/environment/-/environment-1.1.0.tgz", @@ -2513,6 +2615,15 @@ "node": ">=8" } }, + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", + "license": "MIT", + "bin": { + "he": "bin/he" + } + }, "node_modules/headers-polyfill": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/headers-polyfill/-/headers-polyfill-4.0.3.tgz", @@ -3166,6 +3277,28 @@ "dev": true, "license": "MIT" }, + "node_modules/node-html-parser": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/node-html-parser/-/node-html-parser-7.1.0.tgz", + "integrity": "sha512-iJo8b2uYGT40Y8BTyy5ufL6IVbN8rbm/1QK2xffXU/1a/v3AAa0d1YAoqBNYqaS4R/HajkWIpIfdE6KcyFh1AQ==", + "license": "MIT", + "dependencies": { + "css-select": "^5.1.0", + "he": "1.2.0" + } + }, + "node_modules/nth-check": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz", + "integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==", + "license": "BSD-2-Clause", + "dependencies": { + "boolbase": "^1.0.0" + }, + "funding": { + "url": "https://github.com/fb55/nth-check?sponsor=1" + } + }, "node_modules/obug": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/obug/-/obug-2.1.1.tgz", diff --git a/package.json b/package.json index b9b2626..a1cd25d 100644 --- a/package.json +++ b/package.json @@ -60,6 +60,7 @@ "dependencies": { "chalk": "^5.4.1", "commander": "^13.1.0", + "node-html-parser": "^7.1.0", "turndown": "^7.2.2", "yaml": "^2.7.0" }, diff --git a/src/checks/content-structure/section-header-quality.ts b/src/checks/content-structure/section-header-quality.ts index 3010ab0..82484ec 100644 --- a/src/checks/content-structure/section-header-quality.ts +++ 
b/src/checks/content-structure/section-header-quality.ts @@ -1,12 +1,291 @@ +import { parse } from 'node-html-parser'; import { registerCheck } from '../registry.js'; -import type { CheckContext, CheckResult } from '../../types.js'; +import type { CheckContext, CheckResult, CheckStatus } from '../../types.js'; +import type { DetectedTabGroup } from '../../helpers/detect-tabs.js'; + +interface TabbedPageResult { + url: string; + tabGroups: DetectedTabGroup[]; + totalTabbedChars: number; + status: CheckStatus; + error?: string; +} + +interface GroupHeaderAnalysis { + url: string; + framework: string; + totalHeaders: number; + genericHeaders: number; + contextualHeaders: number; + hasGenericMajority: boolean; + hasCrossGroupGeneric: boolean; +} + +const MD_HEADING_RE = /^#{1,6}\s+(.+)$/gm; + +/** + * Extract header text from content that may be HTML, markdown, or a mix (MDX). + * Tries HTML parsing first, then falls back to markdown heading regex. + */ +function extractHeaders(content: string): string[] { + const headers: string[] = []; + + // HTML headers + const root = parse(content); + const htmlHeaders = root.querySelectorAll('h1, h2, h3, h4, h5, h6'); + for (const h of htmlHeaders) { + const text = h.textContent.trim(); + if (text.length > 0) headers.push(text); + } + + // Markdown headers (## Heading) + let match; + while ((match = MD_HEADING_RE.exec(content)) !== null) { + const text = match[1].trim(); + if (text.length > 0) headers.push(text); + } + + return headers; +} + +async function check(ctx: CheckContext): Promise { + const id = 'section-header-quality'; + const category = 'content-structure'; + + const tabResult = ctx.previousResults.get('tabbed-content-serialization'); + + if (!tabResult || tabResult.status === 'skip') { + return { + id, + category, + status: 'skip', + message: 'Skipped: tabbed-content-serialization did not run', + }; + } + + const tabbedPages = (tabResult.details?.tabbedPages as TabbedPageResult[] | undefined) ?? 
[]; + const pagesWithGroups = tabbedPages.filter((p) => p.tabGroups && p.tabGroups.length > 0); + + if (pagesWithGroups.length === 0) { + return { + id, + category, + status: 'pass', + message: 'No tabbed content found; header quality check not applicable', + }; + } + + const analyses: GroupHeaderAnalysis[] = []; + // Track unique headers per analysis for cross-group pass + const analysisHeaderSets: Set[] = []; + + for (const page of pagesWithGroups) { + for (const group of page.tabGroups) { + if (group.panels.length < 2) continue; + + // Extract headers from each panel + const panelHeaders: Array<{ label: string | null; headers: string[] }> = group.panels.map( + (panel) => ({ + label: panel.label, + headers: extractHeaders(panel.html), + }), + ); + + // Count how many times each header text appears across panels + const headerCounts = new Map(); + const uniqueHeaders = new Set(); + for (const ph of panelHeaders) { + for (const h of ph.headers) { + const lower = h.toLowerCase(); + headerCounts.set(lower, (headerCounts.get(lower) ?? 0) + 1); + uniqueHeaders.add(lower); + } + } + + const allHeaders = panelHeaders.flatMap((ph) => ph.headers); + let genericCount = 0; + let contextualCount = 0; + + for (const ph of panelHeaders) { + for (const h of ph.headers) { + const lower = h.toLowerCase(); + const appearsInMultiple = (headerCounts.get(lower) ?? 
0) >= 2; + + // A header is contextual if it includes the panel label or is unique + const includesLabel = ph.label != null && lower.includes(ph.label.toLowerCase()); + + if (includesLabel || !appearsInMultiple) { + contextualCount++; + } else { + genericCount++; + } + } + } + + const totalHeaders = allHeaders.length; + const hasGenericMajority = totalHeaders > 0 && genericCount > totalHeaders / 2; + + analysisHeaderSets.push(uniqueHeaders); + analyses.push({ + url: page.url, + framework: group.framework, + totalHeaders, + genericHeaders: genericCount, + contextualHeaders: contextualCount, + hasGenericMajority, + hasCrossGroupGeneric: false, + }); + } + } + + // Cross-group analysis: detect identical headers repeated across separate tab groups + // on the same page without variant context (e.g. "Build a MongoDB Search Query" + // appearing in 7 driver-specific tab groups). + let crossGroupGenericGroupCount = 0; + let crossGroupTotalGroupCount = 0; + const crossGroupRepeatedHeaders: Array<{ url: string; header: string; groupCount: number }> = []; + + for (const page of pagesWithGroups) { + if (page.tabGroups.length < 2) continue; + + // Collect all panel labels and unique headers per group + const allLabels = new Set(); + const perGroup: Set[] = []; + for (const group of page.tabGroups) { + const headers = new Set(); + for (const panel of group.panels) { + if (panel.label) allLabels.add(panel.label.toLowerCase()); + for (const h of extractHeaders(panel.html)) headers.add(h.toLowerCase()); + } + perGroup.push(headers); + } + + // Count how many groups each header appears in + const headerGroupCount = new Map(); + for (const hs of perGroup) { + for (const h of hs) headerGroupCount.set(h, (headerGroupCount.get(h) ?? 0) + 1); + } + + // A header is cross-group generic if it appears in 2+ groups and doesn't + // include any panel label (i.e. 
lacks variant context) + const crossGenericSet = new Set(); + for (const [header, count] of headerGroupCount) { + if (count >= 2 && ![...allLabels].some((l) => header.includes(l))) { + crossGenericSet.add(header); + crossGroupRepeatedHeaders.push({ url: page.url, header, groupCount: count }); + } + } + + // Count groups affected by cross-group generic headers + for (const hs of perGroup) { + if (hs.size === 0) continue; + crossGroupTotalGroupCount++; + if ([...hs].some((h) => crossGenericSet.has(h))) crossGroupGenericGroupCount++; + } + + // Update individual analyses with cross-group flag + if (crossGenericSet.size > 0) { + for (let i = 0; i < analyses.length; i++) { + if (analyses[i].url !== page.url) continue; + if ([...analysisHeaderSets[i]].some((h) => crossGenericSet.has(h))) { + analyses[i].hasCrossGroupGeneric = true; + } + } + } + } + + if (analyses.length === 0 && crossGroupTotalGroupCount === 0) { + return { + id, + category, + status: 'pass', + message: 'Tab groups have fewer than 2 panels; header quality check not applicable', + }; + } + + const groupsWithGenericMajority = analyses.filter((a) => a.hasGenericMajority).length; + const groupsWithHeaders = analyses.filter((a) => a.totalHeaders > 0).length; + + // If no tab panels contain any section headers, we can't evaluate quality + if (groupsWithHeaders === 0 && crossGroupTotalGroupCount === 0) { + return { + id, + category, + status: 'skip', + message: `${pagesWithGroups.length} page(s) with tabs found, but no section headers inside tab panels to evaluate`, + }; + } + + // Identify affected pages: pages where any group has within-group or cross-group issues + const pagesWithWithinGroupIssues = new Set( + analyses.filter((a) => a.hasGenericMajority).map((a) => a.url), + ); + const pagesWithCrossGroupIssues = new Set(crossGroupRepeatedHeaders.map((h) => h.url)); + const affectedPages = new Set([...pagesWithWithinGroupIssues, ...pagesWithCrossGroupIssues]); + + // Count pages where we actually found 
headers to evaluate + const pagesWithHeaders = new Set(analyses.filter((a) => a.totalHeaders > 0).map((a) => a.url)); + + // Scoring: use group-level ratios for fine-grained thresholds + // Within-group: ratio of groups-with-headers that have majority-generic + let withinStatus: CheckStatus = 'pass'; + if (groupsWithHeaders > 0) { + const wRatio = groupsWithGenericMajority / groupsWithHeaders; + if (wRatio > 0.5) withinStatus = 'fail'; + else if (wRatio > 0.25) withinStatus = 'warn'; + } + + // Cross-group: ratio of groups on multi-group pages that have cross-group generics + let crossGroupStatus: CheckStatus = 'pass'; + if (crossGroupTotalGroupCount > 0) { + const cRatio = crossGroupGenericGroupCount / crossGroupTotalGroupCount; + if (cRatio > 0.5) crossGroupStatus = 'fail'; + else if (cRatio > 0.25) crossGroupStatus = 'warn'; + } + + // Combined status: worst of both + const statusRank: Record = { pass: 0, skip: 0, warn: 1, fail: 2, error: 2 }; + const status: CheckStatus = + statusRank[crossGroupStatus] > statusRank[withinStatus] ? crossGroupStatus : withinStatus; + + // Build a page-oriented message for docs teams + let message: string; + if (affectedPages.size === 0) { + message = `${pagesWithHeaders.size} page(s) with tab headers checked; headers include variant context`; + } else { + // Find the most-repeated cross-group header for a concrete example + const worstHeader = + crossGroupRepeatedHeaders.length > 0 + ? [...crossGroupRepeatedHeaders].sort((a, b) => b.groupCount - a.groupCount)[0] + : null; + + const pageSummary = + `${affectedPages.size} of ${pagesWithHeaders.size} page(s) with tab headers ` + + `don't distinguish between variants`; + + if (worstHeader) { + message = `${pageSummary} (e.g. 
"${worstHeader.header}" repeats across ${worstHeader.groupCount} tab groups)`; + } else { + message = pageSummary; + } + } -async function check(_ctx: CheckContext): Promise { return { - id: 'section-header-quality', - category: 'content-structure', - status: 'skip', - message: 'Not yet implemented', + id, + category, + status, + message, + details: { + pagesWithTabs: pagesWithGroups.length, + pagesAffected: affectedPages.size, + totalGroupsAnalyzed: analyses.length, + groupsWithHeaders, + groupsWithGenericMajority, + crossGroupGenericGroupCount, + crossGroupTotalGroupCount, + crossGroupRepeatedHeaders, + analyses, + }, }; } @@ -14,6 +293,8 @@ registerCheck({ id: 'section-header-quality', category: 'content-structure', description: 'Whether headers in tabbed sections include variant context', - dependsOn: ['tabbed-content-serialization'], + // No hard dependency: we read from previousResults if available, + // but the check handles missing data gracefully (returns skip). + dependsOn: [], run: check, }); diff --git a/src/checks/content-structure/tabbed-content-serialization.ts b/src/checks/content-structure/tabbed-content-serialization.ts index 6e91f67..3126ef0 100644 --- a/src/checks/content-structure/tabbed-content-serialization.ts +++ b/src/checks/content-structure/tabbed-content-serialization.ts @@ -1,12 +1,229 @@ import { registerCheck } from '../registry.js'; -import type { CheckContext, CheckResult } from '../../types.js'; +import { discoverAndSamplePages } from '../../helpers/get-page-urls.js'; +import { htmlToMarkdown } from '../../helpers/html-to-markdown.js'; +import { fetchPage } from '../../helpers/fetch-page.js'; +import { detectTabGroups } from '../../helpers/detect-tabs.js'; +import { toMdUrls } from '../../helpers/to-md-urls.js'; +import type { CheckContext, CheckResult, CheckStatus } from '../../types.js'; +import type { DetectedTabGroup } from '../../helpers/detect-tabs.js'; + +interface TabbedPageResult { + url: string; + tabGroups: 
DetectedTabGroup[]; + totalTabbedChars: number; + status: CheckStatus; + source?: 'html' | 'md-fallback' | 'markdown'; + error?: string; +} + +function sizeStatus(chars: number): CheckStatus { + if (chars <= 50_000) return 'pass'; + if (chars <= 100_000) return 'warn'; + return 'fail'; +} + +function worstStatus(statuses: CheckStatus[]): CheckStatus { + if (statuses.includes('fail')) return 'fail'; + if (statuses.includes('warn')) return 'warn'; + return 'pass'; +} + +function formatSize(chars: number): string { + if (chars >= 1000) return `${Math.round(chars / 1000)}K`; + return String(chars); +} + +/** + * Try to fetch a .md fallback URL for a page. Returns the body if successful, null otherwise. + */ +async function tryMdFallback(ctx: CheckContext, pageUrl: string): Promise { + const candidates = toMdUrls(pageUrl); + for (const mdUrl of candidates) { + try { + const response = await ctx.http.fetch(mdUrl); + if (!response.ok) continue; + const contentType = response.headers.get('content-type') ?? 
''; + if (!contentType.includes('text/markdown') && !contentType.includes('text/plain')) continue; + const body = await response.text(); + // Sanity check: must have some content and not be HTML + if (body.length > 0 && !body.trimStart().startsWith('; + const match = pageResults.find((r) => r.url === url); + return match?.status === 'fail'; +} + +async function analyzePage(ctx: CheckContext, url: string): Promise { + const page = await fetchPage(ctx, url); + + // For markdown responses, run MDX detection directly + if (!page.isHtml) { + const tabGroups = detectTabGroups(page.body); + if (tabGroups.length === 0) { + return { url, tabGroups: [], totalTabbedChars: 0, status: 'pass', source: 'markdown' }; + } + // For markdown content, the serialized size is the raw content of the tab groups + let totalTabbedChars = 0; + for (const group of tabGroups) { + totalTabbedChars += group.htmlSlice.length; + } + return { + url, + tabGroups, + totalTabbedChars, + status: sizeStatus(totalTabbedChars), + source: 'markdown', + }; + } + + // HTML response: try HTML-based detection first + const tabGroups = detectTabGroups(page.body); + if (tabGroups.length > 0) { + let totalTabbedChars = 0; + for (const group of tabGroups) { + const md = htmlToMarkdown(group.htmlSlice); + totalTabbedChars += md.length; + } + return { + url, + tabGroups, + totalTabbedChars, + status: sizeStatus(totalTabbedChars), + source: 'html', + }; + } + + // No tabs found in HTML. If rendering-strategy flagged this as an SPA shell, + // try the markdown path as a fallback so we can still analyze tab content + // for agents that support content negotiation. 
+ if (isSpaShell(ctx, url)) { + const mdBody = await tryMdFallback(ctx, url); + if (mdBody) { + const mdTabGroups = detectTabGroups(mdBody); + if (mdTabGroups.length > 0) { + let totalTabbedChars = 0; + for (const group of mdTabGroups) { + totalTabbedChars += group.htmlSlice.length; + } + return { + url, + tabGroups: mdTabGroups, + totalTabbedChars, + status: sizeStatus(totalTabbedChars), + source: 'md-fallback', + }; + } + } + } + + return { url, tabGroups: [], totalTabbedChars: 0, status: 'pass', source: 'html' }; +} + +async function check(ctx: CheckContext): Promise { + const id = 'tabbed-content-serialization'; + const category = 'content-structure'; + + const { + urls: pageUrls, + totalPages, + sampled: wasSampled, + warnings, + } = await discoverAndSamplePages(ctx); + + const results: TabbedPageResult[] = []; + const concurrency = ctx.options.maxConcurrency; + + for (let i = 0; i < pageUrls.length; i += concurrency) { + const batch = pageUrls.slice(i, i + concurrency); + const batchResults = await Promise.all( + batch.map(async (url): Promise => { + try { + return await analyzePage(ctx, url); + } catch (err) { + return { + url, + tabGroups: [], + totalTabbedChars: 0, + status: 'fail', + error: err instanceof Error ? err.message : String(err), + }; + } + }), + ); + results.push(...batchResults); + } + + const successful = results.filter((r) => !r.error); + const fetchErrors = results.filter((r) => r.error).length; + + if (successful.length === 0) { + const suffix = fetchErrors > 0 ? 
`; ${fetchErrors} failed to fetch` : ''; + return { + id, + category, + status: 'fail', + message: `Could not fetch any pages to analyze${suffix}`, + details: { + totalPages, + testedPages: results.length, + sampled: wasSampled, + fetchErrors, + tabbedPages: results, + discoveryWarnings: warnings, + }, + }; + } + + const pagesWithTabs = successful.filter((r) => r.tabGroups.length > 0); + const totalGroupsFound = successful.reduce((sum, r) => sum + r.tabGroups.length, 0); + const overallStatus = worstStatus(successful.map((r) => r.status)); + const pageLabel = wasSampled ? 'sampled pages' : 'pages'; + + let message: string; + if (totalGroupsFound === 0) { + message = `No tabbed content detected across ${successful.length} ${pageLabel}`; + } else if (overallStatus === 'pass') { + message = `${totalGroupsFound} tab group(s) across ${pagesWithTabs.length} of ${successful.length} ${pageLabel}; all serialize under 50K chars`; + } else if (overallStatus === 'warn') { + const worst = Math.max(...successful.map((r) => r.totalTabbedChars)); + message = `${totalGroupsFound} tab group(s) found; worst page serializes to ${formatSize(worst)} chars (50K–100K)`; + } else { + const worst = Math.max(...successful.map((r) => r.totalTabbedChars)); + message = `${totalGroupsFound} tab group(s) found; worst page serializes to ${formatSize(worst)} chars (over 100K)`; + } + + if (fetchErrors > 0) { + message += `; ${fetchErrors} failed to fetch`; + } -async function check(_ctx: CheckContext): Promise { return { - id: 'tabbed-content-serialization', - category: 'content-structure', - status: 'skip', - message: 'Not yet implemented', + id, + category, + status: overallStatus, + message, + details: { + totalPages, + testedPages: results.length, + sampled: wasSampled, + pagesWithTabs: pagesWithTabs.length, + totalGroupsFound, + fetchErrors, + tabbedPages: results, + discoveryWarnings: warnings, + }, }; } diff --git a/src/checks/index.ts b/src/checks/index.ts index c8e0221..ecbfe0a 100644 --- 
a/src/checks/index.ts +++ b/src/checks/index.ts @@ -12,6 +12,7 @@ import './markdown-availability/markdown-url-support.js'; import './markdown-availability/content-negotiation.js'; // Category 3: Page Size +import './page-size/rendering-strategy.js'; import './page-size/page-size-markdown.js'; import './page-size/page-size-html.js'; import './page-size/content-start-position.js'; diff --git a/src/checks/page-size/content-start-position.ts b/src/checks/page-size/content-start-position.ts index 64c55a2..f3dc584 100644 --- a/src/checks/page-size/content-start-position.ts +++ b/src/checks/page-size/content-start-position.ts @@ -1,7 +1,7 @@ import { registerCheck } from '../registry.js'; -import { looksLikeHtml } from '../../helpers/detect-markdown.js'; import { discoverAndSamplePages } from '../../helpers/get-page-urls.js'; import { htmlToMarkdown } from '../../helpers/html-to-markdown.js'; +import { fetchPage } from '../../helpers/fetch-page.js'; import type { CheckContext, CheckResult, CheckStatus } from '../../types.js'; interface PagePositionResult { @@ -184,14 +184,8 @@ async function check(ctx: CheckContext): Promise { const batchResults = await Promise.all( batch.map(async (url): Promise => { try { - const response = await ctx.http.fetch(url); - const body = await response.text(); - const contentType = response.headers.get('content-type') ?? ''; - const isMarkdownType = - contentType.includes('text/markdown') || contentType.includes('text/plain'); - const isHtml = - !isMarkdownType && (contentType.includes('text/html') || looksLikeHtml(body)); - const markdown = isHtml ? htmlToMarkdown(body) : body; + const page = await fetchPage(ctx, url); + const markdown = page.isHtml ? 
htmlToMarkdown(page.body) : page.body; const totalChars = markdown.length; const contentStartChar = findContentStart(markdown); const contentStartPercent = diff --git a/src/checks/page-size/page-size-html.ts b/src/checks/page-size/page-size-html.ts index dff37bf..65ea027 100644 --- a/src/checks/page-size/page-size-html.ts +++ b/src/checks/page-size/page-size-html.ts @@ -1,7 +1,7 @@ import { registerCheck } from '../registry.js'; -import { looksLikeHtml } from '../../helpers/detect-markdown.js'; import { discoverAndSamplePages } from '../../helpers/get-page-urls.js'; import { htmlToMarkdown } from '../../helpers/html-to-markdown.js'; +import { fetchPage } from '../../helpers/fetch-page.js'; import type { CheckContext, CheckResult, CheckStatus } from '../../types.js'; interface PageSizeResult { @@ -50,18 +50,12 @@ async function check(ctx: CheckContext): Promise { const batchResults = await Promise.all( batch.map(async (url): Promise => { try { - const response = await ctx.http.fetch(url); - const body = await response.text(); - const contentType = response.headers.get('content-type') ?? ''; - const isMarkdownType = - contentType.includes('text/markdown') || contentType.includes('text/plain'); - const isHtml = - !isMarkdownType && (contentType.includes('text/html') || looksLikeHtml(body)); + const page = await fetchPage(ctx, url); // Skip conversion if the response is already markdown - const html = isHtml ? body : ''; + const html = page.isHtml ? page.body : ''; const htmlChars = html.length; - const converted = isHtml ? htmlToMarkdown(body) : body; + const converted = page.isHtml ? htmlToMarkdown(page.body) : page.body; const convertedChars = converted.length; const ratio = htmlChars > 0 ? 
Math.round((1 - convertedChars / htmlChars) * 100) : 0; diff --git a/src/checks/page-size/rendering-strategy.ts b/src/checks/page-size/rendering-strategy.ts new file mode 100644 index 0000000..be5624a --- /dev/null +++ b/src/checks/page-size/rendering-strategy.ts @@ -0,0 +1,182 @@ +import { registerCheck } from '../registry.js'; +import { discoverAndSamplePages } from '../../helpers/get-page-urls.js'; +import { fetchPage } from '../../helpers/fetch-page.js'; +import { analyzeRendering, type RenderingAnalysis } from '../../helpers/detect-rendering.js'; +import type { CheckContext, CheckResult, CheckStatus } from '../../types.js'; + +interface PageRenderingResult { + url: string; + status: CheckStatus; + analysis: RenderingAnalysis; + error?: string; +} + +function pageStatus(analysis: RenderingAnalysis): CheckStatus { + if (!analysis.hasSpaMarkers) return 'pass'; + if (analysis.hasContent) return 'pass'; + + // SPA markers present but sparse content — borderline + if ( + analysis.contentHeadings >= 1 || + analysis.contentParagraphs >= 2 || + analysis.codeBlocks >= 1 + ) { + return 'warn'; + } + + return 'fail'; +} + +function worstStatus(statuses: CheckStatus[]): CheckStatus { + if (statuses.includes('fail')) return 'fail'; + if (statuses.includes('warn')) return 'warn'; + return 'pass'; +} + +async function check(ctx: CheckContext): Promise { + const id = 'rendering-strategy'; + const category = 'page-size'; + + const { + urls: pageUrls, + totalPages, + sampled: wasSampled, + warnings, + } = await discoverAndSamplePages(ctx); + + const results: PageRenderingResult[] = []; + const concurrency = ctx.options.maxConcurrency; + + for (let i = 0; i < pageUrls.length; i += concurrency) { + const batch = pageUrls.slice(i, i + concurrency); + const batchResults = await Promise.all( + batch.map(async (url): Promise => { + try { + const page = await fetchPage(ctx, url); + + // Only analyze HTML responses — markdown responses are inherently accessible + if (!page.isHtml) { + 
return { + url, + status: 'pass', + analysis: { + hasContent: true, + hasSpaMarkers: false, + spaMarker: null, + contentHeadings: 0, + contentParagraphs: 0, + codeBlocks: 0, + hasMainContent: false, + visibleTextLength: page.body.length, + htmlLength: 0, + }, + }; + } + + const analysis = analyzeRendering(page.body); + return { url, status: pageStatus(analysis), analysis }; + } catch (err) { + return { + url, + status: 'fail', + analysis: { + hasContent: false, + hasSpaMarkers: false, + spaMarker: null, + contentHeadings: 0, + contentParagraphs: 0, + codeBlocks: 0, + hasMainContent: false, + visibleTextLength: 0, + htmlLength: 0, + }, + error: err instanceof Error ? err.message : String(err), + }; + } + }), + ); + results.push(...batchResults); + } + + const successful = results.filter((r) => !r.error); + const fetchErrors = results.filter((r) => r.error).length; + + if (successful.length === 0) { + const suffix = fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : ''; + return { + id, + category, + status: 'fail', + message: `Could not fetch any pages to analyze${suffix}`, + details: { + totalPages, + testedPages: results.length, + sampled: wasSampled, + fetchErrors, + pageResults: results, + discoveryWarnings: warnings, + }, + }; + } + + const spaShells = successful.filter((r) => r.status === 'fail'); + const sparse = successful.filter((r) => r.status === 'warn'); + const ok = successful.filter((r) => r.status === 'pass'); + const overallStatus = worstStatus(successful.map((r) => r.status)); + const pageLabel = wasSampled ? 'sampled pages' : 'pages'; + + // Identify the framework from the first failing page for the message + const firstShell = spaShells[0]; + const frameworkHint = firstShell?.analysis.spaMarker + ? 
` (${firstShell.analysis.spaMarker.replace('id="', '').replace('"', '')} detected)` + : ''; + + let message: string; + if (overallStatus === 'pass') { + message = `All ${successful.length} ${pageLabel} contain server-rendered content`; + } else if (spaShells.length > 0) { + message = + `${spaShells.length} of ${successful.length} ${pageLabel} appear to be ` + + `client-side rendered SPA shells${frameworkHint}; ` + + `agents using HTTP fetches will see no content`; + if (sparse.length > 0) { + message += `; ${sparse.length} more have page structure but little substantive content`; + } + } else { + message = + `${sparse.length} of ${successful.length} ${pageLabel} have server-rendered ` + + `page structure but little substantive content; agents will see headings ` + + `and navigation but not the page's actual documentation`; + } + + if (fetchErrors > 0) { + message += `; ${fetchErrors} failed to fetch`; + } + + return { + id, + category, + status: overallStatus, + message, + details: { + totalPages, + testedPages: results.length, + sampled: wasSampled, + serverRendered: ok.length, + sparseContent: sparse.length, + spaShells: spaShells.length, + fetchErrors, + pageResults: results, + discoveryWarnings: warnings, + }, + }; +} + +registerCheck({ + id: 'rendering-strategy', + category: 'page-size', + description: + 'Whether pages contain server-rendered content or are client-side rendered SPA shells', + dependsOn: [], + run: check, +}); diff --git a/src/helpers/detect-rendering.ts b/src/helpers/detect-rendering.ts new file mode 100644 index 0000000..3710086 --- /dev/null +++ b/src/helpers/detect-rendering.ts @@ -0,0 +1,115 @@ +import { parse } from 'node-html-parser'; + +const SPA_MARKERS = ['id="___gatsby"', 'id="__next"', 'id="__nuxt"', 'id="root"']; + +export interface RenderingAnalysis { + /** Whether the page appears to be server-rendered with real content. */ + hasContent: boolean; + /** Whether known SPA framework markers were found. 
*/ + hasSpaMarkers: boolean; + /** Which SPA marker was found, if any. */ + spaMarker: string | null; + /** Number of content headings found (excluding nav-only headings). */ + contentHeadings: number; + /** Number of paragraphs with substantial prose (>30 chars). */ + contentParagraphs: number; + /** Number of code blocks found. */ + codeBlocks: number; + /** Whether a <main>
or [role="main"] element with children exists. */ + hasMainContent: boolean; + /** Visible text length after stripping script/style/noscript. */ + visibleTextLength: number; + /** Total HTML length. */ + htmlLength: number; +} + +/** + * Analyze whether an HTML page contains server-rendered content or is + * a client-side-rendered SPA shell. + * + * Unlike a simple text-ratio heuristic, this checks for concrete content + * signals: headings, paragraphs with prose, code blocks, and main content + * regions. SSR sites with heavy bundled assets (low text ratio but real + * content) will pass; true SPA shells (framework marker + no content) will fail. + */ +export function analyzeRendering(html: string): RenderingAnalysis { + const htmlLength = html.length; + + // Check for SPA framework markers + let spaMarker: string | null = null; + for (const marker of SPA_MARKERS) { + if (html.includes(marker)) { + spaMarker = marker; + break; + } + } + const hasSpaMarkers = spaMarker !== null; + + // Parse and strip non-content elements + const root = parse(html); + const body = root.querySelector('body') ?? 
root; + + for (const el of body.querySelectorAll('script, style, noscript, svg')) { + el.remove(); + } + + // Visible text + const visibleText = body.textContent.replace(/\s+/g, ' ').trim(); + const visibleTextLength = visibleText.length; + + // Content signals: headings with substantive text + const headings = body.querySelectorAll('h1, h2, h3, h4, h5, h6'); + let contentHeadings = 0; + for (const h of headings) { + const text = h.textContent.trim(); + // Skip very short headings that are likely nav labels + if (text.length > 3) contentHeadings++; + } + + // Content signals: paragraphs with prose + const paragraphs = body.querySelectorAll('p'); + let contentParagraphs = 0; + for (const p of paragraphs) { + const text = p.textContent.trim(); + if (text.length > 30) contentParagraphs++; + } + + // Content signals: code blocks + const codeBlocks = body.querySelectorAll('pre, code').length; + + // Content signals: main content region with substantive content inside it. + // An SPA shell can have a
<main> element with just a page title and breadcrumbs, + // so we check for real content (paragraphs, code) inside <main>
specifically. + const main = body.querySelector('main, [role="main"]'); + let hasMainContent = false; + if (main) { + const mainParas = main.querySelectorAll('p'); + let mainParagraphs = 0; + for (const p of mainParas) { + if (p.textContent.trim().length > 30) mainParagraphs++; + } + const mainCode = main.querySelectorAll('pre, code').length; + hasMainContent = mainParagraphs >= 2 || mainCode >= 1; + } + + // Determine if the page has real content + // A page has content if it has enough content signals, regardless of text ratio + const hasContent = + contentHeadings >= 3 || + contentParagraphs >= 5 || + (hasMainContent && contentHeadings >= 1) || + codeBlocks >= 3 || + !hasSpaMarkers; // No SPA markers = traditional server-rendered, assume content + + return { + hasContent, + hasSpaMarkers, + spaMarker, + contentHeadings, + contentParagraphs, + codeBlocks, + hasMainContent, + visibleTextLength, + htmlLength, + }; +} diff --git a/src/helpers/detect-tabs.ts b/src/helpers/detect-tabs.ts new file mode 100644 index 0000000..8af0e55 --- /dev/null +++ b/src/helpers/detect-tabs.ts @@ -0,0 +1,354 @@ +import { parse, type HTMLElement } from 'node-html-parser'; + +export interface TabPanel { + label: string | null; + html: string; +} + +export interface DetectedTabGroup { + framework: string; + tabCount: number; + htmlSlice: string; + panels: TabPanel[]; +} + +type Detector = ( + root: HTMLElement, + claimed: Set, + source?: string, +) => DetectedTabGroup[]; + +function isDescendantOf(node: HTMLElement, ancestor: HTMLElement): boolean { + let current = node.parentNode; + while (current) { + if (current === ancestor) return true; + current = current.parentNode; + } + return false; +} + +function isInsideClaimed(node: HTMLElement, claimed: Set): boolean { + for (const container of claimed) { + if (container === node || isDescendantOf(node, container)) return true; + } + return false; +} + +function textOf(el: HTMLElement): string { + // Clone to avoid mutating the original 
DOM, then strip
'; + const mdContent = `# Tutorial\n\n\nAtlas content\nLocal content\n\n`; + + server.use( + http.get( + 'http://tcs-spa.local/docs/page1', + () => + new HttpResponse(spaHtml, { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + http.get( + 'http://tcs-spa.local/docs/page1.md', + () => + new HttpResponse(mdContent, { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }), + ), + ); + + const content = `# Docs\n> Summary\n## Links\n- [Page 1](http://tcs-spa.local/docs/page1): First\n`; + const ctx = makeCtx(content); + // Simulate rendering-strategy having flagged this URL as an SPA shell + ctx.previousResults.set('rendering-strategy', { + id: 'rendering-strategy', + category: 'page-size', + status: 'fail', + message: 'SPA shell detected', + details: { + pageResults: [{ url: 'http://tcs-spa.local/docs/page1', status: 'fail' }], + }, + }); + const result = await check.run(ctx); + const tabbedPages = result.details?.tabbedPages as Array<{ + tabGroups: Array<{ framework: string }>; + source: string; + }>; + expect(tabbedPages[0].tabGroups).toHaveLength(1); + expect(tabbedPages[0].tabGroups[0].framework).toBe('mdx'); + expect(tabbedPages[0].source).toBe('md-fallback'); + }); + + it('warns when tabbed content is between 50K-100K chars', async () => { + // Create tab panels that serialize to ~75K chars in markdown + const panelContent = '

' + 'w'.repeat(37_000) + '

'; + const tabHtml = ` +
+
Alpha
+
Beta
+
${panelContent}
+
${panelContent}
+
+ `; + server.use( + http.get( + 'http://tcs-warn.local/docs/page1', + () => + new HttpResponse(`${tabHtml}`, { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + ); + + const content = `# Docs\n> Summary\n## Links\n- [Page 1](http://tcs-warn.local/docs/page1): First\n`; + const result = await check.run(makeCtx(content)); + expect(result.status).toBe('warn'); + expect(result.message).toContain('50K–100K'); + }); + + it('includes fetch errors in message when tabs are found', async () => { + const tabHtml = ` +
+
Python
+
JS
+
print("hi")
+
console.log("hi")
+
+ `; + server.use( + http.get( + 'http://tcs-partial1.local/docs/page1', + () => + new HttpResponse(`${tabHtml}`, { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + http.get('http://tcs-partial2.local/docs/page2', () => HttpResponse.error()), + ); + + const content = `# Docs\n> Summary\n## Links\n- [Page 1](http://tcs-partial1.local/docs/page1): First\n- [Page 2](http://tcs-partial2.local/docs/page2): Second\n`; + const result = await check.run(makeCtx(content)); + expect(result.message).toContain('1 failed to fetch'); + expect(result.details?.fetchErrors).toBe(1); + // Should still have found the tab groups from the successful page + expect(result.details?.totalGroupsFound).toBeGreaterThan(0); + }); + + it('SPA shell falls through when tryMdFallback returns null (all candidates fail)', async () => { + const spaHtml = + '
'; + + server.use( + http.get( + 'http://tcs-spa-notabs.local/docs/page1', + () => + new HttpResponse(spaHtml, { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + // .md candidate returns 404 so tryMdFallback returns null + http.get( + 'http://tcs-spa-notabs.local/docs/page1.md', + () => new HttpResponse('Not found', { status: 404 }), + ), + http.get( + 'http://tcs-spa-notabs.local/docs/page1/index.md', + () => new HttpResponse('Not found', { status: 404 }), + ), + ); + + const content = `# Docs\n> Summary\n## Links\n- [Page 1](http://tcs-spa-notabs.local/docs/page1): First\n`; + const ctx = makeCtx(content); + ctx.previousResults.set('rendering-strategy', { + id: 'rendering-strategy', + category: 'page-size', + status: 'fail', + message: 'SPA shell detected', + details: { + pageResults: [{ url: 'http://tcs-spa-notabs.local/docs/page1', status: 'fail' }], + }, + }); + const result = await check.run(ctx); + expect(result.details?.totalGroupsFound).toBe(0); + const tabbedPages = result.details?.tabbedPages as Array<{ + tabGroups: Array; + source: string; + }>; + expect(tabbedPages[0].tabGroups).toHaveLength(0); + // tryMdFallback returned null, so falls through to default html source + expect(tabbedPages[0].source).toBe('html'); + }); + + it('does not try .md fallback for non-SPA HTML', async () => { + // Regular server-rendered HTML with no tabs + server.use( + http.get( + 'http://tcs-nospa.local/docs/page1', + () => + new HttpResponse( + '

Hello

' + 'Real content. '.repeat(100) + '

', + { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }, + ), + ), + // This .md URL has tabs, but should NOT be fetched + http.get( + 'http://tcs-nospa.local/docs/page1.md', + () => + new HttpResponse('A', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }), + ), + ); + + const content = `# Docs\n> Summary\n## Links\n- [Page 1](http://tcs-nospa.local/docs/page1): First\n`; + const result = await check.run(makeCtx(content)); + expect(result.message).toContain('No tabbed content'); + const tabbedPages = result.details?.tabbedPages as Array<{ source: string }>; + expect(tabbedPages[0].source).toBe('html'); + }); +}); diff --git a/test/unit/helpers/detect-rendering.test.ts b/test/unit/helpers/detect-rendering.test.ts new file mode 100644 index 0000000..ce09133 --- /dev/null +++ b/test/unit/helpers/detect-rendering.test.ts @@ -0,0 +1,165 @@ +import { describe, it, expect } from 'vitest'; +import { analyzeRendering } from '../../../src/helpers/detect-rendering.js'; + +describe('analyzeRendering', () => { + it('passes for traditional server-rendered HTML with no SPA markers', () => { + const html = + '

Hello World

' + + 'Real content here. '.repeat(20) + + '

'; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(true); + expect(result.hasSpaMarkers).toBe(false); + }); + + it('passes for SSR site with Next.js marker and real content', () => { + // Simulates GitHub docs: __next marker but real headings and paragraphs + const html = + '
' + + '
' + + '

REST API

' + + '

Authentication

' + + '

Rate Limits

' + + '

You can authenticate to the REST API to access more endpoints.

' + + '

Learn how to use the GitHub REST API effectively.

' + + '

Follow these best practices when using the API.

' + + '

Check out our development quickstart guide for details.

' + + '

You can use OAuth tokens or personal access tokens.

' + + '
' + + '
'; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(true); + expect(result.hasSpaMarkers).toBe(true); + expect(result.spaMarker).toBe('id="__next"'); + expect(result.contentHeadings).toBe(3); + }); + + it('fails for Gatsby SPA shell with no content', () => { + const html = + '
' + + ''; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(false); + expect(result.hasSpaMarkers).toBe(true); + expect(result.spaMarker).toBe('id="___gatsby"'); + expect(result.contentHeadings).toBe(0); + expect(result.contentParagraphs).toBe(0); + }); + + it('fails for SPA shell with only nav chrome text', () => { + // Simulates MongoDB: ___gatsby marker, nav links, no real content + const navText = 'Products Platform Atlas Database Search Tools Documentation '; + const html = + '' + + '
' + + '' + + '' + + '
'; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(false); + expect(result.hasSpaMarkers).toBe(true); + expect(result.contentHeadings).toBe(0); + }); + + it('passes for SSR site with heavy assets but main content region', () => { + // Simulates Stripe docs: low text ratio but real content inside
<main> + const html = + '' + + '
' + + '
' + + '

API Reference

' + + '

You can use the Stripe API in test mode, which does not affect your live data.

' + + '

The API supports both synchronous and asynchronous request patterns for flexibility.

' + + '
' + + '' + + '
'; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(true); + expect(result.hasSpaMarkers).toBe(true); + expect(result.hasMainContent).toBe(true); + }); + + it('fails for SPA shell with main element but only breadcrumbs', () => { + // Simulates MongoDB Atlas Search tutorial:
<main> exists but has only title + breadcrumbs + const html = + '' + + '
' + + '
' + + '

MongoDB Search Quick Start

' + + '' + + '
' + + '' + + '
'; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(false); + expect(result.hasSpaMarkers).toBe(true); + expect(result.hasMainContent).toBe(false); + }); + + it('passes for Next.js SSG site with multiple headings', () => { + const html = + '
' + + '

Getting Started

' + + '

Installation

' + + '

Configuration

' + + '

Usage

' + + '

Welcome to our documentation.

' + + '
'; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(true); + expect(result.contentHeadings).toBe(4); + }); + + it('detects __nuxt marker', () => { + const html = '
'; + const result = analyzeRendering(html); + expect(result.hasSpaMarkers).toBe(true); + expect(result.spaMarker).toBe('id="__nuxt"'); + expect(result.hasContent).toBe(false); + }); + + it('counts code blocks as content signals', () => { + const html = + '
' + + '
const x = 1;
' + + '
const y = 2;
' + + '
const z = 3;
' + + '
'; + const result = analyzeRendering(html); + expect(result.hasContent).toBe(true); + expect(result.codeBlocks).toBeGreaterThanOrEqual(3); + }); + + it('does not count very short headings (nav labels)', () => { + const html = + '
' + + '

API

' + // 3 chars, should be excluded + '

FAQ

' + // 3 chars, should be excluded + '
'; + const result = analyzeRendering(html); + expect(result.contentHeadings).toBe(0); + expect(result.hasContent).toBe(false); + }); + + it('handles empty HTML', () => { + const result = analyzeRendering(''); + expect(result.hasContent).toBe(true); // No SPA markers = assume content + expect(result.hasSpaMarkers).toBe(false); + }); +}); diff --git a/test/unit/helpers/detect-tabs.test.ts b/test/unit/helpers/detect-tabs.test.ts new file mode 100644 index 0000000..cedef37 --- /dev/null +++ b/test/unit/helpers/detect-tabs.test.ts @@ -0,0 +1,654 @@ +import { describe, it, expect } from 'vitest'; +import { detectTabGroups } from '../../../src/helpers/detect-tabs.js'; + +describe('detectTabGroups', () => { + it('returns empty array for HTML with no tabs', () => { + const html = '

Hello

No tabs here.

'; + expect(detectTabGroups(html)).toEqual([]); + }); + + it('detects Docusaurus tabs', () => { + const html = ` +
+
    + + +
+
import requests
+
const fetch = require('node-fetch');
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('docusaurus'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels).toHaveLength(2); + expect(groups[0].panels[0].label).toBe('Python'); + expect(groups[0].panels[1].label).toBe('JavaScript'); + }); + + it('detects MkDocs Material tabs', () => { + const html = ` +
+
+ + +
+
+
echo hello
+
Write-Host hello
+
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('mkdocs'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Bash'); + }); + + it('detects Sphinx tabs', () => { + const html = ` +
+
C++
+
Rust
+
std::cout
+
println!
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('sphinx'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('C++'); + }); + + it('detects Microsoft Learn tabs', () => { + const html = ` +
+ C# + Java +
Console.WriteLine
+
System.out.println
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('microsoft-learn'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('C#'); + }); + + it('detects generic ARIA tabs', () => { + const html = ` +
+
+ + +
+

Content A

+

Content B

+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('generic-aria'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Tab A'); + }); + + it('does not double-detect Docusaurus tabs as generic ARIA', () => { + const html = ` +
+
    + + +
+
python code
+
node code
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('docusaurus'); + }); + + it('detects multiple tab groups on same page', () => { + const html = ` +
+
A
+

Panel A

+
+
+
B
+

Panel B

+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(2); + }); + + it('includes htmlSlice as outerHTML of the container', () => { + const html = ` +
content
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].htmlSlice).toContain('tabbed-set'); + expect(groups[0].htmlSlice).toContain('content'); + }); + + it('handles empty HTML gracefully', () => { + expect(detectTabGroups('')).toEqual([]); + }); + + it('skips tab groups with no panels (likely navigation)', () => { + const html = ` +
+ One +
+ `; + // Tabs without panels are typically site navigation, not content + const groups = detectTabGroups(html); + expect(groups).toHaveLength(0); + }); + + it('detects MDX-style / (MongoDB pattern)', () => { + const md = ` +# Guide + + + + + +## Python Setup + +\`\`\`python +pip install pymongo +\`\`\` + + + + + +## Node.js Setup + +\`\`\`bash +npm install mongodb +\`\`\` + + + + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('mdx'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Python'); + expect(groups[0].panels[1].label).toBe('Node.js'); + }); + + it('detects MDX-style / (Docusaurus MDX pattern)', () => { + const md = ` + + + +\`\`\`bash +npm install foo +\`\`\` + + + + +\`\`\`bash +yarn add foo +\`\`\` + + + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('mdx'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('npm'); + expect(groups[0].panels[1].label).toBe('yarn'); + }); + + it('detects multiple MDX tab groups', () => { + const md = ` + +Content A +Content B + + +Some text between. 
+ + +Content X +Content Y +Content Z + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(2); + expect(groups[0].tabCount).toBe(2); + expect(groups[1].tabCount).toBe(3); + }); + + it('falls back to TabItem value attribute when label is absent', () => { + const md = ` + +Go code +Rust code + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(1); + expect(groups[0].panels[0].label).toBe('go'); + expect(groups[0].panels[1].label).toBe('rust'); + }); + + it('detects multiple consecutive MDX groups separated by markdown', () => { + const md = ` +# Getting Started + + + + +Install with Homebrew: + +\`\`\`bash +brew install myapp +\`\`\` + + + + +Install with apt: + +\`\`\`bash +sudo apt install myapp +\`\`\` + + + + +## Configuration + +After installing, configure the app: + + + + +\`\`\`bash +myapp config --os darwin +\`\`\` + + + + +\`\`\`bash +myapp config --os linux +\`\`\` + + + + +## Advanced Usage + +For power users, here are some tips: + + + + +Use launchd to run as a service. + + + + +Use systemd to run as a service. + + + + +Use NSSM to run as a service. + + + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(3); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('macOS'); + expect(groups[1].tabCount).toBe(2); + expect(groups[1].panels[0].label).toBe('macOS'); + expect(groups[2].tabCount).toBe(3); + expect(groups[2].panels[2].label).toBe('Windows'); + }); + + it('finds panels via ancestor walking (grandparent container)', () => { + // LeafyGreen-style: tablist and tabpanels are not direct siblings. + // The tabpanels are inside a separate wrapper div, both under a + // shared grandparent container. + const html = ` +
+
+
+ + +
+
+
+
fmt.Println("hello")
+
println!("hello");
+
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('generic-aria'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Go'); + expect(groups[0].panels[1].label).toBe('Rust'); + }); + + it('finds panels via ancestor walking (great-grandparent container)', () => { + // Even deeper nesting: tablist is 3 levels below the container + // that holds the tabpanels. + const html = ` +
+
+
+
+ + +
+
+
+

Alpha content

+

Beta content

+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Alpha'); + expect(groups[0].panels[1].label).toBe('Beta'); + }); + + it('textOf strips embedded style tags from tab labels', () => { + const html = ` +
+
+ + +
+

Content 1

+

Content 2

+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('generic-aria'); + expect(groups[0].panels[0].label).toBe('Clean Label'); + expect(groups[0].panels[1].label).toBe('Another Label'); + }); + + it('findContainerWithPanels returns null when panels are too deep', () => { + // Tablist nested 5+ levels deep from any ancestor with tabpanels. + // maxDepth is 4, so it should not find any panels. + const html = ` +
+
+
+
+
+
+
+ + +
+
+
+
+
+
+

Panel 1

+

Panel 2

+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(0); + }); + + it('handles unclosed MDX Tabs tag gracefully', () => { + const md = ` +# Guide + + + + + +\`\`\`python +pip install pymongo +\`\`\` + + + + + +\`\`\`bash +npm install mongodb +\`\`\` + + + `; + // No closing , so findTabsBlocks should handle gracefully + const groups = detectTabGroups(md); + expect(groups).toHaveLength(0); + }); + + it('handles more panels than labels (null label fallback)', () => { + // MkDocs with 1 label but 2 panel blocks — second panel gets null label + const html = ` +
+
+
+
first
+
second
+
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('mkdocs'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Only Label'); + expect(groups[0].panels[1].label).toBeNull(); + }); + + it('handles more tabs than panels (empty html fallback)', () => { + // Sphinx with 2 tabs but only 1 panel — second panel gets empty html + const html = ` +
+
Tab A
+
Tab B
+
only panel
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('sphinx'); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Tab A'); + expect(groups[0].panels[0].html).toContain('only panel'); + expect(groups[0].panels[1].label).toBe('Tab B'); + expect(groups[0].panels[1].html).toBe(''); + }); + + it('handles MS Learn with more tabs than panels', () => { + const html = ` +
+ A + B + C +
only A
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].tabCount).toBe(3); + expect(groups[0].panels[0].label).toBe('A'); + expect(groups[0].panels[1].label).toBe('B'); + expect(groups[0].panels[1].html).toBe(''); + expect(groups[0].panels[2].label).toBe('C'); + expect(groups[0].panels[2].html).toBe(''); + }); + + it('generic ARIA handles more tabs than panels', () => { + const html = ` +
+
+ + + +
+

only X

+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('generic-aria'); + expect(groups[0].tabCount).toBe(3); + expect(groups[0].panels[2].label).toBe('Z'); + expect(groups[0].panels[2].html).toBe(''); + }); + + it('MDX skips inside nested (depth tracking)', () => { + // Outer Tabs with inner nested Tabs — findTabsBlocks returns the outer + // block as a single unit, and depth tracking skips the inner elements + const md = ` + + + + +Nested content 1 +Nested content 2 + + + +Outer B content + + `; + const groups = detectTabGroups(md); + // Only the outer group is detected; inner s are skipped by depth check + expect(groups).toHaveLength(1); + expect(groups[0].tabCount).toBe(2); + expect(groups[0].panels[0].label).toBe('Outer A'); + expect(groups[0].panels[1].label).toBe('Outer B'); + }); + + it('MDX Tab without label attribute returns null label', () => { + const md = ` + +Content with no label attribute + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(1); + expect(groups[0].panels[0].label).toBeNull(); + }); + + it('Docusaurus handles more panels than tabs (null label fallback)', () => { + const html = ` +
+
    + +
+
panel 1
+
panel 2
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('docusaurus'); + expect(groups[0].panels[0].label).toBe('Only Tab'); + expect(groups[0].panels[1].label).toBeNull(); + }); + + it('Sphinx detector skips container already claimed by MkDocs', () => { + // A .sphinx-tabs container nested inside a .tabbed-set (MkDocs). + // MkDocs runs first and claims the outer container. Sphinx should + // skip the inner .sphinx-tabs since it's inside the claimed region. + const html = ` +
+
+
+
+
+
Inner
+
inner content
+
+
+
+
+ `; + const groups = detectTabGroups(html); + // MkDocs claims the outer container; Sphinx skips the inner one + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('mkdocs'); + }); + + it('MDX handles unclosed tag (no matching )', () => { + const md = ` + +content with no closing tag + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(1); + expect(groups[0].panels[0].label).toBe('Alpha'); + // Content should include everything after the opening tag + expect(groups[0].panels[0].html).toContain('content with no closing tag'); + }); + + it('MDX skips block with no children', () => { + const md = ` + +Just some text, no Tab elements here. + + `; + const groups = detectTabGroups(md); + expect(groups).toHaveLength(0); + }); + + it('Docusaurus detector uses ancestor walking when panels are not siblings', () => { + // Docusaurus with a wrapper structure where tablist and panels + // share a grandparent rather than a direct parent. + const html = ` +
+
+
    + + + +
+
+
+
npm install foo
+
yarn add foo
+
pnpm add foo
+
+
+ `; + const groups = detectTabGroups(html); + expect(groups).toHaveLength(1); + expect(groups[0].framework).toBe('docusaurus'); + expect(groups[0].tabCount).toBe(3); + expect(groups[0].panels[0].label).toBe('npm'); + expect(groups[0].panels[1].label).toBe('yarn'); + expect(groups[0].panels[2].label).toBe('pnpm'); + }); +}); diff --git a/test/unit/helpers/fetch-page.test.ts b/test/unit/helpers/fetch-page.test.ts new file mode 100644 index 0000000..3ba2c6e --- /dev/null +++ b/test/unit/helpers/fetch-page.test.ts @@ -0,0 +1,102 @@ +import { describe, it, expect, beforeAll } from 'vitest'; +import { http, HttpResponse } from 'msw'; +import { setupServer } from 'msw/node'; +import { createContext } from '../../../src/runner.js'; +import { fetchPage } from '../../../src/helpers/fetch-page.js'; + +const server = setupServer(); + +beforeAll(() => { + server.listen({ onUnhandledRequest: 'bypass' }); + return () => server.close(); +}); + +describe('fetchPage', () => { + it('returns body and detects HTML content', async () => { + server.use( + http.get( + 'http://fp-basic.local/page', + () => + new HttpResponse('

Hello

', { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + ); + + const ctx = createContext('http://fp-basic.local', { requestDelay: 0 }); + const page = await fetchPage(ctx, 'http://fp-basic.local/page'); + + expect(page.isHtml).toBe(true); + expect(page.body).toContain('

Hello

'); + expect(page.contentType).toContain('text/html'); + }); + + it('detects markdown content as non-HTML', async () => { + server.use( + http.get( + 'http://fp-md.local/page', + () => + new HttpResponse('# Hello\n\nMarkdown content.', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }), + ), + ); + + const ctx = createContext('http://fp-md.local', { requestDelay: 0 }); + const page = await fetchPage(ctx, 'http://fp-md.local/page'); + + expect(page.isHtml).toBe(false); + expect(page.body).toContain('# Hello'); + }); + + it('returns cached result on second call without re-fetching', async () => { + let fetchCount = 0; + server.use( + http.get('http://fp-cache.local/page', () => { + fetchCount++; + return new HttpResponse('

Content

', { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }); + }), + ); + + const ctx = createContext('http://fp-cache.local', { requestDelay: 0 }); + const first = await fetchPage(ctx, 'http://fp-cache.local/page'); + const second = await fetchPage(ctx, 'http://fp-cache.local/page'); + + expect(fetchCount).toBe(1); + expect(first).toBe(second); + }); + + it('caches different URLs independently', async () => { + server.use( + http.get( + 'http://fp-multi.local/page1', + () => + new HttpResponse('Page 1', { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + http.get( + 'http://fp-multi.local/page2', + () => + new HttpResponse('# Page 2', { + status: 200, + headers: { 'Content-Type': 'text/markdown' }, + }), + ), + ); + + const ctx = createContext('http://fp-multi.local', { requestDelay: 0 }); + const page1 = await fetchPage(ctx, 'http://fp-multi.local/page1'); + const page2 = await fetchPage(ctx, 'http://fp-multi.local/page2'); + + expect(page1.isHtml).toBe(true); + expect(page2.isHtml).toBe(false); + expect(ctx.htmlCache.size).toBe(2); + }); +}); diff --git a/test/unit/runner.test.ts b/test/unit/runner.test.ts index cb82bff..f37556f 100644 --- a/test/unit/runner.test.ts +++ b/test/unit/runner.test.ts @@ -69,7 +69,7 @@ describe('runner', () => { it('stub checks return skip with "Not yet implemented"', async () => { const report = await runChecks('http://stub.local', { - checkIds: ['tabbed-content-serialization'], + checkIds: ['auth-alternative-access'], requestDelay: 0, }); From 70c45f3a578d7f61f114f00c26786627d7513d80 Mon Sep 17 00:00:00 2001 From: dacharyc Date: Sun, 15 Mar 2026 13:51:11 -0400 Subject: [PATCH 2/7] Add 'markdown-content-parity' check --- README.md | 12 +- .../observability/markdown-content-parity.ts | 672 ++++++++++++++++- src/helpers/fetch-page.ts | 2 +- src/types.ts | 1 + .../checks/markdown-content-parity.test.ts | 698 ++++++++++++++++++ 5 files changed, 1372 insertions(+), 13 deletions(-) create mode 100644 
test/unit/checks/markdown-content-parity.test.ts diff --git a/README.md b/README.md index 33297f2..321fc5a 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Test your documentation site against the [Agent-Friendly Documentation Spec](htt Agents don't use docs like humans. They hit truncation limits, get walls of CSS instead of content, can't follow cross-host redirects, and don't know about quality-of-life improvements like `llms.txt` or `.md` docs pages that would make life swell. Maybe this is because the industry has lacked guidance - until now. -afdocs runs 22 checks across 8 categories to evaluate how well your docs serve agent consumers. 19 are fully implemented; the rest return `skip` until completed. +afdocs runs 22 checks across 8 categories to evaluate how well your docs serve agent consumers. 20 are fully implemented; the rest return `skip` until completed. > **Status: Early development (0.x)** > This project is under active development. Check IDs, CLI flags, and output formats may change between minor versions. Feel free to try it out, but don't build automation against specific output until 1.0. 
@@ -195,11 +195,11 @@ describe('agent-friendliness', () => { ### Category 7: Observability and Content Health -| Check | Description | -| ---------------------------- | ---------------------------------------------- | -| `llms-txt-freshness` \* | Whether `llms.txt` reflects current site state | -| `markdown-content-parity` \* | Whether markdown and HTML versions match | -| `cache-header-hygiene` | Whether cache headers allow timely updates | +| Check | Description | +| ------------------------- | ---------------------------------------------- | +| `llms-txt-freshness` \* | Whether `llms.txt` reflects current site state | +| `markdown-content-parity` | Whether markdown and HTML versions match | +| `cache-header-hygiene` | Whether cache headers allow timely updates | ### Category 8: Authentication and Access diff --git a/src/checks/observability/markdown-content-parity.ts b/src/checks/observability/markdown-content-parity.ts index 354a43c..414d64f 100644 --- a/src/checks/observability/markdown-content-parity.ts +++ b/src/checks/observability/markdown-content-parity.ts @@ -1,12 +1,672 @@ +import { parse } from 'node-html-parser'; import { registerCheck } from '../registry.js'; -import type { CheckContext, CheckResult } from '../../types.js'; +import { fetchPage } from '../../helpers/fetch-page.js'; +import type { CheckContext, CheckResult, CheckStatus } from '../../types.js'; + +/** Thresholds for the percentage of HTML segments not found in markdown. */ +const WARN_THRESHOLD = 5; +const FAIL_THRESHOLD = 20; + +/** Minimum character length for a text segment to be considered meaningful. */ +const MIN_SEGMENT_LENGTH = 20; + +/** + * Minimum number of unique HTML segments required for a meaningful comparison. + * Pages below this threshold auto-pass because the percentage is too volatile + * (e.g., 3 breadcrumb items on a 10-segment page = 30% "missing"). + */ +const MIN_SEGMENTS_FOR_COMPARISON = 10; + +/** HTML tags to strip before extracting text (non-content chrome). 
*/ +const STRIP_TAGS = [ + 'script', + 'style', + 'nav', + 'footer', + 'header', + 'noscript', + 'button', + 'svg', + 'aside', +]; + +/** CSS selectors for common doc-site chrome that lives inside
. */ +const STRIP_SELECTORS = [ + '[aria-label="breadcrumb"]', + '[aria-label="pagination"]', + '[class*="breadcrumb"]', + '[class*="pagination"]', + '[class*="prev-next"]', + '[class*="prevnext"]', + '[class*="page-nav"]', + '[class*="feedback"]', + '[class*="helpful"]', + '[class*="table-of-contents"]', + '[class*="toc"]', + '[rel="prev"]', + '[rel="next"]', + '.sr-only', +]; + +/** + * Segment-level patterns for common non-content text that survives DOM stripping. + * Matched against normalized (lowercased, whitespace-collapsed) segments. + */ +const NOISE_PATTERNS = [ + /^last updated/, + /^was this page helpful/, + /^thank you for your feedback/, + /^previous\s+\S.*next\s+\S/, // "Previous X Next Y" pagination + /^start from the beginning$/, + /^join our .* server/, // "Join our Discord Server..." + /^loading video content/, + /^\/.+\/.+/, // breadcrumb paths like "/Connect to Neon/..." +]; + +interface PageParityResult { + url: string; + markdownSource: string; + status: CheckStatus; + /** Percentage of HTML text segments not found in the markdown version. */ + missingPercent: number; + /** Total meaningful text segments extracted from HTML. */ + totalSegments: number; + /** Number of HTML segments not found in the markdown. */ + missingSegments: number; + /** Sample of missing segments for diagnostics. */ + sampleDiffs: string[]; + error?: string; +} + +/** + * Known HTML tag names used to distinguish real tags from angle-bracket + * placeholders like or in code examples. + * Only needs to cover tags that appear in node-html-parser's .text output + * (i.e., tags inside
 <pre> that survive as raw text).
+ */
+const HTML_TAG_NAMES = new Set([ // entries are all lowercase; NOTE(review): lookups presumably need a lowercased tag name — confirm at the call site
+  'a',
+  'abbr',
+  'address',
+  'article',
+  'aside',
+  'audio',
+  'b',
+  'bdi',
+  'bdo',
+  'blockquote',
+  'body',
+  'br',
+  'button',
+  'canvas',
+  'caption',
+  'cite',
+  'code',
+  'col',
+  'colgroup',
+  'data',
+  'dd',
+  'del',
+  'details',
+  'dfn',
+  'dialog',
+  'div',
+  'dl',
+  'dt',
+  'em',
+  'embed',
+  'fieldset',
+  'figcaption',
+  'figure',
+  'footer',
+  'form',
+  'h1',
+  'h2',
+  'h3',
+  'h4',
+  'h5',
+  'h6',
+  'head',
+  'header',
+  'hr',
+  'html',
+  'i',
+  'iframe',
+  'img',
+  'input',
+  'ins',
+  'kbd',
+  'label',
+  'legend',
+  'li',
+  'link',
+  'main',
+  'map',
+  'mark',
+  'meta',
+  'meter',
+  'nav',
+  'noscript',
+  'object',
+  'ol',
+  'optgroup',
+  'option',
+  'output',
+  'p',
+  'param',
+  'picture',
+  'pre',
+  'progress',
+  'q',
+  'rp',
+  'rt',
+  'ruby',
+  's',
+  'samp',
+  'script',
+  'section',
+  'select',
+  'slot',
+  'small',
+  'source',
+  'span',
+  'strong',
+  'style',
+  'sub',
+  'summary',
+  'sup',
+  'table',
+  'tbody',
+  'td',
+  'template',
+  'textarea',
+  'tfoot',
+  'th',
+  'thead',
+  'time',
+  'title',
+  'tr',
+  'track',
+  'u',
+  'ul',
+  'var',
+  'video',
+  'wbr',
+]);
+
+/** Lowercase tag names that should produce line breaks in extracted text: block-level containers plus the `br`/`hr` separators. */
+const BLOCK_TAGS = new Set([
+  'p',
+  'div',
+  'h1',
+  'h2',
+  'h3',
+  'h4',
+  'h5',
+  'h6',
+  'li',
+  'tr',
+  'td',
+  'th',
+  'blockquote',
+  'pre',
+  'dt',
+  'dd',
+  'figcaption',
+  'section',
+  'article',
+  'details',
+  'summary',
+  'br', // not a container: an explicit line break
+  'hr', // not a container: a thematic break between blocks
+]);
+
+/**
+ * Minimum link density (0–1) and minimum link count for an element to be
+ * classified as navigation chrome. Navigation panels are structurally
+ * distinguishable from content: they consist almost entirely of links with
+ * very little non-link text between them. Content sections, even link-heavy
+ * ones like "Related resources", include enough description text to stay
+ * well below this threshold.
+ */
+const NAV_LINK_DENSITY_THRESHOLD = 0.7; // minimum link density, as a 0–1 fraction; how density is measured is defined at the use site
+const NAV_MIN_LINK_COUNT = 10; // minimum number of links
+
+/**
+ * Extract plain text from HTML, stripping chrome elements.
+ * Inserts newlines between block-level elements so that paragraphs,
+ * list items, etc. become separate lines in the output.
+ */
+/**
+ * Heuristic selectors for content containers, tried in order when
+ * <main> and <article>
are not present. Common across doc platforms + * like Mintlify, ReadMe, Docusaurus/Starlight, and custom sites. + */ +const CONTENT_SELECTORS = [ + '[role="main"]', + '#content', + '.sl-markdown-content', + '.markdown-content', + '.markdown-body', + '.docs-content', + '.doc-content', + '.main-pane', + '.page-content', + '.prose', +]; + +function extractHtmlText(html: string): string { + const root = parse(html); + + // Prefer the tightest content container available. + // Priority: heuristic selector inside article/main > article inside main + // > article > heuristic selector inside main > main > heuristic on root > body + const main = root.querySelector('main'); + const article = main?.querySelector('article') ?? root.querySelector('article'); + let content: ReturnType = null; + + // Look for a heuristic content selector inside the best semantic container + const semanticContainer = article ?? main; + if (semanticContainer) { + for (const selector of CONTENT_SELECTORS) { + content = semanticContainer.querySelector(selector); + if (content) break; + } + } + // Fall back to the semantic container itself + if (!content) content = semanticContainer; + + // If no semantic container, try heuristic selectors on the root + if (!content) { + for (const selector of CONTENT_SELECTORS) { + content = root.querySelector(selector); + if (content) break; + } + } + + if (!content) content = root.querySelector('body'); + if (!content) return root.text; + + // Remove non-content elements by tag + for (const tag of STRIP_TAGS) { + for (const el of content.querySelectorAll(tag)) { + el.remove(); + } + } + + // Remove common doc-site chrome by CSS selector + for (const selector of STRIP_SELECTORS) { + for (const el of content.querySelectorAll(selector)) { + el.remove(); + } + } + + // Remove elements that look like navigation based on link density. + // Navigation panels (sidebars, header menus) are structurally distinct + // from content: they consist almost entirely of links. 
This catches + // nav-like elements that use
instead of