diff --git a/packages/code-infra/package.json b/packages/code-infra/package.json index 2ad8820dc..3726ce147 100644 --- a/packages/code-infra/package.json +++ b/packages/code-infra/package.json @@ -39,6 +39,10 @@ "./stylelint": { "types": "./build/stylelint/index.d.mts", "default": "./src/stylelint/index.mjs" + }, + "./brokenLinksChecker": { + "types": "./build/brokenLinksChecker/index.d.mts", + "default": "./src/brokenLinksChecker/index.mjs" } }, "bin": { @@ -79,6 +83,7 @@ "babel-plugin-transform-remove-imports": "^1.8.1", "chalk": "^5.6.2", "clipboardy": "^5.0.0", + "content-type": "^1.0.5", "env-ci": "^11.2.0", "eslint-config-prettier": "^10.1.8", "eslint-import-resolver-typescript": "^4.4.4", @@ -96,6 +101,7 @@ "globals": "^16.4.0", "globby": "^15.0.0", "minimatch": "^10.0.3", + "node-html-parser": "^7.0.1", "open": "^10.2.0", "postcss-styled-syntax": "^0.7.1", "regexp.escape": "^2.0.1", @@ -106,14 +112,16 @@ "yargs": "^18.0.0" }, "peerDependencies": { + "@next/eslint-plugin-next": "*", "eslint": "^9.0.0", "prettier": "^3.5.3", - "typescript": "^5.0.0", - "@next/eslint-plugin-next": "*" + "typescript": "^5.0.0" }, "devDependencies": { + "@octokit/types": "^15.0.1", "@types/babel__core": "^7.20.5", "@types/babel__preset-env": "^7.10.0", + "@types/content-type": "^1.1.9", "@types/env-ci": "^3.1.4", "@types/eslint-plugin-jsx-a11y": "^6.10.1", "@types/estree": "^1.0.8", @@ -123,8 +131,9 @@ "@typescript-eslint/parser": "^8.46.2", "@typescript-eslint/rule-tester": "^8.46.2", "eslint": "^9.38.0", - "@octokit/types": "^15.0.1", + "get-port": "^7.1.0", "prettier": "^3.6.2", + "serve": "^14.2.5", "typescript-eslint": "^8.46.2" }, "files": [ diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/broken-links.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/broken-links.html new file mode 100644 index 000000000..30e4cad7a --- /dev/null +++ 
b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/broken-links.html @@ -0,0 +1,20 @@ + + + + + + Page with Broken Links + + +

Page with Broken Links

+

This page contains links to non-existent pages.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/broken-targets.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/broken-targets.html new file mode 100644 index 000000000..43432271e --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/broken-targets.html @@ -0,0 +1,22 @@ + + + + + + Page with Broken Targets + + +

Page with Broken Targets

+

This page contains links to valid pages but with invalid hash targets.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/example.md b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/example.md new file mode 100644 index 000000000..f36aa4a57 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/example.md @@ -0,0 +1,9 @@ +# Example Markdown File + +This is a markdown file with an HTML code snippet: + +```html +This link is in a code snippet +``` + +This link should not be crawled because this is a markdown file, not HTML. diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/external-links.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/external-links.html new file mode 100644 index 000000000..8a7c86248 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/external-links.html @@ -0,0 +1,21 @@ + + + + + + Page with External Links + + +

Page with External Links

+

This page contains external links that should be ignored.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/ignored-page.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/ignored-page.html new file mode 100644 index 000000000..2b2053b5e --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/ignored-page.html @@ -0,0 +1,17 @@ + + + + + + Ignored Page + + +

Ignored Page

+

This page should be ignored by the crawler.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html new file mode 100644 index 000000000..e773c4dfc --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/index.html @@ -0,0 +1,26 @@ + + + + + + Test Site Home + + +

Test Site Home

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/known-targets.json b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/known-targets.json new file mode 100644 index 000000000..e900dbce1 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/known-targets.json @@ -0,0 +1,5 @@ +{ + "targets": { + "/api-page.html": ["#method1", "#method2", "#method3"] + } +} diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/nested/page.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/nested/page.html new file mode 100644 index 000000000..9cfd3acd9 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/nested/page.html @@ -0,0 +1,19 @@ + + + + + + Nested Page + + +

Nested Page

+

This is a page in a nested directory.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/orphaned-page.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/orphaned-page.html new file mode 100644 index 000000000..acb3df281 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/orphaned-page.html @@ -0,0 +1,20 @@ + + + + + + Orphaned Page + + +

Orphaned Page

+

This page is not linked from anywhere and can only be discovered via seedUrls.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-api-links.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-api-links.html new file mode 100644 index 000000000..7a44d4bdf --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-api-links.html @@ -0,0 +1,20 @@ + + + + + + Page with API Links + + +

Page with API Links

+

This page links to API documentation with known targets.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-custom-targets.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-custom-targets.html new file mode 100644 index 000000000..bc1c3ffc5 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-custom-targets.html @@ -0,0 +1,24 @@ + + + + + + Page with Custom Targets + + +

Page with Custom Targets

+ +
+

Custom ID Section

+
+
+

This target should be ignored

+
+ + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-ignored-content.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-ignored-content.html new file mode 100644 index 000000000..2b6203ab0 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-ignored-content.html @@ -0,0 +1,28 @@ + + + + + + Page with Ignored Content + + +

Page with Ignored Content

+ + +
+

Main content

+

+ Link with sidebar class directly +

+
+ + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-known-target-links.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-known-target-links.html new file mode 100644 index 000000000..ce58e30d8 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/page-with-known-target-links.html @@ -0,0 +1,19 @@ + + + + + + Page with Known Target Links + + +

Page with Known Target Links

+

This page links to external pages with known targets.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/valid.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/valid.html new file mode 100644 index 000000000..fde0a32f7 --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/valid.html @@ -0,0 +1,20 @@ + + + + + + Valid Page + + +

Valid Page

+

This page has only valid internal links.

+ + + diff --git a/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/with-anchors.html b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/with-anchors.html new file mode 100644 index 000000000..b870c0fae --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/__fixtures__/static-site/with-anchors.html @@ -0,0 +1,31 @@ + + + + + + Page with Anchors + + +

Page with Anchors

+ +
+

Section 1

+

Content for section 1

+
+
+

Section 2

+

Content for section 2

+
+
+

Section 3

+

Content for section 3

+
+ + diff --git a/packages/code-infra/src/brokenLinksChecker/index.mjs b/packages/code-infra/src/brokenLinksChecker/index.mjs new file mode 100644 index 000000000..5d04171ee --- /dev/null +++ b/packages/code-infra/src/brokenLinksChecker/index.mjs @@ -0,0 +1,635 @@ +/* eslint-disable no-console */ +import { execaCommand } from 'execa'; +import timers from 'node:timers/promises'; +import { parse } from 'node-html-parser'; +import * as fs from 'node:fs/promises'; +import * as path from 'node:path'; +import chalk from 'chalk'; +import { Transform } from 'node:stream'; +import contentType from 'content-type'; + +const DEFAULT_CONCURRENCY = 4; + +/** + * Creates a Transform stream that prefixes each line with a given string. + * Useful for distinguishing server logs from other output. A trailing partial line is held back until more data arrives or the stream is flushed. + * @param {string} prefix - String to prepend to each line + * @returns {Transform} Transform stream that adds the prefix to each line + */ +const prefixLines = (prefix) => { + let leftover = ''; + return new Transform({ + transform(chunk, enc, cb) { + const lines = (leftover + chunk.toString()).split(/\r?\n/); + leftover = /** @type {string} */ (lines.pop()); + this.push(lines.map((l) => `${prefix + l}\n`).join('')); + cb(); + }, + flush(cb) { + if (leftover) { + this.push(`${prefix + leftover}\n`); + } + cb(); + }, + }); +}; + +/** + * Maps page URLs to sets of known target IDs (anchors) on that page. + * Used to track which link targets (e.g., #section-id) exist on each page. + * @typedef {Map<string, Set<string>>} LinkStructure + */ + +/** + * Serialized representation of LinkStructure for JSON storage. + * Converts Maps and Sets to plain objects and arrays for file persistence. + * @typedef {Object} SerializedLinkStructure + * @property {Record<string, string[]>} targets - Object mapping page URLs to arrays of target IDs + */ + +/** + * Fetches a URL and throws an error if the response is not OK. 
+ * @param {string | URL} url - URL to fetch + * @returns {Promise<Response>} Fetch response if successful + * @throws {Error} If the response status is not OK (not in 200-299 range) + */ +async function fetchUrl(url) { + const res = await fetch(url); + if (!res.ok) { + throw new Error(`Failed to fetch ${url}: [${res.status}] ${res.statusText}`); + } + return res; +} + +/** + * Polls a URL until it responds successfully or times out. + * Used to wait for a dev server to start; waits one second between attempts. + * @param {string} url - URL to poll + * @param {number} timeout - Maximum milliseconds to wait before timing out (only checked after a failed attempt) + * @returns {Promise<void>} Resolves when URL responds successfully + * @throws {Error} If timeout is reached before URL responds + */ +async function pollUrl(url, timeout) { + const start = Date.now(); + while (true) { + try { + // eslint-disable-next-line no-await-in-loop + await fetchUrl(url); + return; + } catch (/** @type {any} */ error) { + if (Date.now() - start > timeout) { + throw new Error(`Timeout waiting for ${url}: ${error.message}`, { cause: error }); + } + // eslint-disable-next-line no-await-in-loop + await timers.setTimeout(1000); + } + } +} + +/** + * Converts serialized link structure (from JSON) back to Map/Set form. + * @param {SerializedLinkStructure} data - Serialized structure with plain objects/arrays + * @returns {LinkStructure} Deserialized structure using Map and Set + */ +function deserializeLinkStructure(data) { + const linkStructure = new Map(); + for (const url of Object.keys(data.targets)) { + linkStructure.set(url, new Set(data.targets[url])); + } + return linkStructure; +} + +/** + * Data about a crawled page including its URL, HTTP status, and available link targets. 
+ * @typedef {Object} PageData + * @property {string} url - The normalized page URL (without trailing slash unless root) + * @property {number} status - HTTP status code from the response (e.g., 200, 404, 500) + * @property {Set} targets - Set of available anchor targets on the page, keyed by hash (e.g., '#intro') + */ + +/** + * Serializes and writes discovered page targets to a JSON file. Missing parent directories of the output path are created first. + * @param {Map<string, PageData>} pages - Map of crawled pages with their targets + * @param {string} outPath - File path to write the JSON output + * @returns {Promise<void>} Resolves once the file has been written + */ +async function writePagesToFile(pages, outPath) { + /** @type {SerializedLinkStructure} */ + const fileContent = { targets: {} }; + for (const [url, pageData] of pages.entries()) { + fileContent.targets[url] = Array.from(pageData.targets.keys()); + } + const dir = path.dirname(outPath); + await fs.mkdir(dir, { recursive: true }); + await fs.writeFile(outPath, JSON.stringify(fileContent, null, 2), 'utf-8'); +} + +/** + * Computes the accessible name of an element according to ARIA rules. + * Polyfill for `node.computedName` available only in Chrome v112+. + * Checks in order: aria-label, aria-labelledby, label[for], img alt, innerText. + * @param {import('node-html-parser').HTMLElement | null} elm - Element to compute name for + * @param {import('node-html-parser').HTMLElement} ownerDocument - Document containing the element + * @returns {string} The computed accessible name, or empty string if none found + */ +function getAccessibleName(elm, ownerDocument) { + if (!elm) { + return ''; + } + + // 1. aria-label + const ariaLabel = elm.getAttribute('aria-label')?.trim(); + if (ariaLabel) { + return ariaLabel; + } + + // 2. 
aria-labelledby + const labelledby = elm.getAttribute('aria-labelledby'); + if (labelledby) { + const labels = []; + for (const id of labelledby.split(/\s+/)) { + const label = getAccessibleName(ownerDocument.getElementById(id), ownerDocument); + if (label) { + labels.push(label); + } + } + const label = labels.join(' ').trim(); + if (label) { + return label; + } + } + + // 3.