diff --git a/src/browserlib/extract-cddl.mjs b/src/browserlib/extract-cddl.mjs new file mode 100644 index 00000000..769349ce --- /dev/null +++ b/src/browserlib/extract-cddl.mjs @@ -0,0 +1,125 @@ +import getCodeElements from './get-code-elements.mjs'; +import trimSpaces from './trim-spaces.mjs'; + +/** + * Extract the list of CDDL definitions in the current spec. + * + * A spec may define more that one CDDL module. For example, the WebDriver BiDi + * spec has CDDL definitions that apply to either of both the local end and the + * remote end. The functions returns an array that lists all CDDL modules. + * + * Each CDDL module is represented as an object with the following keys whose + * values are strings: + * - shortname: the CDDL module shortname. Shortname is "" if the spec does not + * define any module, and "all" for the dump of all CDDL definitions. + * - label: A full name for the CDDL module, when defined. + * - cddl: A dump of the CDDL definitions. + * + * If the spec defines more than one module, the first item in the array is the + * "all" module that contains a dump of all CDDL definitions, regardless of the + * module they are actually defined for (the assumption is that looking at the + * union of all CDDL modules defined in a spec will always make sense, and that + * a spec will never reuse the same rule name with a different definition for + * different CDDL modules). + * + * @function + * @public + * @return {Array} A dump of the CDDL definitions per CDDL module, or an empty + * array if the spec does not contain any CDDL. + */ +export default function () { + // Specs with CDDL are either recent enough that they all use the same + // `
` convention, or they don't flag CDDL blocks in any + // way, making it impossible to extract them. + const cddlSelectors = ['pre.cddl:not(.exclude):not(.extract)']; + const excludeSelectors = ['#cddl-index']; + + // Retrieve all elements that contains CDDL content + const cddlEls = getCodeElements(cddlSelectors, { excludeSelectors }); + + // Start by assembling the list of modules + const modules = {}; + for (const el of cddlEls) { + const elModules = getModules(el); + for (const name of elModules) { + // "all" does not create a module on its own, that's the name of + // the CDDL module that contains all CDDL definitions. + if (name !== 'all') { + modules[name] = []; + } + } + } + + // Assemble the CDDL per module + const mergedCddl = []; + for (const el of cddlEls) { + const cddl = trimSpaces(el.textContent); + if (!cddl) { + continue; + } + // All CDDL appears in the "all" module. + mergedCddl.push(cddl); + let elModules = getModules(el); + if (elModules.length === 0) { + // No module means the CDDL is defined for all modules + elModules = Object.keys(modules); + } + for (const name of elModules) { + // CDDL defined for the "all" module is only defined for it + if (name !== 'all') { + if (!modules[name]) { + modules[name] = []; + } + modules[name].push(cddl); + } + } + } + + if (mergedCddl.length === 0) { + return []; + } + + const res = [{ + name: Object.keys(modules).length > 0 ? 'all' : '', + cddl: mergedCddl.join('\n\n') + }]; + for (const [name, cddl] of Object.entries(modules)) { + res.push({ name, cddl: cddl.join('\n\n') }); + } + // Remove trailing spaces and use spaces throughout + for (const cddlModule of res) { + cddlModule.cddl = cddlModule.cddl + .replace(/\s+$/gm, '\n') + .replace(/\t/g, ' ') + .trim(); + } + return res; +} + + +/** + * Retrieve the list of CDDL module shortnames that the element references. + * + * This list of modules is either specified in a `data-cddl-module` attribute + * or directly within the class attribute prefixed by `cddl-` or suffixed by + * `-cddl`. + */ +function getModules(el) { + const moduleAttr = el.getAttribute('data-cddl-module'); + if (moduleAttr) { + return moduleAttr.split(',').map(str => str.trim()); + } + + const list = []; + const classes = el.classList.values() + for (const name of classes) { + const match = name.match(/^(.*)-cddl$|^cddl-(.*)$/); + if (match) { + const shortname = match[1] ?? match[2]; + if (!list.includes(shortname)) { + list.push(shortname); + } + } + } + return list; +} diff --git a/src/browserlib/extract-webidl.mjs b/src/browserlib/extract-webidl.mjs index 34ad6c86..4a6cbc1d 100644 --- a/src/browserlib/extract-webidl.mjs +++ b/src/browserlib/extract-webidl.mjs @@ -1,14 +1,14 @@ import getGenerator from './get-generator.mjs'; -import informativeSelector from './informative-selector.mjs'; -import cloneAndClean from './clone-and-clean.mjs'; +import getCodeElements from './get-code-elements.mjs'; +import trimSpaces from './trim-spaces.mjs'; /** * Extract the list of WebIDL definitions in the current spec * * @function * @public - * @return {Promise} The promise to get a dump of the IDL definitions, or - * an empty string if the spec does not contain any IDL. + * @return {String} A dump of the IDL definitions, or an empty string if the + * spec does not contain any IDL. */ export default function () { const generator = getGenerator(); @@ -70,56 +70,21 @@ function extractBikeshedIdl() { * sure that it only extracts elements once. */ function extractRespecIdl() { - // Helper function that trims individual lines in an IDL block, - // removing as much space as possible from the beginning of the page - // while preserving indentation. Rules followed: - // - Always trim the first line - // - Remove whitespaces from the end of each line - // - Replace lines that contain spaces with empty lines - // - Drop same number of leading whitespaces from all other lines - const trimIdlSpaces = idl => { - const lines = idl.trim().split('\n'); - const toRemove = lines - .slice(1) - .filter(line => line.search(/\S/) > -1) - .reduce( - (min, line) => Math.min(min, line.search(/\S/)), - Number.MAX_VALUE); - return lines - .map(line => { - let firstRealChat = line.search(/\S/); - if (firstRealChat === -1) { - return ''; - } - else if (firstRealChat === 0) { - return line.replace(/\s+$/, ''); - } - else { - return line.substring(toRemove).replace(/\s+$/, ''); - } - }) - .join('\n'); - }; - - // Detect the IDL index appendix if there's one (to exclude it) - const idlEl = document.querySelector('#idl-index pre') || - document.querySelector('.chapter-idl pre'); // SVG 2 draft - - let idl = [ + const idlSelectors = [ 'pre.idl:not(.exclude):not(.extract):not(#actual-idl-index)', 'pre:not(.exclude):not(.extract) > code.idl-code:not(.exclude):not(.extract)', 'pre:not(.exclude):not(.extract) > code.idl:not(.exclude):not(.extract)', 'div.idl-code:not(.exclude):not(.extract) > pre:not(.exclude):not(.extract)', 'pre.widl:not(.exclude):not(.extract)' - ] - .map(sel => [...document.querySelectorAll(sel)]) - .reduce((res, elements) => res.concat(elements), []) - .filter(el => el !== idlEl) - .filter((el, idx, self) => self.indexOf(el) === idx) - .filter(el => !el.closest(informativeSelector)) - .map(cloneAndClean) - .map(el => trimIdlSpaces(el.textContent)) - .join('\n\n'); + ]; - return idl; + const excludeSelectors = [ + '#idl-index', + '.chapter-idl' + ]; + + const idlElements = getCodeElements(idlSelectors, { excludeSelectors }); + return idlElements + .map(el => trimSpaces(el.textContent)) + .join('\n\n'); } \ No newline at end of file diff --git a/src/browserlib/get-code-elements.mjs b/src/browserlib/get-code-elements.mjs new file mode 100644 index 00000000..1b3d4632 --- /dev/null +++ b/src/browserlib/get-code-elements.mjs @@ -0,0 +1,21 @@ +import informativeSelector from './informative-selector.mjs'; +import cloneAndClean from './clone-and-clean.mjs'; + +/** + * Helper function that returns a set of code elements in document order based + * on a given set of selectors, excluding elements that are within an index. + * + * The function excludes elements defined in informative sections. + * + * The code elements are cloned and cleaned before they are returned to strip + * annotations and other asides. + */ +export default function getCodeElements(codeSelectors, { excludeSelectors = [] }) { + return [...document.querySelectorAll(codeSelectors.join(', '))] + // Skip excluded and elements and those in informative content + .filter(el => !el.closest(excludeSelectors.join(', '))) + .filter(el => !el.closest(informativeSelector)) + + // Clone and clean the elements + .map(cloneAndClean); +} \ No newline at end of file diff --git a/src/browserlib/reffy.json b/src/browserlib/reffy.json index 8a0b7a2c..036b3993 100644 --- a/src/browserlib/reffy.json +++ b/src/browserlib/reffy.json @@ -62,5 +62,9 @@ "href": "./extract-ids.mjs", "property": "ids", "needsIdToHeadingMap": true + }, + { + "href": "./extract-cddl.mjs", + "property": "cddl" } ] diff --git a/src/browserlib/trim-spaces.mjs b/src/browserlib/trim-spaces.mjs new file mode 100644 index 00000000..e7450486 --- /dev/null +++ b/src/browserlib/trim-spaces.mjs @@ -0,0 +1,36 @@ +/** + * Helper function that trims individual lines in a code block, removing as + * much space as possible from the beginning of the page while preserving + * indentation. + * + * Typically useful for CDDL and IDL extracts + * + * Rules followed: + * - Always trim the first line + * - Remove whitespaces from the end of each line + * - Replace lines that contain spaces with empty lines + * - Drop same number of leading whitespaces from all other lines + */ +export default function trimSpaces(code) { + const lines = code.trim().split('\n'); + const toRemove = lines + .slice(1) + .filter(line => line.search(/\S/) > -1) + .reduce( + (min, line) => Math.min(min, line.search(/\S/)), + Number.MAX_VALUE); + return lines + .map(line => { + let firstRealChar = line.search(/\S/); + if (firstRealChar === -1) { + return ''; + } + else if (firstRealChar === 0) { + return line.replace(/\s+$/, ''); + } + else { + return line.substring(toRemove).replace(/\s+$/, ''); + } + }) + .join('\n'); +} \ No newline at end of file diff --git a/src/lib/specs-crawler.js b/src/lib/specs-crawler.js index 67a98a3d..b4092b05 100644 --- a/src/lib/specs-crawler.js +++ b/src/lib/specs-crawler.js @@ -251,6 +251,29 @@ async function saveSpecResults(spec, settings) { return `css/${spec.shortname}.json`; }; + async function saveCddl(spec) { + let cddlHeader = ` + ; GENERATED CONTENT - DO NOT EDIT + ; Content was automatically extracted by Reffy into webref + ; (https://github.com/w3c/webref) + ; Source: ${spec.title} (${spec.crawled})`; + cddlHeader = cddlHeader.replace(/^\s+/gm, '').trim() + '\n\n'; + const res = []; + for (const cddlModule of spec.cddl) { + const cddl = cddlHeader + cddlModule.cddl + '\n'; + const filename = spec.shortname + + (cddlModule.name ? `-${cddlModule.name}` : '') + + '.cddl'; + await fs.promises.writeFile( + path.join(folders.cddl, filename), cddl); + res.push({ + name: cddlModule.name, + file: `cddl/${filename}` + }); + } + return res; + }; + // Save IDL dumps if (spec.idl) { spec.idl = await saveIdl(spec); @@ -283,9 +306,14 @@ async function saveSpecResults(spec, settings) { (typeof thing == 'object') && (Object.keys(thing).length === 0); } + // Save CDDL extracts (text files, multiple modules possible) + if (!isEmpty(spec.cddl)) { + spec.cddl = await saveCddl(spec); + } + // Save all other extracts from crawling modules const remainingModules = modules.filter(mod => - !mod.metadata && mod.property !== 'css' && mod.property !== 'idl'); + !mod.metadata && !['cddl', 'css', 'idl'].includes(mod.property)); for (const mod of remainingModules) { await saveExtract(spec, mod.property, spec => !isEmpty(spec[mod.property])); } diff --git a/src/lib/util.js b/src/lib/util.js index 9659ce69..3d76e0b1 100644 --- a/src/lib/util.js +++ b/src/lib/util.js @@ -796,6 +796,36 @@ async function expandSpecResult(spec, baseFolder, properties) { return; } + // Treat CDDL extracts separately, one spec may have multiple CDDL + // extracts (actual treatment is similar to IDL extracts otherwise) + if (property === 'cddl') { + if (!spec[property]) { + return; + } + for (const cddlModule of spec[property]) { + if (!cddlModule.file) { + continue; + } + if (baseFolder.startsWith('https:')) { + const url = (new URL(cddlModule.file, baseFolder)).toString(); + const response = await fetch(url, { nolog: true }); + contents = await response.text(); + } + else { + const filename = path.join(baseFolder, cddlModule.file); + contents = await fs.readFile(filename, 'utf8'); + } + if (contents.startsWith('; GENERATED CONTENT - DO NOT EDIT')) { + // Normalize newlines to avoid off-by-one slices when we remove + // the trailing newline that was added by saveCddl + contents = contents.replace(/\r/g, ''); + const endOfHeader = contents.indexOf('\n\n'); + contents = contents.substring(endOfHeader + 2).slice(0, -1); + } + cddlModule.cddl = contents; + } + } + // Only consider properties that link to an extract, i.e. an IDL // or JSON file in subfolder. if (!spec[property] || diff --git a/tests/crawl-test.json b/tests/crawl-test.json index 58fcd06b..3c58e15d 100644 --- a/tests/crawl-test.json +++ b/tests/crawl-test.json @@ -24,6 +24,7 @@ }, "title": "WOFF2", "algorithms": [], + "cddl": [], "css": { "atrules": [], "properties": [], @@ -99,6 +100,7 @@ "title": "No Title", "generator": "respec", "algorithms": [], + "cddl": [], "css": { "atrules": [], "properties": [], @@ -224,6 +226,7 @@ }, "title": "[No title found for https://w3c.github.io/accelerometer/]", "algorithms": [], + "cddl": [], "css": { "atrules": [], "properties": [], diff --git a/tests/extract-cddl.js b/tests/extract-cddl.js new file mode 100644 index 00000000..82dd383a --- /dev/null +++ b/tests/extract-cddl.js @@ -0,0 +1,170 @@ +import assert from 'node:assert'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; +import puppeteer from 'puppeteer'; +import { rollup } from 'rollup'; +const scriptPath = path.dirname(fileURLToPath(import.meta.url)); + +const tests = [ + { + title: 'extracts CDDL from pre.cddl', + html: `cddl = tstr`, + res: 'cddl = tstr' + }, + + { + title: 'produces no CDDL when there is no CDDL', + html: `Me no define CDDL
`, + res: [] + }, + + { + title: 'merges multiples blocks of CDDL', + html: `cddl = * rule+rule = tstr`, + res: `cddl = * rule + +rule = tstr` + }, + + { + title: 'strips trailing spaces', + html: `+ cddl = * rule`, + res: `cddl = * rule` + }, + + { + title: 'preserves internal indentation', + html: `+ rule = ( + typedef / + groupdef + ) + typedef = tstr + groupdef = tstr +`, + res: `rule = ( + typedef / + groupdef +) +typedef = tstr + groupdef = tstr` + }, + + { + title: 'extracts CDDL module names from data-cddl-module', + html: `cddl = tstr`, + res: [ + { name: 'all', cddl: 'cddl = tstr' }, + { name: 'mod', cddl: 'cddl = tstr' } + ] + }, + + { + title: 'extracts CDDL module name defined as class', + html: `cddl = tstr`, + res: [ + { name: 'all', cddl: 'cddl = tstr' }, + { name: 'mod1', cddl: 'cddl = tstr' }, + { name: 'mod2', cddl: 'cddl = tstr' } + ] + }, + + { + title: 'assembles CDDL in modules', + html: ` ++ rule = (cddl1 / cddl2) +++ cddl1 = tstr +++ cddl2 = tstr +++ typedef = tstr + groupdef = tstr ++ `, + res: [ + { + name: 'all', + cddl: +`rule = (cddl1 / cddl2) + +cddl1 = tstr + +cddl2 = tstr + +typedef = tstr +groupdef = tstr` + }, + { + name: 'mod1', + cddl: +`cddl1 = tstr + +typedef = tstr +groupdef = tstr` + }, + { + name: 'mod2', + cddl: +`cddl2 = tstr + +typedef = tstr +groupdef = tstr` + } + ] + } +]; + +function isString(x) { + return Object.prototype.toString.call(x) === "[object String]"; +} + +describe("CDDL extraction", function () { + this.slow(5000); + + let browser; + let extractCode; + + before(async () => { + const extractBundle = await rollup({ + input: path.resolve(scriptPath, '../src/browserlib/extract-cddl.mjs') + }); + const extractOutput = (await extractBundle.generate({ + name: 'extractCddl', + format: 'iife' + })).output; + extractCode = extractOutput[0].code; + + browser = await puppeteer.launch({ headless: true }); + }); + + for (const test of tests) { + it(test.title, async () => { + const page = await browser.newPage(); + page.setContent(test.html); + await page.addScriptTag({ content: extractCode }); + + const extracted = await page.evaluate(async () => extractCddl()); + await page.close(); + + if (isString(test.res)) { + assert.deepEqual(extracted.length, 1, + `Expected extraction to return 1 CDDL module, got ${extracted.length}`); + assert.deepEqual(extracted[0].cddl, test.res); + } + else { + assert.deepEqual(extracted, test.res); + } + }); + } + + after(async () => { + await browser.close(); + }); +});