diff --git a/src/browserlib/extract-cddl.mjs b/src/browserlib/extract-cddl.mjs
new file mode 100644
index 00000000..769349ce
--- /dev/null
+++ b/src/browserlib/extract-cddl.mjs
@@ -0,0 +1,125 @@
+import getCodeElements from './get-code-elements.mjs';
+import trimSpaces from './trim-spaces.mjs';
+
+/**
+ * Extract the list of CDDL definitions in the current spec.
+ *
+ * A spec may define more than one CDDL module. For example, the WebDriver BiDi
+ * spec has CDDL definitions that apply to either or both the local end and the
+ * remote end. The function returns an array that lists all CDDL modules.
+ *
+ * Each CDDL module is represented as an object with the following keys whose
+ * values are strings:
+ * - name: the CDDL module shortname. The name is "" if the spec does not
+ *   define any module, and "all" for the dump of all CDDL definitions.
+ * - cddl: A dump of the CDDL definitions that apply to the module, with tabs
+ *   replaced by two spaces and surrounding whitespace trimmed.
+ *
+ * If the spec defines at least one module, the first item in the array is the
+ * "all" module that contains a dump of all CDDL definitions, regardless of the
+ * module they are actually defined for (the assumption is that looking at the
+ * union of all CDDL modules defined in a spec will always make sense, and that
+ * a spec will never reuse the same rule name with a different definition for
+ * different CDDL modules).
+ *
+ * @function
+ * @public
+ * @return {Array} A dump of the CDDL definitions per CDDL module, or an empty
+ * array if the spec does not contain any CDDL.
+ */
+export default function () {
+    // Specs with CDDL are either recent enough that they all use the same
+    // `<pre class="cddl">` convention, or they don't flag CDDL blocks in any
+    // way, making it impossible to extract them.
+    const cddlSelectors = ['pre.cddl:not(.exclude):not(.extract)'];
+    const excludeSelectors = ['#cddl-index'];
+
+    // Retrieve all elements that contain CDDL content
+    const cddlEls = getCodeElements(cddlSelectors, { excludeSelectors });
+
+    // Start by assembling the list of modules
+    const modules = {};
+    for (const el of cddlEls) {
+        const elModules = getModules(el);
+        for (const name of elModules) {
+            // "all" does not create a module on its own, that's the name of
+            // the CDDL module that contains all CDDL definitions.
+            if (name !== 'all') {
+                modules[name] = [];
+            }
+        }
+    }
+
+    // Assemble the CDDL per module
+    const mergedCddl = [];
+    for (const el of cddlEls) {
+        const cddl = trimSpaces(el.textContent);
+        if (!cddl) {
+            continue;
+        }
+        // All CDDL appears in the "all" module.
+        mergedCddl.push(cddl);
+        let elModules = getModules(el);
+        if (elModules.length === 0) {
+            // No module means the CDDL is defined for all modules
+            elModules = Object.keys(modules);
+        }
+        for (const name of elModules) {
+            // CDDL defined for the "all" module is only defined for it
+            if (name !== 'all') {
+                if (!modules[name]) {
+                    modules[name] = [];
+                }
+                modules[name].push(cddl);
+            }
+        }
+    }
+
+    if (mergedCddl.length === 0) {
+        return [];
+    }
+
+    const res = [{
+        name: Object.keys(modules).length > 0 ? 'all' : '',
+        cddl: mergedCddl.join('\n\n')
+    }];
+    for (const [name, cddl] of Object.entries(modules)) {
+        res.push({ name, cddl: cddl.join('\n\n') });
+    }
+    // Strip trailing spaces (also squeezes blank-line runs) and expand tabs
+    for (const cddlModule of res) {
+        cddlModule.cddl = cddlModule.cddl
+            .replace(/\s+$/gm, '\n')
+            .replace(/\t/g, '  ')
+            .trim();
+    }
+    return res;
+}
+
+
+/**
+ * Retrieve the list of CDDL module shortnames that the element references.
+ *
+ * This list of modules is either specified in a `data-cddl-module` attribute
+ * or directly within the class attribute prefixed by `cddl-` or suffixed by
+ * `-cddl`.
+ */
+function getModules(el) {
+    const moduleAttr = el.getAttribute('data-cddl-module');
+    if (moduleAttr) {
+        // Drop empty entries (e.g. "a,,b" or a trailing comma) and duplicates
+        const names = moduleAttr.split(',').map(str => str.trim());
+        return [...new Set(names)].filter(name => name !== '');
+    }
+
+    const list = [];
+    for (const name of el.classList.values()) {
+        const match = name.match(/^(.*)-cddl$|^cddl-(.*)$/);
+        // A bare "-cddl" or "cddl-" class yields "" which names no module
+        const shortname = match?.[1] ?? match?.[2];
+        if (shortname && !list.includes(shortname)) {
+            list.push(shortname);
+        }
+    }
+    return list;
+}
diff --git a/src/browserlib/extract-webidl.mjs b/src/browserlib/extract-webidl.mjs
index 34ad6c86..4a6cbc1d 100644
--- a/src/browserlib/extract-webidl.mjs
+++ b/src/browserlib/extract-webidl.mjs
@@ -1,14 +1,14 @@
 import getGenerator from './get-generator.mjs';
-import informativeSelector from './informative-selector.mjs';
-import cloneAndClean from './clone-and-clean.mjs';
+import getCodeElements from './get-code-elements.mjs';
+import trimSpaces from './trim-spaces.mjs';
 
 /**
  * Extract the list of WebIDL definitions in the current spec
  *
  * @function
  * @public 
- * @return {Promise} The promise to get a dump of the IDL definitions, or
- *   an empty string if the spec does not contain any IDL.
+ * @return {String} A dump of the IDL definitions, or an empty string if the
+ * spec does not contain any IDL.
  */
 export default function () {
     const generator = getGenerator();
@@ -70,56 +70,21 @@ function extractBikeshedIdl() {
  * sure that it only extracts elements once.
  */
 function extractRespecIdl() {
-    // Helper function that trims individual lines in an IDL block,
-    // removing as much space as possible from the beginning of the page
-    // while preserving indentation. Rules followed:
-    // - Always trim the first line
-    // - Remove whitespaces from the end of each line
-    // - Replace lines that contain spaces with empty lines
-    // - Drop same number of leading whitespaces from all other lines
-    const trimIdlSpaces = idl => {
-        const lines = idl.trim().split('\n');
-        const toRemove = lines
-            .slice(1)
-            .filter(line => line.search(/\S/) > -1)
-            .reduce(
-                (min, line) => Math.min(min, line.search(/\S/)),
-                Number.MAX_VALUE);
-        return lines
-            .map(line => {
-                let firstRealChat = line.search(/\S/);
-                if (firstRealChat === -1) {
-                    return '';
-                }
-                else if (firstRealChat === 0) {
-                    return line.replace(/\s+$/, '');
-                }
-                else {
-                    return line.substring(toRemove).replace(/\s+$/, '');
-                }
-            })
-            .join('\n');
-    };
-
-    // Detect the IDL index appendix if there's one (to exclude it)
-    const idlEl = document.querySelector('#idl-index pre') ||
-        document.querySelector('.chapter-idl pre'); // SVG 2 draft
-
-    let idl = [
+    const idlSelectors = [
         'pre.idl:not(.exclude):not(.extract):not(#actual-idl-index)',
         'pre:not(.exclude):not(.extract) > code.idl-code:not(.exclude):not(.extract)',
         'pre:not(.exclude):not(.extract) > code.idl:not(.exclude):not(.extract)',
         'div.idl-code:not(.exclude):not(.extract) > pre:not(.exclude):not(.extract)',
         'pre.widl:not(.exclude):not(.extract)'
-    ]
-        .map(sel => [...document.querySelectorAll(sel)])
-        .reduce((res, elements) => res.concat(elements), [])
-        .filter(el => el !== idlEl)
-        .filter((el, idx, self) => self.indexOf(el) === idx)
-        .filter(el => !el.closest(informativeSelector))
-        .map(cloneAndClean)
-        .map(el => trimIdlSpaces(el.textContent))
-        .join('\n\n');
+    ];
 
-    return idl;
+    const excludeSelectors = [
+        '#idl-index',    // IDL index appendix, when there is one
+        '.chapter-idl'   // SVG 2 draft
+    ];
+
+    const idlElements = getCodeElements(idlSelectors, { excludeSelectors });
+    return idlElements
+        .map(el => trimSpaces(el.textContent))
+        .join('\n\n');
 }
\ No newline at end of file
diff --git a/src/browserlib/get-code-elements.mjs b/src/browserlib/get-code-elements.mjs
new file mode 100644
index 00000000..1b3d4632
--- /dev/null
+++ b/src/browserlib/get-code-elements.mjs
@@ -0,0 +1,21 @@
+import informativeSelector from './informative-selector.mjs';
+import cloneAndClean from './clone-and-clean.mjs';
+
+/**
+ * Helper function that returns a set of code elements in document order based
+ * on a given set of selectors, excluding elements that are within an index.
+ *
+ * The function excludes elements defined in informative sections.
+ *
+ * The code elements are cloned and cleaned before they are returned to strip
+ * annotations and other asides.
+ */
+export default function getCodeElements(codeSelectors, { excludeSelectors = [] } = {}) {
+    // An empty selector would make closest() throw, hence the guard below
+    const excluded = excludeSelectors.join(', ');
+    return [...document.querySelectorAll(codeSelectors.join(', '))]
+        // Skip excluded elements and those in informative content
+        .filter(el => !(excluded && el.closest(excluded)))
+        .filter(el => !el.closest(informativeSelector))
+        .map(cloneAndClean); // Clone and clean the elements
+}
\ No newline at end of file
diff --git a/src/browserlib/reffy.json b/src/browserlib/reffy.json
index 8a0b7a2c..036b3993 100644
--- a/src/browserlib/reffy.json
+++ b/src/browserlib/reffy.json
@@ -62,5 +62,9 @@
     "href": "./extract-ids.mjs",
     "property": "ids",
     "needsIdToHeadingMap": true
+  },
+  {
+    "href": "./extract-cddl.mjs",
+    "property": "cddl"
   }
 ]
diff --git a/src/browserlib/trim-spaces.mjs b/src/browserlib/trim-spaces.mjs
new file mode 100644
index 00000000..e7450486
--- /dev/null
+++ b/src/browserlib/trim-spaces.mjs
@@ -0,0 +1,36 @@
+/**
+ * Helper function that normalizes whitespace in a code block, dropping as
+ * much leading space as possible from its lines while preserving their
+ * relative indentation.
+ *
+ * Typically useful for CDDL and IDL extracts
+ *
+ * Rules followed:
+ * - Always trim the first line
+ * - Remove whitespaces from the end of each line
+ * - Replace lines that only contain whitespaces with empty lines
+ * - Drop the smallest indentation found past the first line from the others
+ */
+export default function trimSpaces(code) {
+    const indentOf = line => line.search(/\S/);
+    const lines = code.trim().split('\n');
+
+    // Smallest indentation found on non-blank lines after the first one
+    let toRemove = Number.MAX_VALUE;
+    for (const line of lines.slice(1)) {
+        const indent = indentOf(line);
+        if (indent > -1) {
+            toRemove = Math.min(toRemove, indent);
+        }
+    }
+
+    const trimmed = [];
+    for (const line of lines) {
+        const indent = indentOf(line);
+        // Blank line, unindented line, or indented line to shift left
+        const text = (indent === -1) ? '' :
+            (indent === 0) ? line : line.substring(toRemove);
+        trimmed.push(text.replace(/\s+$/, ''));
+    }
+    return trimmed.join('\n');
+}
\ No newline at end of file
diff --git a/src/lib/specs-crawler.js b/src/lib/specs-crawler.js
index 67a98a3d..b4092b05 100644
--- a/src/lib/specs-crawler.js
+++ b/src/lib/specs-crawler.js
@@ -251,6 +251,29 @@ async function saveSpecResults(spec, settings) {
         return `css/${spec.shortname}.json`;
     };
 
+    // Write each CDDL module of the spec to its own file under `folders.cddl`
+    // and return the list of `{ name, file }` entries that describe them
+    async function saveCddl(spec) {
+        // One indented template line per header line: the leading spaces
+        // get stripped right after so that each line starts with ";"
+        const header = `
+            ; GENERATED CONTENT - DO NOT EDIT
+            ; Content was automatically extracted by Reffy into webref
+            ; (https://github.com/w3c/webref)
+            ; Source: ${spec.title} (${spec.crawled})`
+            .replace(/^\s+/gm, '').trim() + '\n\n';
+        const saved = [];
+        for (const { name, cddl } of spec.cddl) {
+            // The unnamed module ("") maps to "<shortname>.cddl"
+            const suffix = name ? `-${name}` : '';
+            const filename = spec.shortname + suffix + '.cddl';
+            const target = path.join(folders.cddl, filename);
+            await fs.promises.writeFile(target, header + cddl + '\n');
+            saved.push({ name, file: `cddl/${filename}` });
+        }
+        return saved;
+    };
+
     // Save IDL dumps
     if (spec.idl) {
         spec.idl = await saveIdl(spec);
@@ -283,9 +306,14 @@ async function saveSpecResults(spec, settings) {
             (typeof thing == 'object') && (Object.keys(thing).length === 0);
     }
 
+    // Save CDDL extracts (text files, multiple modules possible)
+    if (!isEmpty(spec.cddl)) {
+        spec.cddl = await saveCddl(spec);
+    }
+
     // Save all other extracts from crawling modules
     const remainingModules = modules.filter(mod =>
-        !mod.metadata && mod.property !== 'css' && mod.property !== 'idl');
+        !mod.metadata && !['cddl', 'css', 'idl'].includes(mod.property));
     for (const mod of remainingModules) {
         await saveExtract(spec, mod.property, spec => !isEmpty(spec[mod.property]));
     }
diff --git a/src/lib/util.js b/src/lib/util.js
index 9659ce69..3d76e0b1 100644
--- a/src/lib/util.js
+++ b/src/lib/util.js
@@ -796,6 +796,36 @@ async function expandSpecResult(spec, baseFolder, properties) {
             return;
         }
 
+        // Treat CDDL extracts separately, one spec may have multiple CDDL
+        // extracts (actual treatment is similar to IDL extracts otherwise)
+        if (property === 'cddl') {
+            if (!spec[property]) {
+                return;
+            }
+            for (const cddlModule of spec[property]) {
+                if (!cddlModule.file) {
+                    continue;
+                }
+                if (baseFolder.startsWith('https:')) {
+                    const url = (new URL(cddlModule.file, baseFolder)).toString();
+                    const response = await fetch(url, { nolog: true });
+                    contents = await response.text();
+                }
+                else {
+                    const filename = path.join(baseFolder, cddlModule.file);
+                    contents = await fs.readFile(filename, 'utf8');
+                }
+                if (contents.startsWith('; GENERATED CONTENT - DO NOT EDIT')) {
+                    // Normalize newlines to avoid off-by-one slices when we remove
+                    // the trailing newline that was added by saveCddl
+                    contents = contents.replace(/\r/g, '');
+                    const endOfHeader = contents.indexOf('\n\n');
+                    contents = contents.substring(endOfHeader + 2).slice(0, -1);
+                }
+                cddlModule.cddl = contents;
+            }
+        }
+
         // Only consider properties that link to an extract, i.e. an IDL
         // or JSON file in subfolder.
         if (!spec[property] ||
diff --git a/tests/crawl-test.json b/tests/crawl-test.json
index 58fcd06b..3c58e15d 100644
--- a/tests/crawl-test.json
+++ b/tests/crawl-test.json
@@ -24,6 +24,7 @@
     },
     "title": "WOFF2",
     "algorithms": [],
+    "cddl": [],
     "css": {
       "atrules": [],
       "properties": [],
@@ -99,6 +100,7 @@
     "title": "No Title",
     "generator": "respec",
     "algorithms": [],
+    "cddl": [],
     "css": {
       "atrules": [],
       "properties": [],
@@ -224,6 +226,7 @@
     },
     "title": "[No title found for https://w3c.github.io/accelerometer/]",
     "algorithms": [],
+    "cddl": [],
     "css": {
       "atrules": [],
       "properties": [],
diff --git a/tests/extract-cddl.js b/tests/extract-cddl.js
new file mode 100644
index 00000000..82dd383a
--- /dev/null
+++ b/tests/extract-cddl.js
@@ -0,0 +1,170 @@
+import assert from 'node:assert';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+import puppeteer from 'puppeteer';
+import { rollup } from 'rollup';
+const scriptPath = path.dirname(fileURLToPath(import.meta.url));
+
+const tests = [
+  {
+    title: 'extracts CDDL from pre.cddl',
+    html: `
cddl = tstr
`, + res: 'cddl = tstr' + }, + + { + title: 'produces no CDDL when there is no CDDL', + html: `

Me no define CDDL

`, + res: [] + }, + + { + title: 'merges multiples blocks of CDDL', + html: `
cddl = * rule
+
rule = tstr
`, + res: `cddl = * rule + +rule = tstr` + }, + + { + title: 'strips trailing spaces', + html: `
+            cddl = * rule    
`, + res: `cddl = * rule` + }, + + { + title: 'preserves internal indentation', + html: `
+            rule = (
+              typedef /
+              groupdef
+            )
+            typedef = tstr
+              groupdef = tstr
+          
`, + res: `rule = ( + typedef / + groupdef +) +typedef = tstr + groupdef = tstr` + }, + + { + title: 'extracts CDDL module names from data-cddl-module', + html: `
cddl = tstr
`, + res: [ + { name: 'all', cddl: 'cddl = tstr' }, + { name: 'mod', cddl: 'cddl = tstr' } + ] + }, + + { + title: 'extracts CDDL module name defined as class', + html: `
cddl = tstr
`, + res: [ + { name: 'all', cddl: 'cddl = tstr' }, + { name: 'mod1', cddl: 'cddl = tstr' }, + { name: 'mod2', cddl: 'cddl = tstr' } + ] + }, + + { + title: 'assembles CDDL in modules', + html: ` +
+        rule = (cddl1 / cddl2)
+      
+
+        cddl1 = tstr
+      
+
+        cddl2 = tstr
+      
+
+        typedef = tstr
+        groupdef = tstr
+      
+ `, + res: [ + { + name: 'all', + cddl: +`rule = (cddl1 / cddl2) + +cddl1 = tstr + +cddl2 = tstr + +typedef = tstr +groupdef = tstr` + }, + { + name: 'mod1', + cddl: +`cddl1 = tstr + +typedef = tstr +groupdef = tstr` + }, + { + name: 'mod2', + cddl: +`cddl2 = tstr + +typedef = tstr +groupdef = tstr` + } + ] + } +]; + +function isString(x) { + return Object.prototype.toString.call(x) === "[object String]"; +} + +describe("CDDL extraction", function () { + this.slow(5000); + + let browser; + let extractCode; + + before(async () => { + const extractBundle = await rollup({ + input: path.resolve(scriptPath, '../src/browserlib/extract-cddl.mjs') + }); + const extractOutput = (await extractBundle.generate({ + name: 'extractCddl', + format: 'iife' + })).output; + extractCode = extractOutput[0].code; + + browser = await puppeteer.launch({ headless: true }); + }); + + for (const test of tests) { + it(test.title, async () => { + const page = await browser.newPage(); + page.setContent(test.html); + await page.addScriptTag({ content: extractCode }); + + const extracted = await page.evaluate(async () => extractCddl()); + await page.close(); + + if (isString(test.res)) { + assert.deepEqual(extracted.length, 1, + `Expected extraction to return 1 CDDL module, got ${extracted.length}`); + assert.deepEqual(extracted[0].cddl, test.res); + } + else { + assert.deepEqual(extracted, test.res); + } + }); + } + + after(async () => { + await browser.close(); + }); +});