Skip to content

Commit

Permalink
Major rewrite to make things more consistent and extensible
Browse files Browse the repository at this point in the history
**IMPORTANT: Work in progress, not yet functional!**

The code in Strudy did the same thing in multiple ways:
- The main CLI was linked to the first type of analysis reports, that still get
published to `w3c/webref-analysis`.
- A few CLIs were there, used mainly for exploration purposes.
- The study modules could sometimes be used as CLI, for no apparent reason.
- The study modules used different conventions for inputs/outputs
- The reporting code was somewhat separated from the rest, whereas it could be
useful to extend it.
- Preparing flexible reports and amending the "file an issue" workflow was a
bit tedious.

This rewrite attempts to solve the above by:
- Rewriting study modules to expose one similar study function
- Adding a generic study module that knows about all anomaly types, dispatches
the study to the different study modules depending on the request, and can
structure and format the result as needed in markdown to file one or more
issues.
- Adding a few tests to be more confident that the analyses are doing the right
thing!
- **(Not done yet)** Updating the entry points to use the new study module
- **(Not done yet)** Getting rid of all side CLIs
  • Loading branch information
tidoust committed Aug 2, 2024
1 parent ba21462 commit 785dfc5
Show file tree
Hide file tree
Showing 12 changed files with 1,418 additions and 161 deletions.
27 changes: 12 additions & 15 deletions src/lib/study-algorithms.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
import { JSDOM } from 'jsdom';
import { recordCategorizedAnomaly } from './util.js';

const possibleAnomalies = [
'missingTaskForPromise',
'missingTaskForEvent'
];


/**
* Normalize whitespaces in string to make analysis easier
Expand Down Expand Up @@ -57,9 +50,8 @@ function nestParallelSteps(algo) {
/**
* Main function, study all algorithms
*/
function studyAlgorithms(edResults) {
function studyAlgorithms(specs) {
const report = [];
const recordAnomaly = recordCategorizedAnomaly(report, 'algorithms', possibleAnomalies);

// Return human-friendly markdown that identifies the given algorithm
function getAlgoName(algo) {
Expand Down Expand Up @@ -95,11 +87,19 @@ function studyAlgorithms(edResults) {
// https://w3c.github.io/clipboard-apis/#dom-clipboard-read
!html.includes('systemClipboardRepresentation')
) {
recordAnomaly(spec, 'missingTaskForPromise', `${getAlgoName(algo)} has a parallel step that resolves/rejects a promise directly`);
report.push({
name: 'missingTaskForPromise',
message: `${getAlgoName(algo)} has a parallel step that resolves/rejects a promise directly`,
spec
});
return true;
}
else if (html.match(/fire an?( \w+)? event/i)) {
recordAnomaly(spec, 'missingTaskForEvent', `${getAlgoName(algo)} has a parallel step that fires an event directly`);
report.push({
name: 'missingTaskForEvent',
message: `${getAlgoName(algo)} has a parallel step that fires an event directly`,
spec
});
return true;
}
}
Expand Down Expand Up @@ -133,13 +133,10 @@ function studyAlgorithms(edResults) {
return anomalyFound;
}

// We're only interested in specs that define algorithms
const specs = edResults.filter(spec => !!spec.algorithms);

// Study algorithms in turn.
// Note: the root level of each algorithm is its first step. It may say
// something like "run these steps in parallel" in particular.
for (const spec of specs) {
for (const spec of specs.filter(spec => !!spec.algorithms)) {
for (const algo of spec.algorithms) {
nestParallelSteps(algo);
studyAlgorithmStep(spec, algo, algo);
Expand Down
124 changes: 65 additions & 59 deletions src/lib/study-backrefs.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,3 @@
import { loadCrawlResults, recordCategorizedAnomaly } from './util.js';
import { fileURLToPath } from 'node:url';

const possibleAnomalies = [
'brokenLinks',
'datedUrls',
'evolvingLinks',
'frailLinks',
'nonCanonicalRefs',
'notDfn',
'notExported',
'outdatedSpecs',
'unknownSpecs'
];

/**
* The backrefs analyzer only checks links to other specs. This function returns
* true when a link does target a spec, and false if it targets something else
Expand All @@ -39,65 +24,99 @@ const matchSpecUrl = url =>
TODO: DRY
Copied from browser-specs/src/compute-shortname.js
*/
function computeShortname (url) {
function parseUrl (url) {
function computeShortname(url) {
function parseUrl(url) {
// Handle /TR/ URLs
const w3cTr = url.match(/^https?:\/\/(?:www\.)?w3\.org\/TR\/([^/]+)\/$/);
const w3cTr = url.match(/^https?:\/\/(?:www\.)?w3\.org\/TR\/([^\/]+)\/$/);
if (w3cTr) {
return w3cTr[1];
}

// Handle WHATWG specs
const whatwg = url.match(/\/\/(.+)\.spec\.whatwg\.org\/?/);
const whatwg = url.match(/\/\/(.+)\.spec\.whatwg\.org\//);
if (whatwg) {
return whatwg[1];
return whatwg[1];
}

// Handle TC39 Proposals
const tc39 = url.match(/\/\/tc39\.es\/proposal-([^/]+)\/$/);
const tc39 = url.match(/\/\/tc39\.es\/proposal-([^\/]+)\/$/);
if (tc39) {
return 'tc39-' + tc39[1];
return "tc39-" + tc39[1];
}


// Handle Khronos extensions
const khronos = url.match(/https:\/\/registry\.khronos\.org\/webgl\/extensions\/([^/]+)\/$/);
const khronos = url.match(/https:\/\/registry\.khronos\.org\/webgl\/extensions\/([^\/]+)\/$/);
if (khronos) {
return khronos[1];
return khronos[1];
}

// Handle extension specs defined in the same repo as the main spec
// (e.g. generate a "gamepad-extensions" name for
// https://w3c.github.io/gamepad/extensions.html")
const ext = url.match(/\/.*\.github\.io\/([^/]+)\/(extensions?)\.html$/);
const ext = url.match(/\/.*\.github\.io\/([^\/]+)\/(extensions?)\.html$/);
if (ext) {
return ext[1] + '-' + ext[2];
}

// Handle draft specs on GitHub, excluding the "webappsec-" prefix for
// specifications developed by the Web Application Security Working Group
const github = url.match(/\/.*\.github\.io\/(?:webappsec-)?([^/]+)\//);
const github = url.match(/\/.*\.github\.io\/(?:webappsec-)?([^\/]+)\//);
if (github) {
return github[1];
return github[1];
}

// Handle CSS WG specs
const css = url.match(/\/drafts\.(?:csswg|fxtf|css-houdini)\.org\/([^/]+)\//);
const css = url.match(/\/drafts\.(?:csswg|fxtf|css-houdini)\.org\/([^\/]+)\//);
if (css) {
return css[1];
}

// Handle SVG drafts
const svg = url.match(/\/svgwg\.org\/specs\/(?:svg-)?([^/]+)\//);
const svg = url.match(/\/svgwg\.org\/specs\/(?:svg-)?([^\/]+)\//);
if (svg) {
return 'svg-' + svg[1];
return "svg-" + svg[1];
}

// Handle IETF RFCs
const rfcs = url.match(/\/www.rfc-editor\.org\/rfc\/(rfc[0-9]+)/);
if (rfcs) {
return rfcs[1];
}

// Handle IETF group drafts
const ietfDraft = url.match(/\/datatracker\.ietf\.org\/doc\/html\/draft-ietf-[^\-]+-([^\/]+)/);
if (ietfDraft) {
return ietfDraft[1];
}

// Handle IETF individual drafts, stripping group name
// TODO: retrieve the list of IETF groups to make sure that the group name
// is an actual group name and not the beginning of the shortname:
// https://datatracker.ietf.org/api/v1/group/group/
// (multiple requests needed due to pagination, "?limit=1000" is the max)
const ietfIndDraft = url.match(/\/datatracker\.ietf\.org\/doc\/html\/draft-[^\-]+-([^\/]+)/);
if (ietfIndDraft) {
if (ietfIndDraft[1].indexOf('-') !== -1) {
return ietfIndDraft[1].slice(ietfIndDraft[1].indexOf('-') + 1);
}
else {
return ietfIndDraft[1];
}
}

// Handle TAG findings
const tag = url.match(/^https?:\/\/(?:www\.)?w3\.org\/2001\/tag\/doc\/([^\/]+)\/?$/);
if (tag) {
return tag[1];
}

// Return name when one was given
if (!url.match(/\//)) {
return url;
}

throw new Error(`Cannot extract meaningful name from ${url}`);
throw `Cannot extract meaningful name from ${url}`;
}

// Parse the URL to extract the name
Expand All @@ -107,8 +126,8 @@ function computeShortname (url) {
// Latin characters (a-z letters, digits, underscore and "-"), and that it
// only contains a dot for fractional levels at the end of the name
// (e.g. "blah-1.2" is good but "blah.blah" and "blah-3.1-blah" are not)
if (!name.match(/^[\w-]+((?<=-\d+)\.\d+)?$/)) {
throw new Error(`Specification name contains unexpected characters: ${name} (extracted from ${url})`);
if (!name.match(/^[\w\-]+((?<=\-v?\d+)\.\d+)?$/)) {
throw `Specification name contains unexpected characters: ${name} (extracted from ${url})`;
}

return name;
Expand Down Expand Up @@ -234,18 +253,21 @@ const matchAnchor = (url, anchor) => link => {
return link === (url + '#' + anchor) || link === (url + '#' + encodeURIComponent(anchor));
};

function studyBackrefs (edResults, trResults = [], htmlFragments = {}, shortnameFilter) {
trResults = trResults || [];
async function studyBackrefs(specs, { crawlResults = null, trResults = [], htmlFragments = null } = {}) {
crawlResults = crawlResults ?? specs;
const report = [];

edResults.forEach(spec => {
if (shortnameFilter && spec.shortname !== shortnameFilter) return;
studyLinks(spec, spec.links?.rawlinks, report, edResults, trResults, htmlFragments);
// Download automatic map of multipage anchors in HTML spec
const fragmentsUrl = 'https://html.spec.whatwg.org/multipage/fragment-links.json';
htmlFragments = htmlFragments ?? await fetch(fragmentsUrl).then(r => r.json());

specs.forEach(spec => {
studyLinks(spec, spec.links?.rawlinks, report, crawlResults, trResults, htmlFragments);
// given the current limitation of classification of links for bikeshed
// https://github.com/w3c/reffy/issues/1584
// we also check autolinks for bikeshed specs
if (spec.generator === "bikeshed") {
studyLinks(spec, spec.links?.autolinks, report, edResults, trResults, htmlFragments);
studyLinks(spec, spec.links?.autolinks, report, crawlResults, trResults, htmlFragments);
}
});
return report;
Expand All @@ -254,7 +276,9 @@ function studyBackrefs (edResults, trResults = [], htmlFragments = {}, shortname
function studyLinks(spec, links, report, edResults, trResults, htmlFragments) {
if (!links) return;

const recordAnomaly = recordCategorizedAnomaly(report, 'links', possibleAnomalies);
function recordAnomaly(spec, name, message) {
report.push({ name, message, spec });
}

Object.keys(links)
.filter(matchSpecUrl)
Expand Down Expand Up @@ -421,22 +445,4 @@ function studyLinks(spec, links, report, edResults, trResults, htmlFragments) {
});
}

/**************************************************
Export methods for use as module
**************************************************/
export default studyBackrefs;

if (process.argv[1] === fileURLToPath(import.meta.url)) {
const crawl = await loadCrawlResults(process.argv[2], process.argv[3]);
let htmlFragments = {};
try {
console.info('Downloading HTML spec fragments data…');
htmlFragments = await fetch('https://html.spec.whatwg.org/multipage/fragment-links.json').then(r => r.json());
console.info('- done');
} catch (err) {
console.error('- failed: could not fetch HTML fragments data, may report false positive broken links on HTML spec');
}

const results = studyBackrefs(crawl.ed, crawl.tr, htmlFragments, process.argv[4] ?? undefined);
console.log(results);
}
export default studyBackrefs;
Loading

0 comments on commit 785dfc5

Please sign in to comment.