Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Major rewrite to make things more consistent and extensible #664

Merged
merged 9 commits into from
Aug 23, 2024
27 changes: 12 additions & 15 deletions src/lib/study-algorithms.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
import { JSDOM } from 'jsdom';
import { recordCategorizedAnomaly } from './util.js';

const possibleAnomalies = [
'missingTaskForPromise',
'missingTaskForEvent'
];


/**
* Normalize whitespaces in string to make analysis easier
Expand Down Expand Up @@ -57,9 +50,8 @@ function nestParallelSteps(algo) {
/**
* Main function, study all algorithms
*/
function studyAlgorithms(edResults) {
function studyAlgorithms(specs) {
const report = [];
const recordAnomaly = recordCategorizedAnomaly(report, 'algorithms', possibleAnomalies);

// Return human-friendly markdown that identifies the given algorithm
function getAlgoName(algo) {
Expand Down Expand Up @@ -95,11 +87,19 @@ function studyAlgorithms(edResults) {
// https://w3c.github.io/clipboard-apis/#dom-clipboard-read
!html.includes('systemClipboardRepresentation')
) {
recordAnomaly(spec, 'missingTaskForPromise', `${getAlgoName(algo)} has a parallel step that resolves/rejects a promise directly`);
report.push({
name: 'missingTaskForPromise',
message: `${getAlgoName(algo)} has a parallel step that resolves/rejects a promise directly`,
spec
});
return true;
}
else if (html.match(/fire an?( \w+)? event/i)) {
recordAnomaly(spec, 'missingTaskForEvent', `${getAlgoName(algo)} has a parallel step that fires an event directly`);
report.push({
name: 'missingTaskForEvent',
message: `${getAlgoName(algo)} has a parallel step that fires an event directly`,
spec
});
return true;
}
}
Expand Down Expand Up @@ -133,13 +133,10 @@ function studyAlgorithms(edResults) {
return anomalyFound;
}

// We're only interested in specs that define algorithms
const specs = edResults.filter(spec => !!spec.algorithms);

// Study algorithms in turn.
// Note: the root level of each algorithm is its first step. It may say
// something like "run these steps in parallel" in particular.
for (const spec of specs) {
for (const spec of specs.filter(spec => !!spec.algorithms)) {
for (const algo of spec.algorithms) {
nestParallelSteps(algo);
studyAlgorithmStep(spec, algo, algo);
Expand Down
124 changes: 65 additions & 59 deletions src/lib/study-backrefs.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,3 @@
import { loadCrawlResults, recordCategorizedAnomaly } from './util.js';
import { fileURLToPath } from 'node:url';

const possibleAnomalies = [
'brokenLinks',
'datedUrls',
'evolvingLinks',
'frailLinks',
'nonCanonicalRefs',
'notDfn',
'notExported',
'outdatedSpecs',
'unknownSpecs'
];

/**
* The backrefs analyzer only checks links to other specs. This function returns
* true when a link does target a spec, and false if it targets something else
Expand All @@ -39,65 +24,99 @@ const matchSpecUrl = url =>
TODO: DRY
Copied from browser-specs/src/compute-shortname.js
*/
function computeShortname (url) {
function parseUrl (url) {
function computeShortname(url) {
function parseUrl(url) {
// Handle /TR/ URLs
const w3cTr = url.match(/^https?:\/\/(?:www\.)?w3\.org\/TR\/([^/]+)\/$/);
const w3cTr = url.match(/^https?:\/\/(?:www\.)?w3\.org\/TR\/([^\/]+)\/$/);
if (w3cTr) {
return w3cTr[1];
}

// Handle WHATWG specs
const whatwg = url.match(/\/\/(.+)\.spec\.whatwg\.org\/?/);
const whatwg = url.match(/\/\/(.+)\.spec\.whatwg\.org\//);
if (whatwg) {
return whatwg[1];
return whatwg[1];
}

// Handle TC39 Proposals
const tc39 = url.match(/\/\/tc39\.es\/proposal-([^/]+)\/$/);
const tc39 = url.match(/\/\/tc39\.es\/proposal-([^\/]+)\/$/);
if (tc39) {
return 'tc39-' + tc39[1];
return "tc39-" + tc39[1];
}


// Handle Khronos extensions
const khronos = url.match(/https:\/\/registry\.khronos\.org\/webgl\/extensions\/([^/]+)\/$/);
const khronos = url.match(/https:\/\/registry\.khronos\.org\/webgl\/extensions\/([^\/]+)\/$/);
if (khronos) {
return khronos[1];
return khronos[1];
}

// Handle extension specs defined in the same repo as the main spec
// (e.g. generate a "gamepad-extensions" name for
// https://w3c.github.io/gamepad/extensions.html")
const ext = url.match(/\/.*\.github\.io\/([^/]+)\/(extensions?)\.html$/);
const ext = url.match(/\/.*\.github\.io\/([^\/]+)\/(extensions?)\.html$/);
if (ext) {
return ext[1] + '-' + ext[2];
}

// Handle draft specs on GitHub, excluding the "webappsec-" prefix for
// specifications developed by the Web Application Security Working Group
const github = url.match(/\/.*\.github\.io\/(?:webappsec-)?([^/]+)\//);
const github = url.match(/\/.*\.github\.io\/(?:webappsec-)?([^\/]+)\//);
if (github) {
return github[1];
return github[1];
}

// Handle CSS WG specs
const css = url.match(/\/drafts\.(?:csswg|fxtf|css-houdini)\.org\/([^/]+)\//);
const css = url.match(/\/drafts\.(?:csswg|fxtf|css-houdini)\.org\/([^\/]+)\//);
if (css) {
return css[1];
}

// Handle SVG drafts
const svg = url.match(/\/svgwg\.org\/specs\/(?:svg-)?([^/]+)\//);
const svg = url.match(/\/svgwg\.org\/specs\/(?:svg-)?([^\/]+)\//);
if (svg) {
return 'svg-' + svg[1];
return "svg-" + svg[1];
}

// Handle IETF RFCs
const rfcs = url.match(/\/www.rfc-editor\.org\/rfc\/(rfc[0-9]+)/);
if (rfcs) {
return rfcs[1];
}

// Handle IETF group drafts
const ietfDraft = url.match(/\/datatracker\.ietf\.org\/doc\/html\/draft-ietf-[^\-]+-([^\/]+)/);
if (ietfDraft) {
return ietfDraft[1];
}

// Handle IETF individual drafts, stripping group name
// TODO: retrieve the list of IETF groups to make sure that the group name
// is an actual group name and not the beginning of the shortname:
// https://datatracker.ietf.org/api/v1/group/group/
// (multiple requests needed due to pagination, "?limit=1000" is the max)
const ietfIndDraft = url.match(/\/datatracker\.ietf\.org\/doc\/html\/draft-[^\-]+-([^\/]+)/);
if (ietfIndDraft) {
if (ietfIndDraft[1].indexOf('-') !== -1) {
return ietfIndDraft[1].slice(ietfIndDraft[1].indexOf('-') + 1);
}
else {
return ietfIndDraft[1];
}
}

// Handle TAG findings
const tag = url.match(/^https?:\/\/(?:www\.)?w3\.org\/2001\/tag\/doc\/([^\/]+)\/?$/);
if (tag) {
return tag[1];
}

// Return name when one was given
if (!url.match(/\//)) {
return url;
}

throw new Error(`Cannot extract meaningful name from ${url}`);
throw `Cannot extract meaningful name from ${url}`;
}

// Parse the URL to extract the name
Expand All @@ -107,8 +126,8 @@ function computeShortname (url) {
// Latin characters (a-z letters, digits, underscore and "-"), and that it
// only contains a dot for fractional levels at the end of the name
// (e.g. "blah-1.2" is good but "blah.blah" and "blah-3.1-blah" are not)
if (!name.match(/^[\w-]+((?<=-\d+)\.\d+)?$/)) {
throw new Error(`Specification name contains unexpected characters: ${name} (extracted from ${url})`);
if (!name.match(/^[\w\-]+((?<=\-v?\d+)\.\d+)?$/)) {
throw `Specification name contains unexpected characters: ${name} (extracted from ${url})`;
}

return name;
Expand Down Expand Up @@ -234,18 +253,21 @@ const matchAnchor = (url, anchor) => link => {
return link === (url + '#' + anchor) || link === (url + '#' + encodeURIComponent(anchor));
};

function studyBackrefs (edResults, trResults = [], htmlFragments = {}, shortnameFilter) {
trResults = trResults || [];
async function studyBackrefs(specs, { crawlResults = null, trResults = [], htmlFragments = null } = {}) {
crawlResults = crawlResults ?? specs;
const report = [];

edResults.forEach(spec => {
if (shortnameFilter && spec.shortname !== shortnameFilter) return;
studyLinks(spec, spec.links?.rawlinks, report, edResults, trResults, htmlFragments);
// Donwload automatic map of multipages anchors in HTML spec
const fragmentsUrl = 'https://html.spec.whatwg.org/multipage/fragment-links.json';
htmlFragments = htmlFragments ?? await fetch(fragmentsUrl).then(r => r.json());

specs.forEach(spec => {
studyLinks(spec, spec.links?.rawlinks, report, crawlResults, trResults, htmlFragments);
// given the current limitation of classification of links for bikeshed
// https:/w3c/reffy/issues/1584
// we also check autolinks for bikeshed specs
if (spec.generator === "bikeshed") {
studyLinks(spec, spec.links?.autolinks, report, edResults, trResults, htmlFragments);
studyLinks(spec, spec.links?.autolinks, report, crawlResults, trResults, htmlFragments);
}
});
return report;
Expand All @@ -254,7 +276,9 @@ function studyBackrefs (edResults, trResults = [], htmlFragments = {}, shortname
function studyLinks(spec, links, report, edResults, trResults, htmlFragments) {
if (!links) return;

const recordAnomaly = recordCategorizedAnomaly(report, 'links', possibleAnomalies);
function recordAnomaly(spec, name, message) {
report.push({ name, message, spec });
}

Object.keys(links)
.filter(matchSpecUrl)
Expand Down Expand Up @@ -421,22 +445,4 @@ function studyLinks(spec, links, report, edResults, trResults, htmlFragments) {
});
}

/**************************************************
Export methods for use as module
**************************************************/
export default studyBackrefs;

if (process.argv[1] === fileURLToPath(import.meta.url)) {
const crawl = await loadCrawlResults(process.argv[2], process.argv[3]);
let htmlFragments = {};
try {
console.info('Downloading HTML spec fragments data…');
htmlFragments = await fetch('https://html.spec.whatwg.org/multipage/fragment-links.json').then(r => r.json());
console.info('- done');
} catch (err) {
console.error('- failed: could not fetch HTML fragments data, may report false positive broken links on HTML spec');
}

const results = studyBackrefs(crawl.ed, crawl.tr, htmlFragments, process.argv[4] ?? undefined);
console.log(results);
}
export default studyBackrefs;
Loading