Skip to content

Commit 2f6f076

Browse files
author
Peter Bengtsson
authored
LinkChecker and better linksToSkip function (#23001)
* LinkChecker and better linksToSkip function Part of #1253 * try now * escaped * make sure it skips all non-english links * feedbacked
1 parent e59c79a commit 2f6f076

3 files changed

Lines changed: 33 additions & 10 deletions

File tree

lib/excluded-links.js

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,28 @@
1-
// Linkinator treats the following as regex.
1+
/**
2+
* This file exports a mix of strings and regexes. Linkinator relies
3+
* on this in `script/check-english-links.js` when we encounter external
4+
* links that we *specifically ignore*. That means that URLs or patterns
5+
* mentioned in this file might appear within our content but we don't
6+
* bother checking that they actually work.
7+
*/
8+
9+
/* eslint-disable prefer-regex-literals */
10+
211
export default [
312
// Skip GitHub search links.
4-
'https://github.com/search\\?',
5-
'https://github.com/github/gitignore/search\\?',
13+
// E.g. https://github.com/search?foo=bar
14+
new RegExp('https://github\\.com/search\\?'),
15+
new RegExp('https://github\\.com/github/gitignore/search\\?'),
616

717
// These links require auth.
8-
'https://github.com/settings/profile',
9-
'https://github.com/github/docs/edit',
10-
'https://github.com/github/insights-releases/releases/latest',
11-
'https://classroom.github.com/videos',
18+
new RegExp('https://github\\.com/settings/profile'),
19+
new RegExp('https://github\\.com/github/docs/edit'),
20+
new RegExp('https://github\\.com/github/insights-releases/releases/latest'),
21+
new RegExp('https://classroom\\.github.com/videos'),
1222

1323
// Oneoff links that link checkers think are broken but are not.
1424
'https://haveibeenpwned.com/',
15-
'https://www.ilo.org/dyn/normlex/en/f\\?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
25+
'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
1626
'https://www.linkedin.com/company/github',
1727
'https://www.facebook.com/',
1828
'https://ko-fi.com/',

script/check-english-links.js

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ program
5252
// Skip non-English content.
5353
const languagesToSkip = Object.keys(libLanguages)
5454
.filter((code) => code !== 'en')
55-
.map((code) => `${root}/${code}`)
55+
.map((code) => new RegExp(`${root}/${code}`))
5656

5757
// Skip deprecated Enterprise content.
5858
// Capture the old format https://docs.github.com/enterprise/2.1/
@@ -66,7 +66,19 @@ const config = {
6666
recurse: !program.opts().dryRun,
6767
silent: true,
6868
// Strings in this array are matched exactly; RegExp values are tested against each href.
69-
linksToSkip: [enterpriseReleasesToSkip, ...languagesToSkip, ...excludedLinks],
69+
linksToSkip: linksToSkipFactory([enterpriseReleasesToSkip, ...languagesToSkip, ...excludedLinks]),
70+
}
71+
72+
// Return a function that can as quickly as possible check if a certain
73+
// href input should be skipped.
74+
// Do this so we can use a `Set` and an `iterable.some()` for a speedier
75+
// check. By default, if you set Linkinator's `linksToSkip` config to
76+
// an array, Linkinator turns each entry into a new regex again for
77+
// every single URL it checks.
78+
function linksToSkipFactory(regexAndURLs) {
79+
const set = new Set(regexAndURLs.filter((regexOrURL) => typeof regexOrURL === 'string'))
80+
const regexes = regexAndURLs.filter((regexOrURL) => regexOrURL instanceof RegExp)
81+
return (href) => set.has(href) || regexes.some((regex) => regex.test(href))
7082
}
7183

7284
main()

tests/meta/repository-references.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ const IGNORE_PATHS = [
7979
'.vscode', // Not part of the repo but could be for a developer locally
8080
'node_modules',
8181
'translations',
82+
'.linkinator',
8283
'**/*.png', // Do not check images or font files.
8384
'**/*.jpg', // We could just put all of assets/* here, but that would prevent any
8485
'**/*.gif', // READMEs or other text-based files from being checked.

0 commit comments

Comments
 (0)