Merge pull request #12502 from github/repo-sync

Octomerger · web-flow · commit 5954e7238545 · 2021-11-30T16:54:45.000-05:00
repo sync
diff --git a/.github/workflows/create-translation-batch-pr.yml b/.github/workflows/create-translation-batch-pr.yml
@@ -138,8 +138,8 @@ jobs:
 
       - name: Check in CSV report
         run: |
-          mkdir -p log
-          csvFile=log/${{ matrix.language_code }}-resets.csv
+          mkdir -p translations/log
+          csvFile=translations/log/${{ matrix.language_code }}-resets.csv
           script/i18n/report-reset-files.js --report-type=csv --language=${{ matrix.language_code }} --log-file=/tmp/batch.log > $csvFile
           git add -f $csvFile && git commit -m "Check in ${{ matrix.language }} CSV report" || echo "Nothing to commit"
 
diff --git a/lib/excluded-links.js b/lib/excluded-links.js
@@ -1,18 +1,28 @@
-// Linkinator treats the following as regex.
+/**
+ * This file exports a mix of strings and of regexes. Linkinator relies
+ * on this in `script/check-english-links.js` when we encounter external
+ * links that we *specifically ignore*. That means that URLs or patterns
+ * mentioned in this file might appear within our content but we don't
+ * bother checking that they actually work.
+ */
+
+/* eslint-disable prefer-regex-literals */
+
 export default [
   // Skip GitHub search links.
-  'https://github.com/search\\?',
-  'https://github.com/github/gitignore/search\\?',
+  // E.g. https://github.com/search?foo=bar
+  new RegExp('https://github\\.com/search\\?'),
+  new RegExp('https://github\\.com/github/gitignore/search\\?'),
 
   // These links require auth.
-  'https://github.com/settings/profile',
-  'https://github.com/github/docs/edit',
-  'https://github.com/github/insights-releases/releases/latest',
-  'https://classroom.github.com/videos',
+  new RegExp('https://github\\.com/settings/profile'),
+  new RegExp('https://github\\.com/github/docs/edit'),
+  new RegExp('https://github\\.com/github/insights-releases/releases/latest'),
+  new RegExp('https://classroom\\.github.com/videos'),
 
   // Oneoff links that link checkers think are broken but are not.
   'https://haveibeenpwned.com/',
-  'https://www.ilo.org/dyn/normlex/en/f\\?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
+  'https://www.ilo.org/dyn/normlex/en/f?p=NORMLEXPUB:12100:0::NO::P12100_ILO_CODE:P029',
   'https://www.linkedin.com/company/github',
   'https://www.facebook.com/',
   'https://ko-fi.com/',
diff --git a/script/check-english-links.js b/script/check-english-links.js
@@ -52,7 +52,7 @@ program
 // Skip non-English content.
 const languagesToSkip = Object.keys(libLanguages)
   .filter((code) => code !== 'en')
-  .map((code) => `${root}/${code}`)
+  .map((code) => new RegExp(`${root}/${code}`))
 
 // Skip deprecated Enterprise content.
 // Capture the old format https://docs.github.com/enterprise/2.1/
@@ -66,7 +66,19 @@ const config = {
   recurse: !program.opts().dryRun,
   silent: true,
   // The values in this array are treated as regexes.
-  linksToSkip: [enterpriseReleasesToSkip, ...languagesToSkip, ...excludedLinks],
+  linksToSkip: linksToSkipFactory([enterpriseReleasesToSkip, ...languagesToSkip, ...excludedLinks]),
+}
+
+// Return a predicate that decides, as quickly as possible, whether a
+// given href should be skipped. Plain strings go into a `Set` and must
+// match the href exactly; RegExp entries are tried with `.some()`.
+// We do this because, when the `linksToSkip` config is an array,
+// Linkinator's default implementation turns the entries into new
+// regexes again for every single URL it checks.
+function linksToSkipFactory(regexAndURLs) {
+  const set = new Set(regexAndURLs.filter((regexOrURL) => typeof regexOrURL === 'string'))
+  const regexes = regexAndURLs.filter((regexOrURL) => regexOrURL instanceof RegExp)
+  return (href) => set.has(href) || regexes.some((regex) => regex.test(href))
 }
 
 main()
diff --git a/tests/meta/repository-references.js b/tests/meta/repository-references.js
@@ -1,5 +1,6 @@
+import fs from 'fs'
+
 import walkSync from 'walk-sync'
-import readFileAsync from '../../lib/readfile-async.js'
 import minimatch from 'minimatch'
 
 /*
@@ -75,33 +76,56 @@ const REPO_REGEXP = /\/\/github\.com\/github\/(?!docs[/'"\n])([\w-.]+)/gi
 const IGNORE_PATHS = [
   '.git',
   '.next',
+  '.vscode', // Not part of the repo but could be for a developer locally
   'node_modules',
   'translations',
+  '.linkinator',
   '**/*.png', // Do not check images or font files.
   '**/*.jpg', // We could just put all of assets/* here, but that would prevent any
   '**/*.gif', // READMEs or other text-based files from being checked.
   '**/*.pdf',
   '**/*.ico',
   '**/*.woff',
+  '**/*.csv',
+  '**/*.br', // E.g. the search index .json.br files
+  '**/*.graphql', // E.g. data/graphql/ghec/schema.docs.graphql
+  'package-lock.json', // At the time of writing it's 1.5MB!
+  '.linkinator/full.log', // Only present if you've run linkinator
+  'lib/search/popular-pages.json', // used to build search indexes
+  'tests/**/*.json',
 
   'content/early-access', // Not committed to public repository.
   'data/early-access', // Not committed to public repository.
   'data/release-notes', // These include links to many internal issues in Liquid comments.
+  'lib/redirects/.redirects-cache*',
 ]
 
 describe('check if a GitHub-owned private repository is referenced', () => {
   const filenames = walkSync(process.cwd(), {
     directories: false,
     ignore: IGNORE_PATHS,
-  })
+  }).filter(
+    (filename) =>
+      // Skip the large static json files because they're not code.
+      !(
+        filename.includes('static') &&
+        (filename.endsWith('.json') || filename.endsWith('.json.br'))
+      )
+  )
 
-  test.each(filenames)('in file %s', async (filename) => {
-    const file = await readFileAsync(filename, 'utf8')
-    const allowDocs = ALLOW_DOCS_PATHS.some((path) => minimatch(filename, path))
+  test.each(filenames)('in file %s', (filename) => {
+    // When you're reading many small files, it's faster to do it
+    // *synchronously* because the event-loop overhead is less since
+    // the disk I/O is sufficiently small.
+    const file = fs.readFileSync(filename, 'utf8')
     const matches = Array.from(file.matchAll(REPO_REGEXP))
       .map(([, repoName]) => repoName)
       .filter((repoName) => !PUBLIC_REPOS.has(repoName))
-      .filter((repoName) => !(allowDocs && repoName.startsWith('docs')))
+      .filter((repoName) => {
+        return !(
+          repoName.startsWith('docs') && ALLOW_DOCS_PATHS.some((path) => minimatch(filename, path))
+        )
+      })
     expect(
       matches,
       `Please edit ${filename} to remove references to ${matches.join(', ')}`