Skip to content

Commit 4d4128b

Browse files
heiskr and Copilot authored
Reduce load from archived enterprise version requests (#60356)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent fb5b51e commit 4d4128b

File tree

2 files changed

+86
-2
lines changed

2 files changed

+86
-2
lines changed

src/archives/middleware/archived-enterprise-versions.ts

Lines changed: 74 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import { setFastlySurrogateKey, SURROGATE_ENUMS } from '@/frame/middleware/set-f
1616
import { readCompressedJsonFileFallbackLazily } from '@/frame/lib/read-json-file'
1717
import { archivedCacheControl, languageCacheControl } from '@/frame/middleware/cache-control'
1818
import { pathLanguagePrefixed, languagePrefixPathRegex } from '@/languages/lib/languages-server'
19+
import { languages as allLanguages } from '@/languages/lib/languages'
1920
import getRedirect, { splitPathByLanguage } from '@/redirects/lib/get-redirect'
2021
import getRemoteJSON from '@/frame/lib/get-remote-json'
2122
import { ExtendedRequest } from '@/types'
@@ -214,6 +215,29 @@ export default async function archivedEnterpriseVersions(
214215
return res.safeRedirect(redirectCode, redirectJson[req.path])
215216
}
216217
}
218+
// Short-circuit requests that will never resolve on the upstream
219+
// GitHub Pages repos, avoiding unnecessary network requests.
220+
const earlyNotFound = getEarlyNotFoundReason(req.path, requestedVersion)
221+
if (earlyNotFound) {
222+
statsd.increment('middleware.archived_early_not_found', 1, [
223+
`reason:${earlyNotFound}`,
224+
`version:${requestedVersion}`,
225+
])
226+
cacheAggressively(res)
227+
return res.status(404).type('text').send('Page not found')
228+
}
229+
230+
// Requests without a language prefix for versions > 2.17 will always
231+
// 404 upstream (the archive repos store pages under /en/, /zh/, etc.).
232+
// Skip the fetch and let downstream middleware handle the redirect.
233+
if (
234+
versionSatisfiesRange(requestedVersion, `>${lastVersionWithoutArchivedRedirectsFile}`) &&
235+
!pathLanguagePrefixed(req.path)
236+
) {
237+
statsd.increment('middleware.archived_skip_no_language', 1, [`version:${requestedVersion}`])
238+
return next()
239+
}
240+
217241
// Retrieve the page from the archived repo
218242
const doGet = () =>
219243
fetchWithRetry(
@@ -246,15 +270,17 @@ export default async function archivedEnterpriseVersions(
246270
})
247271
}
248272

249-
// Log errors for non-200 responses to help identify issues with archived content
273+
// Log non-200 responses — use warn for 404s (expected for missing archived
274+
// pages) and error for genuine upstream failures (5xx, timeouts).
250275
if (r.status !== 200) {
251276
let upstreamBody: string | undefined
252277
try {
253278
upstreamBody = await r.text()
254279
} catch {
255280
// ignore — body reading failure shouldn't affect error handling
256281
}
257-
logger.error('Failed to fetch archived enterprise content', {
282+
const level = r.status === 404 ? 'warn' : 'error'
283+
logger[level]('Failed to fetch archived enterprise content', {
258284
version: requestedVersion,
259285
path: req.path,
260286
status: r.status,
@@ -502,3 +528,49 @@ function splitByLanguage(uri: string) {
502528
}
503529
return [language, withoutLanguage]
504530
}
531+
532+
// Matches a leading language segment at the start of a request path and
// captures the language code. Besides every key of `allLanguages`, it also
// accepts "cn", the old Chinese language code used in archives ≤3.2. The
// segment must be followed by "/" or the end of the path.
const archiveLanguagePrefixRegex = new RegExp(
  '^/(' + Object.keys(allLanguages).join('|') + '|cn)(/|$)',
)
535+
536+
/**
 * Spots request paths that can never resolve on the upstream GitHub Pages
 * archive repos, so the caller can answer 404 right away instead of making
 * a network request.
 *
 * @param reqPath - The request path to inspect.
 * @param version - The requested archived GHES version.
 * @returns A short reason string when the request is hopeless, or `null`
 *   when the request looks plausible.
 */
function getEarlyNotFoundReason(reqPath: string, version: string): string | null {
  // Substrings that never appear in a resolvable archive path, checked in
  // order, paired with the reason reported for each:
  //   "//"                    e.g. ".../about-2fa//index.html"
  //   "/developer/developer/" broken crawler URLs from the old
  //                           developer.github.com site
  const hopelessSubstrings: Array<[string, string]> = [
    ['//', 'double-slash'],
    ['/developer/developer/', 'developer-developer'],
  ]
  for (const [needle, reason] of hopelessSubstrings) {
    if (reqPath.includes(needle)) {
      return reason
    }
  }

  // From here on, only the language prefix can rule the request out.
  const prefixMatch = archiveLanguagePrefixRegex.exec(reqPath)
  if (!prefixMatch) {
    return null
  }
  const languageCode = prefixMatch[1]

  // "cn" is the old Chinese language code. Those archives are ancient and
  // effectively dead traffic, so they always 404.
  if (languageCode === 'cn') {
    return 'language-not-in-version'
  }

  // Languages without a `firstArchivedVersion` are never rejected here.
  const firstArchived = allLanguages[languageCode]?.firstArchivedVersion
  if (!firstArchived) {
    return null
  }

  // Reject versions older than the first archive that shipped this language
  // (e.g. /zh/ on v3.0 → 404 because zh started in 3.3).
  return versionSatisfiesRange(version, `>=${firstArchived}`) ? null : 'language-not-in-version'
}

src/languages/lib/languages.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ export interface Language {
2727
locale: LocaleCode
2828
redirectPatterns?: RegExp[]
2929
dir?: string
30+
// The earliest archived GHES version that includes this language.
31+
// Used to short-circuit requests for translations that don't exist
32+
// in a given archive. English is always available, so it leaves this unset.
33+
firstArchivedVersion?: string
3034
}
3135

3236
export interface Languages {
@@ -48,6 +52,7 @@ export const languages: Languages = {
4852
code: 'es',
4953
hreflang: 'es',
5054
locale: 'es-es',
55+
firstArchivedVersion: '3.0',
5156
},
5257
ja: {
5358
name: 'Japanese',
@@ -56,6 +61,7 @@ export const languages: Languages = {
5661
hreflang: 'ja',
5762
redirectPatterns: [/^\/jp/],
5863
locale: 'ja-jp',
64+
firstArchivedVersion: '3.0',
5965
},
6066
pt: {
6167
name: 'Portuguese',
@@ -64,6 +70,7 @@ export const languages: Languages = {
6470
hreflang: 'pt',
6571
redirectPatterns: [/^\/br/],
6672
locale: 'pt-br',
73+
firstArchivedVersion: '3.0',
6774
},
6875
zh: {
6976
name: 'Simplified Chinese',
@@ -72,20 +79,23 @@ export const languages: Languages = {
7279
hreflang: 'zh-Hans',
7380
redirectPatterns: [/^\/cn/, /^\/zh-\w{2}/],
7481
locale: 'zh-cn',
82+
firstArchivedVersion: '3.3',
7583
},
7684
ru: {
7785
name: 'Russian',
7886
nativeName: 'Русский',
7987
code: 'ru',
8088
hreflang: 'ru',
8189
locale: 'ru-ru',
90+
firstArchivedVersion: '3.3',
8291
},
8392
fr: {
8493
name: 'French',
8594
nativeName: 'Français',
8695
code: 'fr',
8796
hreflang: 'fr',
8897
locale: 'fr-fr',
98+
firstArchivedVersion: '3.3',
8999
},
90100
ko: {
91101
name: 'Korean',
@@ -94,13 +104,15 @@ export const languages: Languages = {
94104
hreflang: 'ko',
95105
redirectPatterns: [/^\/kr/],
96106
locale: 'ko-kr',
107+
firstArchivedVersion: '3.3',
97108
},
98109
de: {
99110
name: 'German',
100111
nativeName: 'Deutsch',
101112
code: 'de',
102113
hreflang: 'de',
103114
locale: 'de-de',
115+
firstArchivedVersion: '3.3',
104116
},
105117
}
106118

0 commit comments

Comments
 (0)