@@ -16,6 +16,7 @@ import { setFastlySurrogateKey, SURROGATE_ENUMS } from '@/frame/middleware/set-f
1616import { readCompressedJsonFileFallbackLazily } from '@/frame/lib/read-json-file'
1717import { archivedCacheControl , languageCacheControl } from '@/frame/middleware/cache-control'
1818import { pathLanguagePrefixed , languagePrefixPathRegex } from '@/languages/lib/languages-server'
19+ import { languages as allLanguages } from '@/languages/lib/languages'
1920import getRedirect , { splitPathByLanguage } from '@/redirects/lib/get-redirect'
2021import getRemoteJSON from '@/frame/lib/get-remote-json'
2122import { ExtendedRequest } from '@/types'
@@ -214,6 +215,29 @@ export default async function archivedEnterpriseVersions(
214215 return res . safeRedirect ( redirectCode , redirectJson [ req . path ] )
215216 }
216217 }
218+ // Short-circuit requests that will never resolve on the upstream
219+ // GitHub Pages repos, avoiding unnecessary network requests.
220+ const earlyNotFound = getEarlyNotFoundReason ( req . path , requestedVersion )
221+ if ( earlyNotFound ) {
222+ statsd . increment ( 'middleware.archived_early_not_found' , 1 , [
223+ `reason:${ earlyNotFound } ` ,
224+ `version:${ requestedVersion } ` ,
225+ ] )
226+ cacheAggressively ( res )
227+ return res . status ( 404 ) . type ( 'text' ) . send ( 'Page not found' )
228+ }
229+
230+ // Requests without a language prefix for versions > 2.17 will always
231+ // 404 upstream (the archive repos store pages under /en/, /zh/, etc.).
232+ // Skip the fetch and let downstream middleware handle the redirect.
233+ if (
234+ versionSatisfiesRange ( requestedVersion , `>${ lastVersionWithoutArchivedRedirectsFile } ` ) &&
235+ ! pathLanguagePrefixed ( req . path )
236+ ) {
237+ statsd . increment ( 'middleware.archived_skip_no_language' , 1 , [ `version:${ requestedVersion } ` ] )
238+ return next ( )
239+ }
240+
217241 // Retrieve the page from the archived repo
218242 const doGet = ( ) =>
219243 fetchWithRetry (
@@ -246,15 +270,17 @@ export default async function archivedEnterpriseVersions(
246270 } )
247271 }
248272
249- // Log errors for non-200 responses to help identify issues with archived content
273+ // Log non-200 responses — use warn for 404s (expected for missing archived
274+ // pages) and error for genuine upstream failures (5xx, timeouts).
250275 if ( r . status !== 200 ) {
251276 let upstreamBody : string | undefined
252277 try {
253278 upstreamBody = await r . text ( )
254279 } catch {
255280 // ignore — body reading failure shouldn't affect error handling
256281 }
257- logger . error ( 'Failed to fetch archived enterprise content' , {
282+ const level = r . status === 404 ? 'warn' : 'error'
283+ logger [ level ] ( 'Failed to fetch archived enterprise content' , {
258284 version : requestedVersion ,
259285 path : req . path ,
260286 status : r . status ,
@@ -502,3 +528,49 @@ function splitByLanguage(uri: string) {
502528 }
503529 return [ language , withoutLanguage ]
504530}
531+
// Matches a leading language-like segment of a request path (e.g. "/en" or
// "/en/..."), capturing the language code. Besides every key of
// `allLanguages`, the alternation also accepts "cn", the old Chinese
// language code used in archives ≤3.2.
const archiveLanguageCodeAlternation = Object.keys(allLanguages).join('|')
const archiveLanguagePrefixRegex = new RegExp(`^/(${archiveLanguageCodeAlternation}|cn)(/|$)`)
535+
536+ // Detects request paths that will never resolve on the upstream GitHub
537+ // Pages archive repos, so we can 404 immediately without making a
538+ // network request. Returns a short reason string, or null if the
539+ // request looks plausible.
/**
 * Decides whether a request can be answered with a 404 right away because it
 * will never resolve on the upstream GitHub Pages archive repos, sparing a
 * network round trip.
 *
 * @param reqPath - The request path to inspect.
 * @param version - The requested version, compared against each language's
 *   `firstArchivedVersion`.
 * @returns A short reason string ('double-slash', 'developer-developer' or
 *   'language-not-in-version'), or `null` when the request looks plausible.
 */
function getEarlyNotFoundReason(reqPath: string, version: string): string | null {
  // A path containing "//" never resolves (e.g. ".../about-2fa//index.html").
  if (reqPath.includes('//')) return 'double-slash'

  // A doubled "/developer/developer/" segment comes from broken crawler URLs
  // for the old developer.github.com site.
  if (reqPath.includes('/developer/developer/')) return 'developer-developer'

  // Without a recognizable language prefix there is nothing more to check.
  const prefix = archiveLanguagePrefixRegex.exec(reqPath)
  if (!prefix) return null

  const code = prefix[1]

  // "cn" was the old Chinese language code; those archives are ancient and
  // effectively dead traffic, so they always 404.
  if (code === 'cn') return 'language-not-in-version'

  // Languages that declare no `firstArchivedVersion` are never rejected here.
  const firstArchived = allLanguages[code]?.firstArchivedVersion
  if (!firstArchived) return null

  // 404 when the requested version predates the first archive of this
  // language (e.g. /zh/ on v3.0 → 404 because zh started in 3.3).
  return versionSatisfiesRange(version, `>=${firstArchived}`) ? null : 'language-not-in-version'
}
0 commit comments