Skip to content

Commit 7571b63

Browse files
heiskr and Copilot authored
Fix link checker false positives (#59774)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 1b4a2ba commit 7571b63

3 files changed

Lines changed: 148 additions & 20 deletions

File tree

src/links/lib/excluded-links.yml

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@
7070
- is: https://moodle.org
7171
- is: https://azure.microsoft.com
7272
- is: https://api.octocorp.ghe.com
73-
- is: https://platform.openai.com/docs/guides/safety-best-practices
74-
- is: https://platform.openai.com/docs/guides/function-calling
73+
- startsWith: https://platform.openai.com/docs
74+
- startsWith: https://openai.com
7575
- is: https://global.rel.tunnels.api.visualstudio.com/api/version
7676
- is: https://www.wireguard.com/quickstart/
7777
- is: https://docs.openstack.org/horizon/latest/
@@ -85,8 +85,6 @@
8585
- is: https://jsonformatter.org/
8686
- is: https://mvnrepository.com/artifact/org.xwiki.platform/xwiki-platform-oldcore
8787
- is: https://mvnrepository.com/artifact/com.google.guava/guava
88-
- startsWith: https://platform.openai.com/docs/models
89-
- startsWith: https://openai.com/index
9088
- is: https://github.com/github-linguist/linguist/compare/master...octocat:master
9189
- is: https://www.servicenow.com/docs/bundle/utah-devops/page/product/enterprise-dev-ops/concept/github-integration-dev-ops.html
9290
- startsWith: https://www.ilo.org
@@ -95,8 +93,7 @@
9593
- is: https://www.nongnu.org/oath-toolkit/man-oathtool.html
9694
- is: https://www.gnu.org/software/emacs/
9795
- is: https://www.transparency.org/what-is-corruption
98-
- startsWith: https://platform.openai.com/docs/api-reference/
99-
- is: https://azuredownloads-g3ahgwb5b8bkbxhd.b01.azurefd.net/github-copilot/
96+
- startsWith: https://azuredownloads-g3ahgwb5b8bkbxhd.b01.azurefd.net/github-copilot/
10097
- is: https://www.anthropic.com/claude/sonnet
10198
- is: https://www.psiexams.com/become-psi-test-center/computer-specifications/
10299
- is: https://www.buymeacoffee.com/
@@ -115,3 +112,21 @@
115112
- is: https://collectd.org/documentation/manpages/collectd.conf.html#plugin-fhcount
116113
- is: https://mywiki.wooledge.org/BashPitfalls
117114
- startsWith: https://code.visualstudio.com/docs/configure/telemetry
115+
116+
# npmjs.com blocks automated link checkers with 403.
117+
- startsWith: https://www.npmjs.com
118+
119+
# Azure Marketplace blocks automated link checkers with 403.
120+
- startsWith: https://azuremarketplace.microsoft.com
121+
122+
# Splunk docs blocks automated link checkers with 403.
123+
- startsWith: https://docs.splunk.com
124+
125+
# HashiCorp rate-limits automated requests (429).
126+
- startsWith: https://www.hashicorp.com
127+
128+
# ISO website blocks automated link checkers with 403.
129+
- startsWith: https://www.iso.org
130+
131+
# Example domain used in style guide documentation.
132+
- startsWith: https://some-docs.com

src/links/lib/extract-links.ts

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ import type { Context, Page } from '@/types'
1616
// Link patterns for Markdown
1717
const INTERNAL_LINK_PATTERN = /\]\(\/[^)]+\)/g
1818
const AUTOTITLE_LINK_PATTERN = /\[AUTOTITLE\]\(([^)]+)\)/g
19-
const EXTERNAL_LINK_PATTERN = /\]\((https?:\/\/[^)]+)\)/g
19+
// Handles one level of balanced parentheses in URLs (e.g., Wikipedia links)
20+
const EXTERNAL_LINK_PATTERN = /\]\((https?:\/\/(?:[^()\s]+|\([^()]*\))*)\)/g
2021
const IMAGE_LINK_PATTERN = /!\[[^\]]*\]\(([^)]+)\)/g
2122

2223
// Anchor link patterns (for same-page links)
@@ -82,10 +83,19 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
8283
const anchorLinks: ExtractedLink[] = []
8384
const imageLinks: ExtractedLink[] = []
8485

86+
// Strip fenced code blocks to avoid checking example/placeholder URLs
87+
// Replaces non-newline characters with spaces to preserve line numbers and positions
88+
const strippedContent = content.replace(
89+
/^ {0,3}(`{3,})[^\n]*\n[\s\S]*?^ {0,3}\1\s*$/gm,
90+
(match) => {
91+
return match.replace(/[^\n]/g, ' ')
92+
},
93+
)
94+
8595
// Extract AUTOTITLE links first (they're a special case of internal links)
8696
let match
87-
while ((match = AUTOTITLE_LINK_PATTERN.exec(content)) !== null) {
88-
const { line, column } = getLineAndColumn(content, match.index)
97+
while ((match = AUTOTITLE_LINK_PATTERN.exec(strippedContent)) !== null) {
98+
const { line, column } = getLineAndColumn(strippedContent, match.index)
8999
const href = match[1].split('#')[0] // Remove anchor if present
90100
if (href.startsWith('/')) {
91101
internalLinks.push({
@@ -102,17 +112,17 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
102112
AUTOTITLE_LINK_PATTERN.lastIndex = 0
103113

104114
// Extract regular internal links
105-
while ((match = INTERNAL_LINK_PATTERN.exec(content)) !== null) {
115+
while ((match = INTERNAL_LINK_PATTERN.exec(strippedContent)) !== null) {
106116
// Skip if this is an AUTOTITLE link (already captured)
107117
const fullMatch = match[0]
108-
if (content.substring(match.index - 10, match.index).includes('AUTOTITLE')) {
118+
if (strippedContent.substring(match.index - 10, match.index).includes('AUTOTITLE')) {
109119
continue
110120
}
111121

112-
const { line, column } = getLineAndColumn(content, match.index)
122+
const { line, column } = getLineAndColumn(strippedContent, match.index)
113123
// Extract href from ](/path) format
114124
const href = fullMatch.substring(2, fullMatch.length - 1).split('#')[0]
115-
const text = extractLinkText(content, match.index)
125+
const text = extractLinkText(strippedContent, match.index)
116126

117127
internalLinks.push({
118128
href,
@@ -127,10 +137,10 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
127137
INTERNAL_LINK_PATTERN.lastIndex = 0
128138

129139
// Extract external links
130-
while ((match = EXTERNAL_LINK_PATTERN.exec(content)) !== null) {
131-
const { line, column } = getLineAndColumn(content, match.index)
140+
while ((match = EXTERNAL_LINK_PATTERN.exec(strippedContent)) !== null) {
141+
const { line, column } = getLineAndColumn(strippedContent, match.index)
132142
const href = match[1]
133-
const text = extractLinkText(content, match.index)
143+
const text = extractLinkText(strippedContent, match.index)
134144

135145
externalLinks.push({
136146
href,
@@ -144,8 +154,8 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
144154
EXTERNAL_LINK_PATTERN.lastIndex = 0
145155

146156
// Extract anchor links
147-
while ((match = ANCHOR_LINK_PATTERN.exec(content)) !== null) {
148-
const { line, column } = getLineAndColumn(content, match.index)
157+
while ((match = ANCHOR_LINK_PATTERN.exec(strippedContent)) !== null) {
158+
const { line, column } = getLineAndColumn(strippedContent, match.index)
149159
const href = match[0].substring(2, match[0].length - 1)
150160

151161
anchorLinks.push({
@@ -160,8 +170,8 @@ export function extractLinksFromMarkdown(content: string): LinkExtractionResult
160170
ANCHOR_LINK_PATTERN.lastIndex = 0
161171

162172
// Extract image links
163-
while ((match = IMAGE_LINK_PATTERN.exec(content)) !== null) {
164-
const { line, column } = getLineAndColumn(content, match.index)
173+
while ((match = IMAGE_LINK_PATTERN.exec(strippedContent)) !== null) {
174+
const { line, column } = getLineAndColumn(strippedContent, match.index)
165175
const href = match[1]
166176

167177
// Only include internal images (starting with /)
@@ -345,6 +355,19 @@ export function checkInternalLink(
345355
}
346356
}
347357

358+
// Strip language prefix and check redirects (which are stored without it)
359+
const langPrefixMatch = resolved.match(/^\/[a-z]{2}\//)
360+
if (langPrefixMatch) {
361+
const withoutLang = resolved.slice(langPrefixMatch[0].length - 1)
362+
if (redirects[withoutLang]) {
363+
return {
364+
exists: true,
365+
isRedirect: true,
366+
redirectTarget: redirects[withoutLang],
367+
}
368+
}
369+
}
370+
348371
return { exists: false, isRedirect: false }
349372
}
350373

src/links/tests/extract-links.ts

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,75 @@ Also [versioned](/enterprise-server@{{ currentVersion }}/admin).
140140
expect(result.internalLinks.length).toBeGreaterThanOrEqual(0)
141141
})
142142

143+
test('extracts external links with parentheses in URLs', () => {
144+
const content = `
145+
See the [shebang article](https://en.wikipedia.org/wiki/Shebang_(Unix)) for more.
146+
Also [Continuum](https://en.wikipedia.org/wiki/Continuum_(measurement)) is relevant.
147+
`
148+
const result = extractLinksFromMarkdown(content)
149+
150+
expect(result.externalLinks).toHaveLength(2)
151+
expect(result.externalLinks[0].href).toBe('https://en.wikipedia.org/wiki/Shebang_(Unix)')
152+
expect(result.externalLinks[1].href).toBe(
153+
'https://en.wikipedia.org/wiki/Continuum_(measurement)',
154+
)
155+
})
156+
157+
test('skips links inside fenced code blocks', () => {
158+
const content = `
159+
Here is [a real link](https://example.com).
160+
161+
\`\`\`yaml
162+
![badge](https://github.com/octocat/repo/actions/workflows/ci.yml/badge.svg)
163+
[example](https://fake-example.com/not-real)
164+
\`\`\`
165+
166+
And [another real link](https://real.example.com/page).
167+
`
168+
const result = extractLinksFromMarkdown(content)
169+
170+
expect(result.externalLinks).toHaveLength(2)
171+
expect(result.externalLinks[0].href).toBe('https://example.com')
172+
expect(result.externalLinks[1].href).toBe('https://real.example.com/page')
173+
})
174+
175+
test('preserves correct line numbers when code blocks are stripped', () => {
176+
const content = `Line 1
177+
[Link on line 2](/path/one)
178+
\`\`\`
179+
code block on line 3
180+
code block on line 4
181+
\`\`\`
182+
Line 6
183+
[Link on line 8](/path/two)
184+
`
185+
const result = extractLinksFromMarkdown(content)
186+
187+
expect(result.internalLinks).toHaveLength(2)
188+
expect(result.internalLinks[0].line).toBe(2)
189+
// Line numbers are preserved because code block content is replaced with spaces
190+
expect(result.internalLinks[1].line).toBe(8)
191+
})
192+
193+
test('skips links inside indented fenced code blocks', () => {
194+
const content = `
195+
Here is [a real link](https://example.com).
196+
197+
1. Step one:
198+
199+
\`\`\`yaml
200+
[example](https://fake-example.com/not-real)
201+
\`\`\`
202+
203+
And [another real link](https://real.example.com/page).
204+
`
205+
const result = extractLinksFromMarkdown(content)
206+
207+
expect(result.externalLinks).toHaveLength(2)
208+
expect(result.externalLinks[0].href).toBe('https://example.com')
209+
expect(result.externalLinks[1].href).toBe('https://real.example.com/page')
210+
})
211+
143212
test('handles complex nested brackets', () => {
144213
const content = `
145214
Use the [\`git clone\`](/repositories/cloning) command.
@@ -186,6 +255,8 @@ describe('checkInternalLink', () => {
186255
const redirects = {
187256
'/en/old-path': '/en/new-path',
188257
'/en/deprecated': '/en/current',
258+
'/enterprise-server@3.19/actions/old-path': '/enterprise-server@3.19/actions/new-path',
259+
'/actions/legacy-path': '/actions/current-path',
189260
}
190261

191262
test('finds direct page match', () => {
@@ -233,6 +304,25 @@ describe('checkInternalLink', () => {
233304
const result = checkInternalLink('/enterprise-server@latest/does/not/exist', pageMap, redirects)
234305
expect(result.exists).toBe(false)
235306
})
307+
308+
test('finds redirect after stripping language prefix', () => {
309+
// Links from rendered HTML have /en/ prefix but redirects are stored without it
310+
const result = checkInternalLink(
311+
'/en/enterprise-server@3.19/actions/old-path',
312+
pageMap,
313+
redirects,
314+
)
315+
expect(result.exists).toBe(true)
316+
expect(result.isRedirect).toBe(true)
317+
expect(result.redirectTarget).toBe('/enterprise-server@3.19/actions/new-path')
318+
})
319+
320+
test('finds versionless redirect after stripping language prefix', () => {
321+
const result = checkInternalLink('/en/actions/legacy-path', pageMap, redirects)
322+
expect(result.exists).toBe(true)
323+
expect(result.isRedirect).toBe(true)
324+
expect(result.redirectTarget).toBe('/actions/current-path')
325+
})
236326
})
237327

238328
describe('isAssetLink', () => {

0 commit comments

Comments
 (0)