Skip to content

Commit 90dc326

Browse files
heiskr and Copilot authored
Improve Copilot landing page search relevancy (#59870)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent d013c69 commit 90dc326

File tree

5 files changed: 303 additions and 65 deletions.

src/landings/components/shared/LandingArticleGridWithFilter.tsx

Lines changed: 6 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -5,10 +5,10 @@ import cx from 'classnames'
55

66
import { Link } from '@/frame/components/Link'
77
import { useTranslation } from '@/languages/components/useTranslation'
8-
import { ArticleCardItems, ChildTocItem, TocItem } from '@/landings/types'
8+
import { ChildTocItem, TocItem } from '@/landings/types'
99
import { LandingType } from '@/landings/context/LandingContext'
1010
import type { QueryParams } from '@/search/components/hooks/useMultiQueryParams'
11-
import { fuzzyMatchScore } from '@/landings/lib/fuzzy-match'
11+
import { flattenArticles, deriveStopWords, searchArticles } from '@/landings/lib/article-search'
1212

1313
import styles from './LandingArticleGridWithFilter.module.scss'
1414

@@ -22,38 +22,6 @@ type ArticleGridProps = {
2222

2323
const ALL_CATEGORIES = 'all_categories'
2424

25-
// Helper function to recursively flatten nested articles
26-
// Excludes index pages (pages with childTocItems)
27-
const flattenArticlesRecursive = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems => {
28-
const flattened: ArticleCardItems = []
29-
30-
for (const article of articles) {
31-
// If the article has children, recursively process them but don't include the parent (index page)
32-
if (article.childTocItems && article.childTocItems.length > 0) {
33-
flattened.push(...flattenArticlesRecursive(article.childTocItems))
34-
} else {
35-
// Only add articles that don't have children (actual article pages, not index pages)
36-
flattened.push(article as ChildTocItem)
37-
}
38-
}
39-
40-
return flattened
41-
}
42-
43-
// Wrapper function that flattens, deduplicates, and sorts alphabetically by title (only once)
44-
const flattenArticles = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems => {
45-
const flattened = flattenArticlesRecursive(articles)
46-
// Deduplicate articles by fullPath - needed when a page lists both individual
47-
// articles and their parent group as children (e.g., bespoke landing pages)
48-
const seen = new Set<string>()
49-
const deduped = flattened.filter((article) => {
50-
if (seen.has(article.fullPath)) return false
51-
seen.add(article.fullPath)
52-
return true
53-
})
54-
return deduped.sort((a, b) => a.title.localeCompare(b.title))
55-
}
56-
5725
// Hook to get current articles per page based on screen size
5826
const useResponsiveArticlesPerPage = () => {
5927
const [articlesPerPage, setArticlesPerPage] = useState(9) // Default to desktop
@@ -102,6 +70,9 @@ export const ArticleGrid = ({
10270
// Recursively flatten all articles from tocItems, including both direct children and nested articles
10371
const allArticles = useMemo(() => flattenArticles(tocItems), [tocItems])
10472

73+
// Auto-derive stop words from article frequency
74+
const stopWords = useMemo(() => deriveStopWords(allArticles), [allArticles])
75+
10576
// Filter articles based on includedCategories for discovery landing pages
10677
// For bespoke landing pages, show all articles regardless of includedCategories
10778
const filteredArticlesByLandingType = useMemo(() => {
@@ -160,27 +131,7 @@ export const ArticleGrid = ({
160131
let results = filteredArticlesByLandingType
161132

162133
if (searchQuery) {
163-
// Calculate match scores for each article
164-
const scoredResults = results
165-
.map((token) => {
166-
let maxScore = -1
167-
for (const value of Object.values(token)) {
168-
if (typeof value === 'string') {
169-
maxScore = Math.max(maxScore, fuzzyMatchScore(value, searchQuery))
170-
} else if (Array.isArray(value)) {
171-
for (const item of value) {
172-
if (typeof item === 'string') {
173-
maxScore = Math.max(maxScore, fuzzyMatchScore(item, searchQuery))
174-
}
175-
}
176-
}
177-
}
178-
return { token, score: maxScore }
179-
})
180-
.filter(({ score }) => score >= 0)
181-
.sort((a, b) => b.score - a.score)
182-
183-
results = scoredResults.map(({ token }) => token)
134+
results = searchArticles(results, searchQuery, stopWords)
184135
}
185136

186137
if (selectedCategory !== ALL_CATEGORIES) {

src/landings/lib/article-search.ts

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,81 @@
1+
import type { ArticleCardItems, ChildTocItem, TocItem } from '@/landings/types'
2+
import { fuzzyMatchScore, stripStopWords } from '@/landings/lib/fuzzy-match'
3+
4+
// Default fraction of articles (0–1) that must contain a word before
// deriveStopWords treats that word as a stop word.
const STOP_WORD_THRESHOLD = 0.8
5+
6+
/**
 * Collects the leaf entries of a TOC tree, depth-first and in document order.
 *
 * An entry with a non-empty `childTocItems` list is treated as an index page:
 * it is left out of the result and replaced by its flattened descendants.
 * Entries without children (or with an empty list) are kept as articles.
 */
const flattenArticlesRecursive = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems =>
  articles.flatMap((entry) => {
    const children = entry.childTocItems
    return children && children.length > 0
      ? flattenArticlesRecursive(children)
      : [entry as ChildTocItem]
  })
21+
22+
/**
 * Builds the flat article list for a landing page.
 *
 * Leaf articles are gathered from the TOC tree, duplicates (same `fullPath`)
 * are dropped keeping the first occurrence, and the survivors are returned
 * sorted by title using `localeCompare`.
 */
export const flattenArticles = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems => {
  // A Map iterates in insertion order, so the first article seen for a path wins.
  const firstByPath = new Map<string, ArticleCardItems[number]>()
  for (const leaf of flattenArticlesRecursive(articles)) {
    if (!firstByPath.has(leaf.fullPath)) {
      firstByPath.set(leaf.fullPath, leaf)
    }
  }

  const unique = Array.from(firstByPath.values())
  unique.sort((left, right) => left.title.localeCompare(right.title))
  return unique
}
33+
34+
/**
 * Derives stop words from the articles themselves: words that show up in the
 * title or intro of at least `threshold` (a 0–1 fraction) of all articles.
 * Such words match nearly everything and therefore carry little search signal.
 *
 * Words are lowercased and split on whitespace; each word is counted at most
 * once per article. Returns an empty list when there are no articles.
 */
export const deriveStopWords = (
  articles: ArticleCardItems,
  threshold = STOP_WORD_THRESHOLD,
): string[] => {
  if (articles.length === 0) return []

  // word -> number of articles whose title/intro contains it
  const articleFrequency = new Map<string, number>()
  for (const { title, intro } of articles) {
    const tokens = [title, intro].join(' ').toLowerCase().split(/\s+/)
    const seenInThisArticle = new Set<string>()
    for (const token of tokens) {
      // Skip the empty tokens produced by leading/trailing whitespace, and repeats.
      if (!token || seenInThisArticle.has(token)) continue
      seenInThisArticle.add(token)
      articleFrequency.set(token, (articleFrequency.get(token) || 0) + 1)
    }
  }

  const minCount = articles.length * threshold
  const stopWords: string[] = []
  articleFrequency.forEach((count, word) => {
    if (count >= minCount) stopWords.push(word)
  })
  return stopWords
}
51+
52+
/**
 * Ranks articles against a search query and returns only those that match,
 * best match first.
 *
 * Stop words are removed from both the query and the searched text. The
 * searched fields are the title, the intro, and every category entry; an
 * article's score is the best `fuzzyMatchScore` across those fields, and
 * articles whose best score is negative are dropped.
 *
 * If nothing is left of the query after removing stop words, every article is
 * returned in its original order (each gets the same score of 0.5).
 */
export const searchArticles = (
  articles: ArticleCardItems,
  query: string,
  stopWords: string[],
): ArticleCardItems => {
  const cleanedQuery = stripStopWords(query, stopWords)

  // Best score over all searchable string fields of one article.
  const scoreArticle = (article: ArticleCardItems[number]): number => {
    if (!cleanedQuery) return 0.5

    const candidates: unknown[] = [article.title, article.intro, ...(article.category || [])]
    let best = -1
    for (const candidate of candidates) {
      if (typeof candidate !== 'string') continue
      const cleanedValue = stripStopWords(candidate, stopWords)
      best = Math.max(best, fuzzyMatchScore(cleanedValue, cleanedQuery))
    }
    return best
  }

  const matches: { article: ArticleCardItems[number]; score: number }[] = []
  for (const article of articles) {
    const score = scoreArticle(article)
    if (score >= 0) matches.push({ article, score })
  }

  // Array.prototype.sort is stable, so equal scores keep their incoming order.
  matches.sort((left, right) => right.score - left.score)
  return matches.map((match) => match.article)
}

src/landings/lib/fuzzy-match.ts

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,12 @@
1-
// 60% threshold: Empirically chosen to balance precision vs recall.
2-
// Lower values (e.g., 40%) match too loosely (e.g., "agent" matches "urgent").
3-
// Higher values (e.g., 80%) miss reasonable matches like singular/plural variations.
4-
// 60% captures most typo corrections and word form variations while filtering noise.
5-
const BIGRAM_COVERAGE_THRESHOLD = 0.6
1+
// 70% threshold: Raised from 60% to reduce false positives from short queries.
2+
// At 60%, "billing" matched "installing" and "pricing" matched "writing pr descriptions".
3+
// 70% still captures singular/plural ("agent"→"agents" = 80%, "repository"→"repositories" = 73%)
4+
// while filtering the worst noise (both of those false positives were at 67%).
5+
const BIGRAM_COVERAGE_THRESHOLD = 0.7
6+
7+
// Short search terms produce very few bigrams, making spurious matches likely.
8+
// Require exact substring match for terms with 4 or fewer non-space characters.
9+
const SHORT_TERM_MAX_LENGTH = 4
610

711
// Memoization cache for bigram computation
812
const bigramCache = new Map<string, Set<string>>()
@@ -44,6 +48,10 @@ export const fuzzyMatchScore = (text: string, searchTerm: string): number => {
4448
// Exact substring match gets highest score
4549
if (lowerText.includes(lowerSearch)) return 1
4650

51+
// Short search terms (e.g., "mcp", "pr", "test") produce too few bigrams
52+
// for reliable fuzzy matching, so require exact substring only.
53+
if (lowerSearch.replace(/\s+/g, '').length <= SHORT_TERM_MAX_LENGTH) return -1
54+
4755
// Bigram coverage: what % of search bigrams appear in text
4856
// This works better than Jaccard when text is much longer than search
4957
const score = bigramCoverage(text, searchTerm)
@@ -54,3 +62,13 @@ export const fuzzyMatchScore = (text: string, searchTerm: string): number => {
5462
/** Reports whether `searchTerm` matches `text`, i.e. `fuzzyMatchScore` is non-negative. */
export const fuzzyMatch = (text: string, searchTerm: string): boolean =>
  fuzzyMatchScore(text, searchTerm) >= 0
65+
66+
/**
 * Removes stop words from `text`, keeping every other word in its original
 * order and casing.
 *
 * On product-specific landing pages (e.g. /copilot), the product name appears
 * in nearly every article, drowning out the actual query.
 *
 * Words are separated on whitespace, and the result is re-joined with single
 * spaces and trimmed. Matching is case-insensitive on both sides, so a stop
 * list containing "Copilot" removes "copilot", "Copilot" and "COPILOT" alike.
 *
 * @param text - The query or article field to clean.
 * @param stopWords - Words to remove; any casing is accepted.
 * @returns The remaining words joined by single spaces ('' if none remain).
 */
export const stripStopWords = (text: string, stopWords: string[]): string => {
  // Lowercase the stop list once: lookups become O(1) per word, and callers
  // that pass mixed-case stop words no longer get a silent no-op.
  const blocked = new Set(stopWords.map((word) => word.toLowerCase()))

  return text
    .split(/\s+/)
    .filter((word) => !blocked.has(word.toLowerCase()))
    .join(' ')
    .trim()
}
Lines changed: 135 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,135 @@
1+
import { describe, expect, test } from 'vitest'
2+
3+
import { flattenArticles, deriveStopWords, searchArticles } from '@/landings/lib/article-search'
4+
import type { TocItem, ChildTocItem } from '@/landings/types'
5+
6+
/**
 * Test fixture factory: builds a leaf article from its title, intro and path.
 * `category` is stored as `null` when it is not supplied.
 */
const makeArticle = (
  title: string,
  intro: string,
  fullPath: string,
  category?: string[],
): ChildTocItem => {
  const resolvedCategory = category ? category : null
  return { title, intro, fullPath, category: resolvedCategory }
}
17+
18+
// flattenArticles: leaf extraction, de-duplication by path, and title ordering.
describe('flattenArticles', () => {
  test('flattens nested tocItems into leaf articles', () => {
    // One index page with two leaf children; only the children should come back.
    const tocItems: TocItem[] = [
      {
        title: 'Parent',
        fullPath: '/parent',
        childTocItems: [
          { title: 'Child A', fullPath: '/parent/a', intro: 'Intro A' },
          { title: 'Child B', fullPath: '/parent/b', intro: 'Intro B' },
        ],
      },
    ]

    const leaves = flattenArticles(tocItems)

    expect(leaves).toHaveLength(2)
    expect(leaves.map(({ title }) => title)).toEqual(['Child A', 'Child B'])
  })

  test('deduplicates articles by fullPath', () => {
    // Two entries share the same fullPath, so only one may survive.
    const tocItems: TocItem[] = [
      { title: 'Article', fullPath: '/same', intro: 'First' },
      { title: 'Article', fullPath: '/same', intro: 'Duplicate' },
    ]

    expect(flattenArticles(tocItems)).toHaveLength(1)
  })

  test('sorts alphabetically by title', () => {
    // Deliberately supplied out of order.
    const tocItems: TocItem[] = [
      { title: 'Zebra', fullPath: '/z', intro: '' },
      { title: 'Alpha', fullPath: '/a', intro: '' },
      { title: 'Middle', fullPath: '/m', intro: '' },
    ]

    const titles = flattenArticles(tocItems).map(({ title }) => title)

    expect(titles).toEqual(['Alpha', 'Middle', 'Zebra'])
  })

  test('excludes index pages (parents with children)', () => {
    const tocItems: TocItem[] = [
      {
        title: 'Index page',
        fullPath: '/index',
        childTocItems: [{ title: 'Leaf', fullPath: '/index/leaf', intro: '' }],
      },
    ]

    const leaves = flattenArticles(tocItems)

    expect(leaves).toHaveLength(1)
    expect(leaves[0].title).toBe('Leaf')
  })
})
67+
68+
// deriveStopWords: frequency-based stop word detection over titles and intros.
describe('deriveStopWords', () => {
  test('finds words appearing in 80%+ of articles', () => {
    // "github" and "copilot" occur in all five fixtures;
    // "agents" and "billing" each occur in only one.
    const fixtures: [string, string, string][] = [
      ['GitHub Copilot agents', 'Use Copilot for coding', '/a'],
      ['GitHub Copilot billing', 'Manage Copilot billing', '/b'],
      ['GitHub Copilot extensions', 'Build Copilot extensions', '/c'],
      ['GitHub Copilot settings', 'Configure Copilot settings', '/d'],
      ['GitHub Copilot MCP', 'Use Copilot with MCP servers', '/e'],
    ]
    const corpus = fixtures.map(([title, intro, fullPath]) => makeArticle(title, intro, fullPath))

    const derived = deriveStopWords(corpus)

    expect(derived).toContain('copilot')
    expect(derived).toContain('github')
    expect(derived).not.toContain('agents')
    expect(derived).not.toContain('billing')
  })

  test('returns empty array for empty articles', () => {
    expect(deriveStopWords([])).toEqual([])
  })

  test('respects custom threshold', () => {
    const corpus = [
      makeArticle('Copilot agents', 'Use agents', '/a'),
      makeArticle('Copilot billing', 'Manage billing', '/b'),
      makeArticle('Copilot settings', 'Configure settings', '/c'),
    ]

    // "copilot" is in 3 of 3 articles (100%), so it qualifies at any threshold.
    const lenient = deriveStopWords(corpus, 0.5)
    expect(lenient).toContain('copilot')

    // With a threshold of 1.0 a word must appear in every article to qualify.
    const strict = deriveStopWords(corpus, 1.0)
    expect(strict).toContain('copilot')
    expect(strict).not.toContain('agents')
  })
})
102+
103+
// searchArticles: ranking, stop-word handling, and which fields are searched.
describe('searchArticles', () => {
  // Shared corpus: three articles, each with a title, an intro and one category.
  const corpus = [
    makeArticle('Copilot billing plans', 'Manage your billing', '/billing', ['Billing']),
    makeArticle('Copilot agent features', 'Use coding agents', '/agents', ['Agents']),
    makeArticle('Copilot extensions', 'Build and install extensions', '/ext', ['Extensions']),
  ]

  test('returns matching articles ranked by score', () => {
    const [topHit] = searchArticles(corpus, 'billing', [])
    expect(topHit.title).toBe('Copilot billing plans')
  })

  test('returns all articles when query is only stop words', () => {
    // The whole query is a stop word, so nothing is left to filter on.
    const hits = searchArticles(corpus, 'copilot', ['copilot'])
    expect(hits).toHaveLength(3)
  })

  test('strips stop words from query before matching', () => {
    const [topHit] = searchArticles(corpus, 'copilot billing', ['copilot'])
    expect(topHit.title).toBe('Copilot billing plans')
  })

  test('returns empty array when no articles match', () => {
    const hits = searchArticles(corpus, 'xyznonexistent', [])
    expect(hits).toHaveLength(0)
  })

  test('searches title, intro, and category fields', () => {
    const hits = searchArticles(corpus, 'agents', [])
    const foundAgentArticle = hits.some(({ title }) => title === 'Copilot agent features')

    expect(hits.length).toBeGreaterThan(0)
    expect(foundAgentArticle).toBe(true)
  })
})

0 commit comments

Comments
 (0)