Improve Copilot landing page search relevancy (#59870)

heiskr · Copilot · web-flow · commit 90dc326694bb · 2026-03-09T18:23:58.000Z
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/src/landings/components/shared/LandingArticleGridWithFilter.tsx b/src/landings/components/shared/LandingArticleGridWithFilter.tsx
@@ -5,10 +5,10 @@ import cx from 'classnames'
 
 import { Link } from '@/frame/components/Link'
 import { useTranslation } from '@/languages/components/useTranslation'
-import { ArticleCardItems, ChildTocItem, TocItem } from '@/landings/types'
+import { ChildTocItem, TocItem } from '@/landings/types'
 import { LandingType } from '@/landings/context/LandingContext'
 import type { QueryParams } from '@/search/components/hooks/useMultiQueryParams'
-import { fuzzyMatchScore } from '@/landings/lib/fuzzy-match'
+import { flattenArticles, deriveStopWords, searchArticles } from '@/landings/lib/article-search'
 
 import styles from './LandingArticleGridWithFilter.module.scss'
 
@@ -22,38 +22,6 @@ type ArticleGridProps = {
 
 const ALL_CATEGORIES = 'all_categories'
 
-// Helper function to recursively flatten nested articles
-// Excludes index pages (pages with childTocItems)
-const flattenArticlesRecursive = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems => {
-  const flattened: ArticleCardItems = []
-
-  for (const article of articles) {
-    // If the article has children, recursively process them but don't include the parent (index page)
-    if (article.childTocItems && article.childTocItems.length > 0) {
-      flattened.push(...flattenArticlesRecursive(article.childTocItems))
-    } else {
-      // Only add articles that don't have children (actual article pages, not index pages)
-      flattened.push(article as ChildTocItem)
-    }
-  }
-
-  return flattened
-}
-
-// Wrapper function that flattens, deduplicates, and sorts alphabetically by title (only once)
-const flattenArticles = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems => {
-  const flattened = flattenArticlesRecursive(articles)
-  // Deduplicate articles by fullPath - needed when a page lists both individual
-  // articles and their parent group as children (e.g., bespoke landing pages)
-  const seen = new Set<string>()
-  const deduped = flattened.filter((article) => {
-    if (seen.has(article.fullPath)) return false
-    seen.add(article.fullPath)
-    return true
-  })
-  return deduped.sort((a, b) => a.title.localeCompare(b.title))
-}
-
 // Hook to get current articles per page based on screen size
 const useResponsiveArticlesPerPage = () => {
   const [articlesPerPage, setArticlesPerPage] = useState(9) // Default to desktop
@@ -102,6 +70,9 @@ export const ArticleGrid = ({
   // Recursively flatten all articles from tocItems, including both direct children and nested articles
   const allArticles = useMemo(() => flattenArticles(tocItems), [tocItems])
 
+  // Auto-derive stop words from article frequency
+  const stopWords = useMemo(() => deriveStopWords(allArticles), [allArticles])
+
   // Filter articles based on includedCategories for discovery landing pages
   // For bespoke landing pages, show all articles regardless of includedCategories
   const filteredArticlesByLandingType = useMemo(() => {
@@ -160,27 +131,7 @@ export const ArticleGrid = ({
     let results = filteredArticlesByLandingType
 
     if (searchQuery) {
-      // Calculate match scores for each article
-      const scoredResults = results
-        .map((token) => {
-          let maxScore = -1
-          for (const value of Object.values(token)) {
-            if (typeof value === 'string') {
-              maxScore = Math.max(maxScore, fuzzyMatchScore(value, searchQuery))
-            } else if (Array.isArray(value)) {
-              for (const item of value) {
-                if (typeof item === 'string') {
-                  maxScore = Math.max(maxScore, fuzzyMatchScore(item, searchQuery))
-                }
-              }
-            }
-          }
-          return { token, score: maxScore }
-        })
-        .filter(({ score }) => score >= 0)
-        .sort((a, b) => b.score - a.score)
-
-      results = scoredResults.map(({ token }) => token)
+      results = searchArticles(results, searchQuery, stopWords)
     }
 
     if (selectedCategory !== ALL_CATEGORIES) {
diff --git a/src/landings/lib/article-search.ts b/src/landings/lib/article-search.ts
@@ -0,0 +1,81 @@
+import type { ArticleCardItems, ChildTocItem, TocItem } from '@/landings/types'
+import { fuzzyMatchScore, stripStopWords } from '@/landings/lib/fuzzy-match'
+
+const STOP_WORD_THRESHOLD = 0.8
+
+// Recursively flatten nested TOC items into leaf articles.
+// Excludes index pages (pages with childTocItems).
+const flattenArticlesRecursive = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems => {
+  const flattened: ArticleCardItems = []
+
+  for (const article of articles) {
+    if (article.childTocItems && article.childTocItems.length > 0) {
+      flattened.push(...flattenArticlesRecursive(article.childTocItems))
+    } else {
+      flattened.push(article as ChildTocItem)
+    }
+  }
+
+  return flattened
+}
+
+// Flatten, deduplicate by fullPath, and sort alphabetically by title.
+export const flattenArticles = (articles: (TocItem | ChildTocItem)[]): ArticleCardItems => {
+  const flattened = flattenArticlesRecursive(articles)
+  const seen = new Set<string>()
+  const deduped = flattened.filter((article) => {
+    if (seen.has(article.fullPath)) return false
+    seen.add(article.fullPath)
+    return true
+  })
+  return deduped.sort((a, b) => a.title.localeCompare(b.title))
+}
+
+// Find words that occur in the title/intro of at least `threshold` (a fraction) of articles.
+// These add little signal to search since they match nearly everything.
+export const deriveStopWords = (
+  articles: ArticleCardItems,
+  threshold = STOP_WORD_THRESHOLD,
+): string[] => {
+  if (articles.length === 0) return []
+  const wordCounts = new Map<string, number>()
+  for (const article of articles) {
+    const uniqueWords = new Set(
+      [article.title, article.intro].join(' ').toLowerCase().split(/\s+/).filter(Boolean),
+    )
+    for (const w of uniqueWords) wordCounts.set(w, (wordCounts.get(w) || 0) + 1)
+  }
+  const minCount = articles.length * threshold
+  return [...wordCounts.entries()].filter(([, count]) => count >= minCount).map(([word]) => word)
+}
+
+// Score and rank articles against a search query, returning only matches.
+// Searches title, intro, and category fields. Returns all articles (scored 0.5)
+// when the query consists entirely of stop words.
+export const searchArticles = (
+  articles: ArticleCardItems,
+  query: string,
+  stopWords: string[],
+): ArticleCardItems => {
+  const cleanedQuery = stripStopWords(query, stopWords)
+
+  const scored = articles
+    .map((article) => {
+      if (!cleanedQuery) return { article, score: 0.5 }
+
+      let maxScore = -1
+      const searchableValues = [article.title, article.intro, ...(article.category || [])].filter(
+        (v): v is string => typeof v === 'string',
+      )
+
+      for (const value of searchableValues) {
+        const cleanedValue = stripStopWords(value, stopWords)
+        maxScore = Math.max(maxScore, fuzzyMatchScore(cleanedValue, cleanedQuery))
+      }
+      return { article, score: maxScore }
+    })
+    .filter(({ score }) => score >= 0)
+    .sort((a, b) => b.score - a.score)
+
+  return scored.map(({ article }) => article)
+}
diff --git a/src/landings/lib/fuzzy-match.ts b/src/landings/lib/fuzzy-match.ts
@@ -1,8 +1,12 @@
-// 60% threshold: Empirically chosen to balance precision vs recall.
-// Lower values (e.g., 40%) match too loosely (e.g., "agent" matches "urgent").
-// Higher values (e.g., 80%) miss reasonable matches like singular/plural variations.
-// 60% captures most typo corrections and word form variations while filtering noise.
-const BIGRAM_COVERAGE_THRESHOLD = 0.6
+// 70% threshold: Raised from 60% to reduce false positives from short queries.
+// At 60%, "billing" matched "installing" and "pricing" matched "writing pr descriptions".
+// 70% still captures singular/plural ("agent"→"agents" = 80%, "repository"→"repositories" = 73%)
+// while filtering the worst noise (both of those false positives were at 67%).
+const BIGRAM_COVERAGE_THRESHOLD = 0.7
+
+// Short search terms produce very few bigrams, making spurious matches likely.
+// Require exact substring match for terms with 4 or fewer non-space characters.
+const SHORT_TERM_MAX_LENGTH = 4
 
 // Memoization cache for bigram computation
 const bigramCache = new Map<string, Set<string>>()
@@ -44,6 +48,10 @@ export const fuzzyMatchScore = (text: string, searchTerm: string): number => {
   // Exact substring match gets highest score
   if (lowerText.includes(lowerSearch)) return 1
 
+  // Short search terms (e.g., "mcp", "pr", "test") produce too few bigrams
+  // for reliable fuzzy matching, so require exact substring only.
+  if (lowerSearch.replace(/\s+/g, '').length <= SHORT_TERM_MAX_LENGTH) return -1
+
   // Bigram coverage: what % of search bigrams appear in text
   // This works better than Jaccard when text is much longer than search
   const score = bigramCoverage(text, searchTerm)
@@ -54,3 +62,13 @@ export const fuzzyMatchScore = (text: string, searchTerm: string): number => {
 export const fuzzyMatch = (text: string, searchTerm: string): boolean => {
   return fuzzyMatchScore(text, searchTerm) >= 0
 }
+
+// Strip stop words from a string, preserving other words. Each word is lowercased
+// before comparison, so stopWords are expected to be lowercase. On product landing pages
+// (e.g. /copilot) the product name is in nearly every article, drowning out the query.
+export const stripStopWords = (text: string, stopWords: string[]): string =>
+  text
+    .split(/\s+/)
+    .filter((w) => !stopWords.includes(w.toLowerCase()))
+    .join(' ')
+    .trim()
diff --git a/src/landings/tests/article-search.ts b/src/landings/tests/article-search.ts
@@ -0,0 +1,135 @@
+import { describe, expect, test } from 'vitest'
+
+import { flattenArticles, deriveStopWords, searchArticles } from '@/landings/lib/article-search'
+import type { TocItem, ChildTocItem } from '@/landings/types'
+
+const makeArticle = (
+  title: string,
+  intro: string,
+  fullPath: string,
+  category?: string[],
+): ChildTocItem => ({
+  title,
+  intro,
+  fullPath,
+  category: category || null,
+})
+
+describe('flattenArticles', () => {
+  test('flattens nested tocItems into leaf articles', () => {
+    const items: TocItem[] = [
+      {
+        title: 'Parent',
+        fullPath: '/parent',
+        childTocItems: [
+          { title: 'Child A', fullPath: '/parent/a', intro: 'Intro A' },
+          { title: 'Child B', fullPath: '/parent/b', intro: 'Intro B' },
+        ],
+      },
+    ]
+    const result = flattenArticles(items)
+    expect(result).toHaveLength(2)
+    expect(result.map((a) => a.title)).toEqual(['Child A', 'Child B'])
+  })
+
+  test('deduplicates articles by fullPath', () => {
+    const items: TocItem[] = [
+      { title: 'Article', fullPath: '/same', intro: 'First' },
+      { title: 'Article', fullPath: '/same', intro: 'Duplicate' },
+    ]
+    const result = flattenArticles(items)
+    expect(result).toHaveLength(1)
+  })
+
+  test('sorts alphabetically by title', () => {
+    const items: TocItem[] = [
+      { title: 'Zebra', fullPath: '/z', intro: '' },
+      { title: 'Alpha', fullPath: '/a', intro: '' },
+      { title: 'Middle', fullPath: '/m', intro: '' },
+    ]
+    const result = flattenArticles(items)
+    expect(result.map((a) => a.title)).toEqual(['Alpha', 'Middle', 'Zebra'])
+  })
+
+  test('excludes index pages (parents with children)', () => {
+    const items: TocItem[] = [
+      {
+        title: 'Index page',
+        fullPath: '/index',
+        childTocItems: [{ title: 'Leaf', fullPath: '/index/leaf', intro: '' }],
+      },
+    ]
+    const result = flattenArticles(items)
+    expect(result).toHaveLength(1)
+    expect(result[0].title).toBe('Leaf')
+  })
+})
+
+describe('deriveStopWords', () => {
+  test('finds words appearing in 80%+ of articles', () => {
+    const articles = [
+      makeArticle('GitHub Copilot agents', 'Use Copilot for coding', '/a'),
+      makeArticle('GitHub Copilot billing', 'Manage Copilot billing', '/b'),
+      makeArticle('GitHub Copilot extensions', 'Build Copilot extensions', '/c'),
+      makeArticle('GitHub Copilot settings', 'Configure Copilot settings', '/d'),
+      makeArticle('GitHub Copilot MCP', 'Use Copilot with MCP servers', '/e'),
+    ]
+    const stopWords = deriveStopWords(articles)
+    expect(stopWords).toContain('copilot')
+    expect(stopWords).toContain('github')
+    expect(stopWords).not.toContain('agents')
+    expect(stopWords).not.toContain('billing')
+  })
+
+  test('returns empty array for empty articles', () => {
+    expect(deriveStopWords([])).toEqual([])
+  })
+
+  test('respects custom threshold', () => {
+    const articles = [
+      makeArticle('Copilot agents', 'Use agents', '/a'),
+      makeArticle('Copilot billing', 'Manage billing', '/b'),
+      makeArticle('Copilot settings', 'Configure settings', '/c'),
+    ]
+    // "copilot" appears in 3/3 = 100%, always a stop word
+    expect(deriveStopWords(articles, 0.5)).toContain('copilot')
+    // At threshold 1.0, only words in every single article qualify
+    const strict = deriveStopWords(articles, 1.0)
+    expect(strict).toContain('copilot')
+    expect(strict).not.toContain('agents')
+  })
+})
+
+describe('searchArticles', () => {
+  const articles = [
+    makeArticle('Copilot billing plans', 'Manage your billing', '/billing', ['Billing']),
+    makeArticle('Copilot agent features', 'Use coding agents', '/agents', ['Agents']),
+    makeArticle('Copilot extensions', 'Build and install extensions', '/ext', ['Extensions']),
+  ]
+
+  test('returns matching articles ranked by score', () => {
+    const results = searchArticles(articles, 'billing', [])
+    expect(results[0].title).toBe('Copilot billing plans')
+  })
+
+  test('returns all articles when query is only stop words', () => {
+    const results = searchArticles(articles, 'copilot', ['copilot'])
+    expect(results).toHaveLength(3)
+  })
+
+  test('strips stop words from query before matching', () => {
+    const results = searchArticles(articles, 'copilot billing', ['copilot'])
+    expect(results[0].title).toBe('Copilot billing plans')
+  })
+
+  test('returns empty array when no articles match', () => {
+    const results = searchArticles(articles, 'xyznonexistent', [])
+    expect(results).toHaveLength(0)
+  })
+
+  test('searches title, intro, and category fields', () => {
+    const results = searchArticles(articles, 'agents', [])
+    expect(results.length).toBeGreaterThan(0)
+    expect(results.some((a) => a.title === 'Copilot agent features')).toBe(true)
+  })
+})
diff --git a/src/landings/tests/fuzzy-match.ts b/src/landings/tests/fuzzy-match.ts