Skip to content

Commit b0f7e45

Browse files
authored
Merge branch 'main' into repo-sync
2 parents 2142303 + 5d080b7 commit b0f7e45

2 files changed

Lines changed: 123 additions & 23 deletions

File tree

lib/failbot.js

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,46 @@
11
import got from 'got'
2-
import { Failbot, HTTPBackend, LogBackend } from '@github/failbot'
2+
import { Failbot, HTTPBackend } from '@github/failbot'
33

44
const HAYSTACK_APP = 'docs'
55

6+
// Wrapper around `got` that adds a per-attempt timeout and an
// exponential-backoff retry policy, so a transient network hiccup
// doesn't lose the request.
async function retryingGot(url, args) {
  return got(
    url,
    Object.assign({}, args, {
      // With the timeout at 3000 (milliseconds) and the retry.limit
      // at 4 (times), there are at most 5 attempts (1 initial + 4
      // retries), so the total worst-case is:
      // 3000 * 5 + 1000 + 2000 + 4000 + 8000 = 30 seconds
      timeout: 3000,
      retry: {
        // This means it will wait...
        // 1. 1000ms
        // 2. 2000ms
        // 3. 4000ms
        // 4. 8000ms
        // 5. give up!
        //
        // From the documentation:
        //
        // Delays between retries counts with function
        // 1000 * Math.pow(2, retry - 1) + Math.random() * 100,
        // where retry is attempt number (starts from 1).
        //
        limit: 4,
      },
    })
  )
}
33+
634
export function report(error, metadata) {
735
// If there's no HAYSTACK_URL set, bail early
836
if (!process.env.HAYSTACK_URL) return
937

1038
const backends = [
1139
new HTTPBackend({
1240
haystackURL: process.env.HAYSTACK_URL,
13-
fetchFn: got,
41+
fetchFn: retryingGot,
1442
}),
1543
]
16-
if (process.env.NODE_ENV !== 'test') {
17-
backends.push(new LogBackend({ log: console.log.bind(console) }))
18-
}
1944
const failbot = new Failbot({
2045
app: HAYSTACK_APP,
2146
backends: backends,

lib/search/lunr-search.js

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
8888
// want to make sure this number accounts for that.
8989
const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10
9090

91+
// Multiplication bonus given to matches that were made on the
92+
// search where ALL tokens are required.
93+
// E.g. you search for 'foo bar' and we have three records:
94+
//
95+
// A) "This foo is very special"
96+
// B) "With bar and foo you can't go wrong"
97+
// C) "Only bar can save you"
98+
//
99+
// What will happen is that it only finds record (B) when it's
100+
// required to match both 'foo' *and* 'bar'. So you get these scores:
101+
//
102+
// A) score = result.score + popularity
103+
// B) score = MATCH_PHRASE * (result.score + popularity)
104+
// C) score = result.score + popularity
105+
//
106+
// So it's a very powerful multiplier. But that's fine because a
107+
// "phrase match" is a very accurate thing.
108+
const MATCH_PHRASE = 5
109+
91110
// Imagine that we have 1,000 documents. 100 of them contain the word
92111
// 'foobar'. Of those 100, we want to display the top 10 "best".
93112
// But if we only do `lunrindex.search('foobar').slice(0, 10)` we
@@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
101120
// records that we finally return.
102121
const PRE_LIMIT = 500
103122

104-
let titleQuery = query.trim()
105-
if (titleQuery.length <= 3 && !titleQuery.endsWith('*s')) {
106-
// When the search input is really short, force it to search with
107-
// the "forward wild card". I.e. you typed `go` we turn it into a
108-
// search for `go*` which means it can find things like `Google`.
109-
titleQuery += '*'
110-
}
123+
const titleQuery = query.trim()
111124

112125
let highestTitleScore = 0.0
126+
127+
const andTitleResults = []
128+
129+
// This will turn something like 'foo and bar' into:
130+
// [
131+
// { str: 'foo', metadata: { position: [Array], index: 0 } },
132+
// { str: 'bar', metadata: { position: [Array], index: 1 } }
133+
// ]
134+
// Note how the stopword gets omitted.
135+
// It's important to omit the stopwords because even if the record
136+
// actually contains the stopword, it won't match then.
137+
// E.g. you have a record called "Foo And Bar" and you search for
138+
// {foo AND and AND bar} it will actually not find anything.
139+
// But if you change it to {foo AND bar} it will match "Foo And Bar"
140+
// Same goes if any other stopwords were used like "Foo the Bar with for a".
141+
// That also needs to become an AND-search of {foo AND bar} ...only.
142+
const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter)
143+
144+
if (titleQueryTokenized.length > 1) {
145+
andTitleResults.push(
146+
...index
147+
.query((q) => {
148+
for (const { str } of titleQueryTokenized) {
149+
q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
150+
}
151+
})
152+
.slice(0, PRE_LIMIT)
153+
.map((result) => {
154+
const { popularity } = records[result.ref]
155+
if (result.score > highestTitleScore) {
156+
highestTitleScore = result.score
157+
}
158+
const score = result.score / highestTitleScore
159+
return {
160+
result,
161+
_score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
162+
}
163+
})
164+
)
165+
}
166+
113167
const titleResults = index
114168
.query((q) => {
115-
if (/['"]/.test(titleQuery)) {
116-
// If the query contains a quotation marks, you can't easily
117-
// enough break it up into individual words.
118-
q.term(titleQuery, { fields: ['title'] })
119-
} else {
120-
// This is the structured way of doing turning 'foo bar'
121-
// into `title:foo title:bar'.
122-
titleQuery.split(/ /g).forEach((part) => {
123-
q.term(part, { fields: ['title'] })
169+
// The objective is to create an OR-query specifically for the 'title'
170+
// because *we* value matches on that much higher than any other
171+
// field in our records.
172+
// But we want to make sure that the last word is always treated
173+
// like a forward-tokenized token. I.e. you typed "google ku"
174+
// becomes a search for "google ku*".
175+
// Note that it's important to use the `lunr.tokenizer()` function when
176+
// using the `index.query()` function because, for starters, it will
177+
// normalize the input.
178+
// Using `index.search()` is the higher abstraction of basically
179+
// doing this:
180+
// (pseudo code)
181+
//
182+
// Index.prototype.search = function(input) {
183+
// lunr.tokenize(input).forEach(token => {
184+
// Index.query(callback => {
185+
// callback(token)
186+
// })
187+
// })
188+
// }
189+
//
190+
// If we didn't use the tokenized form, we'd get different results
191+
// for searching for "SSH agent" and "ssh AgenT" for example.
192+
titleQueryTokenized.forEach(({ str }, i) => {
193+
const isLastToken = i === titleQueryTokenized.length - 1
194+
const isShort = str.length <= 3
195+
q.term(str, {
196+
fields: ['title'],
197+
wildcard:
198+
isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
124199
})
125-
}
200+
})
126201
})
127202
.slice(0, PRE_LIMIT)
128203
.map((result) => {
@@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
170245
const _unique = new Set()
171246
const combinedMatchData = {}
172247
const results = []
173-
for (const matches of [titleResults, allResults]) {
248+
for (const matches of [andTitleResults, titleResults, allResults]) {
174249
for (const match of matches) {
175250
const { result } = match
176251
// We need to loop over all results (both from title searches and

0 commit comments

Comments
 (0)