Skip to content

Commit b0f7e45

Browse files
authored
Merge branch 'main' into repo-sync
2 parents 2142303 + 5d080b7 commit b0f7e45

2 files changed

Lines changed: 123 additions & 23 deletions

File tree

lib/failbot.js

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,46 @@
11
import got from 'got'
2-
import { Failbot, HTTPBackend, LogBackend } from '@github/failbot'
2+
import { Failbot, HTTPBackend } from '@github/failbot'
33

44
const HAYSTACK_APP = 'docs'
55

6+
// Wrapper around `got` that adds a per-attempt timeout and an
// exponential-backoff retry policy, so a transient network hiccup
// doesn't lose the request.
async function retryingGot(url, args) {
  return got(
    url,
    Object.assign({}, args, {
      // With the timeout at 3000 (milliseconds) and the retry.limit
      // at 4 (times), there are at most 5 attempts (1 initial + 4
      // retries), so the total worst-case is:
      // 3000 * 5 + 1000 + 2000 + 4000 + 8000 = 30 seconds
      timeout: 3000,
      retry: {
        // This means it will wait...
        // 1. 1000ms
        // 2. 2000ms
        // 3. 4000ms
        // 4. 8000ms
        // 5. give up!
        //
        // From the documentation:
        //
        // Delays between retries counts with function
        // 1000 * Math.pow(2, retry - 1) + Math.random() * 100,
        // where retry is attempt number (starts from 1).
        //
        limit: 4,
      },
    })
  )
}
33+
634
export function report(error, metadata) {
735
// If there's no HAYSTACK_URL set, bail early
836
if (!process.env.HAYSTACK_URL) return
937

1038
const backends = [
1139
new HTTPBackend({
1240
haystackURL: process.env.HAYSTACK_URL,
13-
fetchFn: got,
41+
fetchFn: retryingGot,
1442
}),
1543
]
16-
if (process.env.NODE_ENV !== 'test') {
17-
backends.push(new LogBackend({ log: console.log.bind(console) }))
18-
}
1944
const failbot = new Failbot({
2045
app: HAYSTACK_APP,
2146
backends: backends,

lib/search/lunr-search.js

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
8888
// want to make sure this number accounts for that.
8989
const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10
9090

91+
// Multiplication bonus given to matches that were made on the
92+
// search where ALL tokens are required.
93+
// E.g. you search for 'foo bar' and we have three records:
94+
//
95+
// A) "This foo is very special"
96+
// B) "With bar and foo you can't go wrong"
97+
// C) "Only bar can save you"
98+
//
99+
// What will happen is that it only finds record (B) when it's
100+
// required to match both 'foo' *and* 'bar'. So you get these scores:
101+
//
102+
// A) score = result.score + popularity
103+
// B) score = MATCH_PHRASE * (result.score + popularity)
104+
// C) score = result.score + popularity
105+
//
106+
// So it's a very powerful multiplier. But that's fine because a
107+
// "phrase match" is a very accurate thing.
108+
const MATCH_PHRASE = 5
109+
91110
// Imagine that we have 1,000 documents. 100 of them contain the word
92111
// 'foobar'. Of those 100, we want to display the top 10 "best".
93112
// But if we only do `lunrindex.search('foobar').slice(0, 10)` we
@@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
101120
// records that we finally return.
102121
const PRE_LIMIT = 500
103122

104-
let titleQuery = query.trim()
105-
if (titleQuery.length <= 3 && !titleQuery.endsWith('*s')) {
106-
// When the search input is really short, force it to search with
107-
// the "forward wild card". I.e. you typed `go` we turn it into a
108-
// search for `go*` which means it can find things like `Google`.
109-
titleQuery += '*'
110-
}
123+
const titleQuery = query.trim()
111124

112125
let highestTitleScore = 0.0
126+
127+
const andTitleResults = []
128+
129+
// This will turn something like 'foo and bar' into:
130+
// [
131+
// { str: 'foo', metadata: { position: [Array], index: 0 } },
132+
// { str: 'bar', metadata: { position: [Array], index: 1 } }
133+
// ]
134+
// Note how the stopword gets omitted.
135+
// It's important to omit the stopwords because even if the record
136+
// actually contains the stopword, it won't match then.
137+
// E.g. you have a record called "Foo And Bar" and you search for
138+
// {foo AND and AND bar} it will actually not find anything.
139+
// But if you change it to {foo AND bar} it will match "Foo And Bar"
140+
// Same goes if any other stopwords were used like "Foo the Bar with for a".
141+
// That also needs to become an AND-search of {foo AND bar} ...only.
142+
const titleQueryTokenized = lunr.tokenizer(titleQuery).filter(lunr.stopWordFilter)
143+
144+
if (titleQueryTokenized.length > 1) {
145+
andTitleResults.push(
146+
...index
147+
.query((q) => {
148+
for (const { str } of titleQueryTokenized) {
149+
q.term(str, { fields: ['title'], presence: lunr.Query.presence.REQUIRED })
150+
}
151+
})
152+
.slice(0, PRE_LIMIT)
153+
.map((result) => {
154+
const { popularity } = records[result.ref]
155+
if (result.score > highestTitleScore) {
156+
highestTitleScore = result.score
157+
}
158+
const score = result.score / highestTitleScore
159+
return {
160+
result,
161+
_score: MATCH_PHRASE * TITLE_FIRST * (score + POPULARITY_FACTOR * (popularity || 0.0)),
162+
}
163+
})
164+
)
165+
}
166+
113167
const titleResults = index
114168
.query((q) => {
115-
if (/['"]/.test(titleQuery)) {
116-
// If the query contains a quotation marks, you can't easily
117-
// enough break it up into individual words.
118-
q.term(titleQuery, { fields: ['title'] })
119-
} else {
120-
// This is the structured way of doing turning 'foo bar'
121-
// into `title:foo title:bar'.
122-
titleQuery.split(/ /g).forEach((part) => {
123-
q.term(part, { fields: ['title'] })
169+
// The objective is to create an OR-query specifically for the 'title'
170+
// because *we* value matches on that much higher than any other
171+
// field in our records.
172+
// But we want to make sure that the last word is always treated
173+
// like a forward-tokenized token. I.e. you typed "google ku"
174+
// becomes a search for "google ku*".
175+
// Note that it's important to use the `lunr.tokenizer()` function when
176+
// using the `index.query()` function because, for starters, it will
177+
// normalize the input.
178+
// Using `index.search()` is the higher abstraction of basically
179+
// doing this:
180+
// (pseudo code)
181+
//
182+
// Index.prototype.search = function(input) {
183+
// lunr.tokenize(input).forEach(token => {
184+
// Index.query(callback => {
185+
// callback(token)
186+
// })
187+
// })
188+
// }
189+
//
190+
// If we didn't use the tokenized form, we'd get different results
191+
// for searching for "SSH agent" and "ssh AgenT" for example.
192+
titleQueryTokenized.forEach(({ str }, i) => {
193+
const isLastToken = i === titleQueryTokenized.length - 1
194+
const isShort = str.length <= 3
195+
q.term(str, {
196+
fields: ['title'],
197+
wildcard:
198+
isLastToken && isShort ? lunr.Query.wildcard.TRAILING : lunr.Query.wildcard.NONE,
124199
})
125-
}
200+
})
126201
})
127202
.slice(0, PRE_LIMIT)
128203
.map((result) => {
@@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
170245
const _unique = new Set()
171246
const combinedMatchData = {}
172247
const results = []
173-
for (const matches of [titleResults, allResults]) {
248+
for (const matches of [andTitleResults, titleResults, allResults]) {
174249
for (const match of matches) {
175250
const { result } = match
176251
// We need to loop over all results (both from title searches and

0 commit comments

Comments
 (0)