@@ -88,6 +88,25 @@ export default async function loadLunrResults({ version, language, query, limit
8888 // want to make sure this number accounts for that.
8989 const TITLE_FIRST = queryLength <= 2 ? 45 : queryLength <= 6 ? 25 : 10
9090
91+ // Multiplication bonus given to matches that were made on
92+ // the search where ALL tokens are required.
93+ // E.g. you search for 'foo bar' and we have three records:
94+ //
95+ // A) "This foo is very special"
96+ // B) "With bar and foo you can't go wrong"
97+ // C) "Only bar can save you"
98+ //
99+ // What will happen is that it only finds record (B) when it's
100+ // required to match both 'foo' *and* 'bar'. So you get these scores:
101+ //
102+ // A) score = result.score + popularity
103+ // B) score = MATCH_PHRASE * (result.score + popularity)
104+ // C) score = result.score + popularity
105+ //
106+ // So it's a very powerful multiplier. But that's fine because a
107+ // "phrase match" is a very accurate thing.
108+ const MATCH_PHRASE = 5
109+
91110 // Imagine that we have 1,000 documents. 100 of them contain the word
92111 // 'foobar'. Of those 100, we want to display the top 10 "best".
93112 // But if we only do `lunrindex.search('foobar').slice(0, 10)` we
@@ -101,28 +120,84 @@ export default async function loadLunrResults({ version, language, query, limit
101120 // records that we finally return.
102121 const PRE_LIMIT = 500
103122
104- let titleQuery = query . trim ( )
105- if ( titleQuery . length <= 3 && ! titleQuery . endsWith ( '*s' ) ) {
106- // When the search input is really short, force it to search with
107- // the "forward wild card". I.e. you typed `go` we turn it into a
108- // search for `go*` which means it can find things like `Google`.
109- titleQuery += '*'
110- }
123+ const titleQuery = query . trim ( )
111124
112125 let highestTitleScore = 0.0
126+
127+ const andTitleResults = [ ]
128+
129+ // This will turn something like 'foo and bar' into:
130+ // [
131+ // { str: 'foo', metadata: { position: [Array], index: 0 } },
132+ // { str: 'bar', metadata: { position: [Array], index: 1 } }
133+ // ]
134+ // Note how the stopword gets omitted.
135+ // It's important to omit the stopwords because even if the record
136+ // actually contains the stopword, it won't match then.
137+ // E.g. you have a record called "Foo And Bar" and you search for
138+ // {foo AND and AND bar} it will actually not find anything.
139+ // But if you change it to {foo AND bar} it will match "Foo And Bar"
140+ // Same goes if any other stopwords were used like "Foo the Bar with for a".
141+ // That also needs to become an AND-search of {foo AND bar} ...only.
142+ const titleQueryTokenized = lunr . tokenizer ( titleQuery ) . filter ( lunr . stopWordFilter )
143+
144+ if ( titleQueryTokenized . length > 1 ) {
145+ andTitleResults . push (
146+ ...index
147+ . query ( ( q ) => {
148+ for ( const { str } of titleQueryTokenized ) {
149+ q . term ( str , { fields : [ 'title' ] , presence : lunr . Query . presence . REQUIRED } )
150+ }
151+ } )
152+ . slice ( 0 , PRE_LIMIT )
153+ . map ( ( result ) => {
154+ const { popularity } = records [ result . ref ]
155+ if ( result . score > highestTitleScore ) {
156+ highestTitleScore = result . score
157+ }
158+ const score = result . score / highestTitleScore
159+ return {
160+ result,
161+ _score : MATCH_PHRASE * TITLE_FIRST * ( score + POPULARITY_FACTOR * ( popularity || 0.0 ) ) ,
162+ }
163+ } )
164+ )
165+ }
166+
113167 const titleResults = index
114168 . query ( ( q ) => {
115- if ( / [ ' " ] / . test ( titleQuery ) ) {
116- // If the query contains a quotation marks, you can't easily
117- // enough break it up into individual words.
118- q . term ( titleQuery , { fields : [ 'title' ] } )
119- } else {
120- // This is the structured way of doing turning 'foo bar'
121- // into `title:foo title:bar'.
122- titleQuery . split ( / / g) . forEach ( ( part ) => {
123- q . term ( part , { fields : [ 'title' ] } )
169+ // The objective is to create an OR-query specifically for the 'title'
170+ // because *we* value matches on that much higher than any other
171+ // field in our records.
172+ // But we want to make sure that the last word, when it's short
173+ // (3 characters or fewer), is treated like a forward-tokenized
174+ // token. I.e. you typed "google ku" becomes a search for "google ku*".
175+ // Note that it's important that we use the `lunr.tokenizer()` function when
176+ // using the `index.query()` function because, for starters, it will
177+ // normalize the input.
178+ // Note that `index.search()` is the higher abstraction of basically
179+ // doing this:
180+ // (pseudo code)
181+ //
182+ // Index.prototype.search = function(input) {
183+ // lunr.tokenize(input).forEach(token => {
184+ // Index.query(callback => {
185+ // callback(token)
186+ // })
187+ // })
188+ // }
189+ //
190+ // If we didn't use the tokenized form, we'd get different results
191+ // for searching for "SSH agent" and "ssh AgenT" for example.
192+ titleQueryTokenized . forEach ( ( { str } , i ) => {
193+ const isLastToken = i === titleQueryTokenized . length - 1
194+ const isShort = str . length <= 3
195+ q . term ( str , {
196+ fields : [ 'title' ] ,
197+ wildcard :
198+ isLastToken && isShort ? lunr . Query . wildcard . TRAILING : lunr . Query . wildcard . NONE ,
124199 } )
125- }
200+ } )
126201 } )
127202 . slice ( 0 , PRE_LIMIT )
128203 . map ( ( result ) => {
@@ -170,7 +245,7 @@ export default async function loadLunrResults({ version, language, query, limit
170245 const _unique = new Set ( )
171246 const combinedMatchData = { }
172247 const results = [ ]
173- for ( const matches of [ titleResults , allResults ] ) {
248+ for ( const matches of [ andTitleResults , titleResults , allResults ] ) {
174249 for ( const match of matches ) {
175250 const { result } = match
176251 // We need to loop over all results (both from title searches and
0 commit comments