-
-
Notifications
You must be signed in to change notification settings - Fork 61
/
search.js
130 lines (123 loc) · 5.5 KB
/
search.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// Maximum number of characters of surrounding text kept on each side of a
// match when building an excerpt.
const CONTEXT_LENGTH = 50

// Collapse every run of whitespace characters into a single space.
const normalizeWhitespace = str => str.replace(/\s+/g, ' ')

/**
 * Build a displayable excerpt for a match located inside a list of strings.
 *
 * @param {string[]} strs - the strings that were searched
 * @param {object} range - location of the match
 * @param {number} range.startIndex - index in `strs` of the string in which the match starts
 * @param {number} range.startOffset - offset of the match start inside that string
 * @param {number} range.endIndex - index in `strs` of the string in which the match ends
 * @param {number} range.endOffset - offset of the match end (exclusive) inside that string
 * @returns {{ pre: string, match: string, post: string }} the matched text,
 *     plus up to `CONTEXT_LENGTH` characters of whitespace-normalized context
 *     taken from the start string (before the match) and from the end string
 *     (after the match); an ellipsis marks context that was cut short
 */
const makeExcerpt = (strs, { startIndex, startOffset, endIndex, endOffset }) => {
    const start = strs[startIndex]
    const end = strs[endIndex]

    // Decide by index, not by string content: two different entries of
    // `strs` may well hold identical text.
    const match = startIndex === endIndex
        ? start.slice(startOffset, endOffset)
        : start.slice(startOffset)
            // every string strictly between the first and the last one
            // belongs to the match in full
            + strs.slice(startIndex + 1, endIndex).join('')
            + end.slice(0, endOffset)

    const trimmedStart = normalizeWhitespace(start.slice(0, startOffset)).trimStart()
    const trimmedEnd = normalizeWhitespace(end.slice(endOffset)).trimEnd()

    // Only show an ellipsis when some context is actually dropped.
    const ellipsisPre = trimmedStart.length > CONTEXT_LENGTH ? '…' : ''
    const ellipsisPost = trimmedEnd.length > CONTEXT_LENGTH ? '…' : ''

    const pre = `${ellipsisPre}${trimmedStart.slice(-CONTEXT_LENGTH)}`
    const post = `${trimmedEnd.slice(0, CONTEXT_LENGTH)}${ellipsisPost}`
    return { pre, match, post }
}
/**
 * Plain substring search over the concatenation of `strs`.
 *
 * The comparison is exact when `options.sensitivity` is `'variant'`;
 * otherwise both the text and the query are lowercased with
 * `toLocaleLowerCase(locales)` first. Overlapping matches are reported.
 *
 * @param {string[]} strs - the strings to search, treated as one continuous text
 * @param {string} query - the text to look for
 * @param {object} [options]
 * @param {string|string[]} [options.locales='en'] - locale(s) used for lowercasing
 * @param {string} [options.sensitivity] - `'variant'` for a case-sensitive search
 * @yields {{ range: object, excerpt: object }} for every match, its position
 *     as `{ startIndex, startOffset, endIndex, endOffset }` (indices into
 *     `strs`, offsets into the respective string) and the result of
 *     `makeExcerpt` for that position
 */
const simpleSearch = function* (strs, query, options = {}) {
    const { locales = 'en', sensitivity } = options
    const matchCase = sensitivity === 'variant'
    const haystack = strs.join('')
    const lowerHaystack = matchCase ? haystack : haystack.toLocaleLowerCase(locales)
    const needle = matchCase ? query : query.toLocaleLowerCase(locales)
    const needleLength = needle.length

    // An empty needle is "found" at every position, including one past the
    // end of the text; there is nothing meaningful to report.
    if (needleLength === 0) return

    const lastStrIndex = strs.length - 1
    // `strIndex` is the string currently being looked at, and `sum` is the
    // combined length of strs[0..strIndex]; both only ever move forward
    // because matches are found in increasing order.
    let strIndex = -1
    let sum = 0

    for (let index = lowerHaystack.indexOf(needle);
        index > -1;
        index = lowerHaystack.indexOf(needle, index + 1)) {
        // Advance to the string that contains the first character of the match.
        // NOTE: offsets are computed on the lowercased text; lowercasing can
        // change the length of some characters, so the walk is bounded to
        // never step past the last string.
        while (sum <= index && strIndex < lastStrIndex) sum += strs[++strIndex].length
        const startIndex = strIndex
        const startOffset = index - (sum - strs[strIndex].length)

        // Advance to the string that contains the end position of the match.
        // A match ending exactly on a string boundary is reported at offset 0
        // of the following string, unless there is no following string, in
        // which case it ends at the end of the last one.
        const end = index + needleLength
        while (sum <= end && strIndex < lastStrIndex) sum += strs[++strIndex].length
        const endIndex = strIndex
        const endOffset = end - (sum - strs[strIndex].length)

        const range = { startIndex, startOffset, endIndex, endOffset }
        yield { range, excerpt: makeExcerpt(strs, range) }
    }
}
/**
 * Locale-aware search: the text is split into segments with `Intl.Segmenter`
 * and every window of as many segments as the query has is compared with the
 * query using `Intl.Collator`.
 *
 * While scanning, segments consisting only of format characters are skipped
 * and every run of whitespace is treated as a single space.
 *
 * @param {string[]} strs - the strings to search, treated as one continuous text
 * @param {string} query - the text to look for
 * @param {object} [options]
 * @param {string|string[]} [options.locales='en'] - locale(s) for segmenting and collating
 * @param {string} [options.granularity='word'] - `Intl.Segmenter` granularity
 * @param {string} [options.sensitivity='base'] - `Intl.Collator` sensitivity
 * @yields {{ range: object, excerpt: object }} for every match, its position
 *     as `{ startIndex, startOffset, endIndex, endOffset }` and the result of
 *     `makeExcerpt` for that position
 */
const segmenterSearch = function* (strs, query, options = {}) {
    const { locales = 'en', granularity = 'word', sensitivity = 'base' } = options
    let segmenter, collator
    try {
        segmenter = new Intl.Segmenter(locales, { usage: 'search', granularity })
        collator = new Intl.Collator(locales, { sensitivity })
    } catch (e) {
        // fall back to English if the requested locale is rejected
        console.warn(e)
        segmenter = new Intl.Segmenter('en', { usage: 'search', granularity })
        collator = new Intl.Collator('en', { sensitivity })
    }

    const queryLength = Array.from(segmenter.segment(query)).length
    // An empty query has no segments; without this guard the empty window
    // would compare equal to the empty query and there would be no last
    // segment to compute the end offset from.
    if (queryLength === 0) return

    // sliding window of the most recent segments, each `{ strIndex, index, segment }`
    const substrArr = []
    let strIndex = 0
    let segments = segmenter.segment(strs[strIndex])[Symbol.iterator]()

    main: while (strIndex < strs.length) {
        // top the window up to the number of segments in the query
        while (substrArr.length < queryLength) {
            const { done, value } = segments.next()
            if (done) {
                // the current string is exhausted
                // move on to the next string
                strIndex++
                if (strIndex < strs.length) {
                    segments = segmenter.segment(strs[strIndex])[Symbol.iterator]()
                    continue
                } else break main
            }
            const { index, segment } = value
            // ignore formatting characters
            if (!/[^\p{Format}]/u.test(segment)) continue
            // normalize whitespace: a run of whitespace segments becomes a
            // single space positioned at the first of them
            if (/\s/u.test(segment)) {
                if (!/\s/u.test(substrArr[substrArr.length - 1]?.segment))
                    substrArr.push({ strIndex, index, segment: ' ' })
                continue
            }
            substrArr.push({ strIndex, index, segment })
        }

        const substr = substrArr.map(x => x.segment).join('')
        if (collator.compare(query, substr) === 0) {
            const firstSeg = substrArr[0]
            const lastSeg = substrArr[substrArr.length - 1]
            const range = {
                startIndex: firstSeg.strIndex,
                startOffset: firstSeg.index,
                // the window was just filled, so its last segment comes
                // from the string currently being read
                endIndex: strIndex,
                endOffset: lastSeg.index + lastSeg.segment.length,
            }
            yield { range, excerpt: makeExcerpt(strs, range) }
        }
        // slide the window forward by one segment
        substrArr.shift()
    }
}
/**
 * Search `strs` for `query`, picking the search strategy from the options.
 *
 * The plain substring search is used when `Intl.Segmenter` is unavailable,
 * or when matching by grapheme with a sensitivity of `'variant'` or
 * `'accent'`; everything else goes through the segmenter-based search.
 *
 * @param {string[]} strs - the strings to search
 * @param {string} query - the text to look for
 * @param {object} [options] - passed on to the chosen search function
 * @param {string} [options.granularity='grapheme'] - `'grapheme'` or `'word'`
 * @param {string} [options.sensitivity='base'] - collator-style sensitivity
 * @returns the generator returned by the chosen search function
 */
export const search = (strs, query, options = {}) => {
    const { granularity = 'grapheme', sensitivity = 'base' } = options
    const hasSegmenter = Boolean(Intl?.Segmenter)
    const isPlainGraphemeSearch = granularity === 'grapheme'
        && (sensitivity === 'variant' || sensitivity === 'accent')
    if (!hasSegmenter || isPlainGraphemeSearch)
        return simpleSearch(strs, query, options)
    return segmenterSearch(strs, query, options)
}
/**
 * Create a generator function that searches a document for a query.
 *
 * @param {Function} textWalker - called as `textWalker(doc, callback)`; it is
 *     expected to invoke `callback(strs, makeRange)` and return an iterable
 *     of whatever the callback yields
 * @param {object} [opts]
 * @param {string} [opts.defaultLocale] - locale used when the document declares none
 * @param {boolean} [opts.matchCase] - whether case differences matter
 * @param {boolean} [opts.matchDiacritics] - whether accent differences matter
 * @param {boolean} [opts.matchWholeWords] - search by word instead of by grapheme
 * @returns {Function} a generator function `(doc, query)` yielding each search
 *     result with its `range` replaced by the value of
 *     `makeRange(startIndex, startOffset, endIndex, endOffset)`
 */
export const searchMatcher = (textWalker, opts = {}) => {
    const { defaultLocale, matchCase, matchDiacritics, matchWholeWords } = opts
    // these depend on the options only, so work them out once
    const granularity = matchWholeWords ? 'word' : 'grapheme'
    const sensitivity = matchDiacritics
        ? (matchCase ? 'variant' : 'accent')
        : (matchCase ? 'case' : 'base')

    return function* (doc, query) {
        yield* textWalker(doc, function* (strs, makeRange) {
            // a document is not guaranteed to have a body or a root element
            const locales = doc.body?.lang || doc.documentElement?.lang
                || defaultLocale || 'en'
            const results = search(strs, query, { locales, granularity, sensitivity })
            for (const result of results) {
                const { startIndex, startOffset, endIndex, endOffset } = result.range
                result.range = makeRange(startIndex, startOffset, endIndex, endOffset)
                yield result
            }
        })
    }
}