-
Notifications
You must be signed in to change notification settings - Fork 1
/
query.py
55 lines (47 loc) · 1.64 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import re
import unidecode
def query(q):
# TODO Test cases? Even if we move to golang the test cases would be the same.
#
# test 123
# test "123 okay"
# test "123 okay" then
# "test 123"
# "test 123" "okay then
# "test 123" "okay then"
#
# Probably a lot more too.
res = q.split('"')
tokens = []
def add_quoted(t):
nonlocal tokens
stripped = t.strip()
if stripped:
tokens.append('"' + t.strip() + '"')
def add_unquoted(ts):
nonlocal tokens
tokens += [
'"' + t.strip() + '"*'
for t in ts.split(' ')
if len(t) > 1 # skip empty and one-char strings
]
# Start range at 1 so that an even number of quotation markes leads to the
# ValueError below. It just sort of works out that way with the split()
# function. res[0] is unquoted no matter what so we just handle it first.
add_unquoted(res[0])
for idx in range(1, len(res), 2):
try:
quoted, unquoted = res[idx:idx+2]
add_quoted(quoted)
add_unquoted(unquoted)
except ValueError:
# If we missed the closing quotation mark somewhere, this gets triggered.
# In this case we ignore the quote mark and treat everything as unquoted
add_unquoted(res[idx])
return ' '.join(tokens)
def search_normalize(s):
s = unidecode.unidecode(s).lower().replace('-', ' ').replace('_', ' ')
s = re.sub('[^0-9a-zA-Z "]+', '', s)
return s
def search_normalize_save(s):
return search_normalize(s).replace('"', ' ') # quotes don't get saved, we only want them for searching