-
Notifications
You must be signed in to change notification settings - Fork 0
/
search_event.py
190 lines (152 loc) · 4.77 KB
/
search_event.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import pickle
import MeCab
import time
from datetime import datetime
# %%
def load_file(name):
    """Unpickle and return the object stored in ``wordtable_event/<name>.pickle``.

    NOTE: unpickling can execute arbitrary code, so the files in
    ``wordtable_event/`` must come from a trusted source.
    """
    pickle_path = "".join(("wordtable_event/", name, ".pickle"))
    with open(pickle_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded
# %%
# Load every lookup table once at import time, in the same order as the
# individual pickles are listed here.
(
    word_table,
    page_title_id_table,
    page_heading_id_table,
    page_desc_id_table,
    zenbun_table,
) = [
    load_file(table_name)
    for table_name in (
        "word_table",
        "page_title_id_table",
        "page_heading_id_table",
        "page_desc_id_table",
        "zenbun_table",
    )
]
# Event records keyed by event id; starts empty and is filled by load_tables().
event_dict = {}
# %%
def load_tables():
    """Reload every lookup table from disk and rebuild ``event_dict``.

    The module-level tables (``word_table``, ``page_title_id_table``,
    ``page_heading_id_table``, ``page_desc_id_table``, ``zenbun_table``) are
    re-read from their pickles.  ``pages.pickle`` is then read and one record
    is registered per event of every page whose ``"event"`` entry contains a
    ``"title"`` list; the record's key comes from ``get_event_id``.

    Records from a previous load are discarded, so events that are no longer
    present in ``pages.pickle`` do not linger in ``event_dict``.

    NOTE: unpickling can execute arbitrary code; all pickle files read here
    must come from a trusted source.

    Raises:
        OSError: if a pickle file cannot be opened.
        KeyError / IndexError: if a page lacks an expected key or its
            ``start`` / ``end`` / ``description`` lists are shorter than
            its ``title`` list.
    """
    global page_title_id_table
    global page_desc_id_table
    global page_heading_id_table
    global zenbun_table
    global word_table
    global event_dict

    word_table = load_file("word_table")
    page_title_id_table = load_file("page_title_id_table")
    page_heading_id_table = load_file("page_heading_id_table")
    page_desc_id_table = load_file("page_desc_id_table")
    zenbun_table = load_file("zenbun_table")

    with open("pages.pickle", "rb") as f:
        pages = pickle.load(f)["data"]

    # Build the new mapping separately so a failure part-way through does
    # not leave event_dict half updated.
    fresh_events = {}
    for page in pages:
        page_id = page["id"]
        event = page["event"]
        if "title" not in event:
            continue
        for i, title in enumerate(event["title"]):
            fresh_events[get_event_id(page_id, i)] = {
                "title": title,
                "start": event["start"][i],
                "end": event["end"][i],
                "page_id": page_id,
                "description": event["description"][i],
                "page": page,
            }

    # Replace the contents in place: stale events disappear, while any
    # existing reference to the dict object stays valid.
    event_dict.clear()
    event_dict.update(fresh_events)
# %%
def get_word_id(word):
    """Return the id stored for ``word`` in ``word_table["data"]``.

    The lookup uses the lower-cased word; ``None`` is returned when the word
    is not in the table.
    """
    lowered = word.lower()
    return word_table["data"].get(lowered)
# %%
# Shared MeCab tagger created with the "-Owakati" option; split_word() and
# split_word_str() split its parse() output on whitespace to get tokens.
wakati = MeCab.Tagger("-Owakati")
# Tokens (brackets, quotes, punctuation, operators) that are dropped from a
# tokenized keyword before it is searched.
# NOTE(review): "(", ")" and "=" each appear twice. Duplicates are harmless in
# a set literal, but the second occurrences may have been meant as full-width
# variants -- confirm against the original file.
remove_words = {"(", ")", "(", ")", "[", "]",
"「", "」", "+", "-", "*", "$",
"'", '"', "、", ".", "”", "’",
":", ";", "_", "/", "?", "!",
"。", ",", "=", "="}
def split_word(keyword):
    """Tokenize ``keyword`` with MeCab and map each kept token to its word id.

    Tokens listed in ``remove_words`` are skipped.  Each remaining token is
    passed to ``get_word_id``, so the returned list may contain ``None`` for
    tokens that are not in the word table.
    """
    word_ids = []
    for token in wakati.parse(keyword).split():
        if token in remove_words:
            continue
        word_ids.append(get_word_id(token))
    return word_ids
def split_word_str(keyword):
    """Tokenize ``keyword`` with MeCab and return the tokens as strings.

    Tokens listed in ``remove_words`` are left out; the order of the
    remaining tokens is preserved.
    """
    kept_tokens = []
    for piece in wakati.parse(keyword).split():
        if piece not in remove_words:
            kept_tokens.append(piece)
    return kept_tokens
# %%
def set_score(result, word_id, table, score):
    """Add ``score`` to ``result`` for every entry listed under ``word_id``.

    ``table[word_id]`` is iterated and ``result`` is updated in place;
    entries not yet present in ``result`` start from 0.  Nothing happens
    when ``word_id`` is not a key of ``table``.
    """
    if word_id in table:
        for page in table[word_id]:
            result[page] = result.get(page, 0) + score
# %%
def zenbun_search(result, keyword, score):
    """Add occurrence-based scores from ``zenbun_table`` to ``result``.

    ``keyword`` is tokenized with ``split_word_str``.  For every entry of
    ``zenbun_table`` and every token, ``score`` times the number of
    occurrences (``.count(token)``) is added to ``result[page_id]``, which
    is updated in place.  Entries without any occurrence are not added.

    NOTE(review): presumably each table value is the full text of a page as
    a string -- confirm against the code that builds ``zenbun_table``.
    """
    tokens = split_word_str(keyword)
    for page_id, full_text in zenbun_table.items():
        for token in tokens:
            occurrences = full_text.count(token)
            if occurrences != 0:
                result[page_id] = result.get(page_id, 0) + score * occurrences
# %%
def scored_search(keyword):
    """Score every entry that matches ``keyword`` and return ``{id: score}``.

    For each word id of the keyword that is known (not ``None``), a hit in
    the title table adds 30, a hit in the heading table adds 10 and a hit in
    the description table adds 1.  Afterwards ``zenbun_search`` adds 1 per
    occurrence of each keyword token.
    """
    weighted_tables = (
        (page_title_id_table, 30),
        (page_heading_id_table, 10),
        (page_desc_id_table, 1),
    )
    result = {}
    for word_id in split_word(keyword):
        if word_id is None:
            continue
        for table, weight in weighted_tables:
            set_score(result, word_id, table, weight)
    zenbun_search(result, keyword, 1)
    return result
# %%
def get_event_id(page_id, event_index):
    """Combine a page id and an event's index on that page into one event id.

    The id is ``page_id * 1919 + event_index``.
    NOTE(review): assuming integer ids, two pages can only collide once a
    page has 1919 or more events.
    """
    page_base = page_id * 1919
    return page_base + event_index
# %%
def sort_score(scores):
    """Convert ``{id: score}`` into a list of dicts ordered by score.

    Each element is ``{"event_id": id, "score": score}``; the highest score
    comes first and entries with equal scores keep the iteration order of
    ``scores``.
    """
    ranked = [
        {"event_id": key, "score": value}
        for key, value in scores.items()
    ]
    return sorted(ranked, key=lambda entry: entry["score"], reverse=True)
# %%
# time.time() value recorded after the most recent load_tables() call made by
# reload(); the initial 0 makes the first reload() call always load.
currenttime = 0


def reload():
    """Call ``load_tables()`` unless the last reload was under 5 seconds ago.

    ``currenttime`` is updated only after ``load_tables()`` returns.
    """
    global currenttime
    seconds_since_reload = time.time() - currenttime
    if seconds_since_reload >= 5:
        load_tables()
        currenttime = time.time()
# %%
def keyword_search(keyword):
    """Return the event records matching ``keyword``, best score first.

    An empty keyword returns ``None`` without touching the tables.
    Otherwise the tables are refreshed through ``reload()``, the keyword is
    scored with ``scored_search`` and every ranked id is looked up in
    ``event_dict``.
    """
    if len(keyword) == 0:
        return None
    reload()
    ranked = sort_score(scored_search(keyword))
    matches = []
    for entry in ranked:
        matches.append(event_dict[entry["event_id"]])
    return matches
# %%
def filter_by_date(events, a, b):
    """Return the events whose start or end lies strictly between ``a`` and ``b``.

    Args:
        events: iterable of mappings with ISO-format ``"start"`` and
            ``"end"`` strings.
        a: ISO-format lower bound of the range (exclusive).
        b: ISO-format upper bound of the range (exclusive).

    Returns:
        A list of the matching events in their input order.  Events whose
        dates are missing, not strings, unparsable or not comparable with
        the bounds are skipped.

    Raises:
        ValueError / TypeError: if ``a`` or ``b`` cannot be parsed.
    """
    range_start = datetime.fromisoformat(a)
    range_end = datetime.fromisoformat(b)
    result = []
    for e in events:
        try:
            start = datetime.fromisoformat(e["start"])
            end = datetime.fromisoformat(e["end"])
            # pass if a<start<b or a<end<b
            in_range = (range_start < start < range_end
                        or range_start < end < range_end)
        except (KeyError, TypeError, ValueError):
            # Best effort: skip malformed events.  Only data errors are
            # caught, so KeyboardInterrupt / SystemExit still propagate.
            continue
        if in_range:
            result.append(e)
    return result
# %%
def get_all():
    """Return a new list holding every record currently in ``event_dict``."""
    return list(event_dict.values())
# %%
def search(keyword="", rangestart="2021-04-01", rangeend="2021-12-31"):
    """Search events by keyword and restrict them to a date range.

    With an empty ``keyword`` the tables are refreshed via ``reload()`` and
    every known event is considered; otherwise the candidates come from
    ``keyword_search``.  The candidates are then passed through
    ``filter_by_date`` with ``rangestart`` and ``rangeend`` (ISO-format
    strings) and the filtered list is returned.
    """
    if len(keyword) > 0:
        events = keyword_search(keyword)
    else:
        reload()
        events = get_all()
    return filter_by_date(events, rangestart, rangeend)