forked from roeseth/Cosi-132a-Information-Retrieval-Project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
boolean_index.py
56 lines (42 loc) · 1.47 KB
/
boolean_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import shelve
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import json
def index(id):
    """Tokenize and stem one document and merge its terms into the
    in-RAM inverted index.

    Reads the module-level globals set up in ``__main__``:
    ``data`` (the loaded corpus dict), ``stop_words`` (list of English
    stopwords) and ``cache_index`` (term -> list of int doc ids,
    mutated in place).

    :param id: the doc id — a string key into ``data``.  (The name
        shadows the ``id`` builtin; kept unchanged for caller
        compatibility.)
    :return: none
    """
    ps = PorterStemmer()
    doc_id = int(id)            # convert once, not once per matching token
    stop_set = set(stop_words)  # O(1) membership vs O(n) scan of the list
    # 'Title' is stored as a list; index its first entry plus the body text.
    tokens = word_tokenize(data[id]['Title'][0]) + word_tokenize(data[id]['Text'])
    for token in tokens:
        # NOTE(review): the stopword check is case-sensitive, so
        # capitalized stopwords (e.g. "The") slip through — confirm
        # whether that is intended before changing it.
        if token not in stop_set:
            term = ps.stem(token)
            postings = cache_index.setdefault(term, [])
            # Docs are processed in ascending id order, so checking only
            # the tail keeps the postings list sorted and duplicate-free.
            if not postings or postings[-1] != doc_id:
                postings.append(doc_id)
if __name__ == "__main__":
    import time
    from contextlib import closing

    # time the whole indexing run
    start = time.perf_counter()

    # flag='n' always creates fresh shelves; closing() guarantees both
    # shelf files are flushed and closed even if indexing raises midway
    # (the originals leaked on error).  The json file handle is managed
    # by `with` already, so no explicit f.close() is needed.
    with closing(shelve.open('corpus_index.dat', flag = 'n', writeback = False)) as idx, \
         closing(shelve.open('stop_words.dat', flag = 'n', writeback = False)) as sw, \
         open('films_corpus.json', 'r', encoding = 'UTF-8') as f:
        data = json.load(f)
        # persist the stopword list alongside the index, and keep an
        # in-RAM copy for the indexing loop
        sw['stop_words'] = stopwords.words('english')
        stop_words = sw['stop_words']
        # build the inverted index in RAM first, then dump it to disk in
        # one pass (writing through the shelf per term would be slower)
        cache_index = {}
        for doc_id in data:   # renamed from `id` to avoid shadowing the builtin
            index(doc_id)
        for term, postings in cache_index.items():
            idx[term] = postings

    elapsed = time.perf_counter() - start
    print(f"{__file__} executed in {elapsed:0.2f} seconds.")