-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
170 lines (130 loc) · 5.11 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import math
import re
from flask import Flask, render_template, request, jsonify
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
def load_vocab():
vocab = {}
with open("vocab.txt", "r") as f:
vocab_terms = f.readlines()
with open("idf-values.txt", "r") as f:
idf_values = f.readlines()
for (term, idf_value) in zip(vocab_terms, idf_values):
vocab[term.rstrip()] = int(idf_value.rstrip())
return vocab
def load_document():
with open("document.txt", "r") as f:
documents = f.readlines()
# print('Number of documents: ', len(documents))
# print('Sample document: ', documents[0])
return documents
# returns a dict with key : term, value : document indexes in which it is present
def load_inverted_index():
inverted_index = {}
with open('inverted_index.txt', 'r') as f:
inverted_index_terms = f.readlines()
for row_num in range(0,len(inverted_index_terms),2):
term = inverted_index_terms[row_num].strip()
documents = inverted_index_terms[row_num+1].strip().split()
inverted_index[term] = documents
# print('Size of inverted index: ', len(inverted_index))
return inverted_index
def load_link_of_qs():
with open("Leetcode-Scraping/qData/Qlink.txt", "r") as f:
links = f.readlines()
return links
def load_headings():
with open("Leetcode-Scraping/qData/index.txt", "r", encoding='utf-8', errors = "ignore") as f:
lines = f.readlines()
headings = []
for heading in lines:
words = heading.split()
heading = ' '.join(words[1:])
headings.append(heading)
return headings
vocab = load_vocab() # vocab : idf_values
document = load_document()
inverted_index = load_inverted_index()
Qlink = load_link_of_qs()
headings = load_headings()
# returns a dict with key : doc index, value : Normalized term frequency for this doc
def get_tf_dict(term):
tf_dict = {}
if term in inverted_index:
for doc in inverted_index[term]:
if doc not in tf_dict:
tf_dict[doc] = 1
else:
tf_dict[doc] += 1
for doc in tf_dict:
# dividing the freq of the word in doc with the total no of words in doc indexed document
try:
tf_dict[doc] /= len(document[int(doc)])
except (ZeroDivisionError, ValueError, IndexError) as e:
print(e)
print("Error in doc: ", doc)
return tf_dict
def get_idf_value(term):
return math.log( (1 + len(document)) / (1 + vocab[term]) )
def calc_docs_sorted_order(q_terms):
potential_docs = {} # will store the doc which can be our ans: sum of tf-idf value of that doc for all the query terms
q_Links = []
for term in q_terms:
if(term not in vocab):
continue
tf_vals_by_docs = get_tf_dict(term)
idf_value = get_idf_value(term)
# print(term, tf_vals_by_docs, idf_value)
for doc in tf_vals_by_docs:
if doc not in potential_docs:
potential_docs[doc] = tf_vals_by_docs[doc]*idf_value
else:
potential_docs[doc] += tf_vals_by_docs[doc]*idf_value
# print(potential_docs)
#divide the scores of each doc with no of query terms
for doc in potential_docs:
potential_docs[doc] /= len(q_terms)
# sort in dec order acc to values calculated
potential_docs = dict(sorted(potential_docs.items(), key =lambda item : item[1], reverse = True))
# if no doc found
if(len(potential_docs) == 0):
print("No matching question found. Please search with more relevant terms.")
count = 0
for doc_index in potential_docs:
q_Links.append([potential_docs[doc_index],
Qlink[int(doc_index) - 1],
headings[int(doc_index) - 1]])
count += 1
if(count > 20):
break
q_Links = sorted(q_Links, key =lambda item : item[0], reverse = True)
q_Links = q_Links[:20]
ans = [] # [link, heading]
for link in q_Links:
ans.append([link[1], link[2]])
return ans
# Use of Flask
app = Flask(__name__)
app.config['SECRET_KEY'] = 'abhinav'
# Search Form for user to input the query
class Search_Form(FlaskForm):
search = StringField("Enter your search terms: ")
submit = SubmitField("Search")
# @app.route("/<query>")
# def return_links(query):
# q_terms = [term.lower() for term in query.strip().split()]
# return jsonify(calc_docs_sorted_order(q_terms)[:20])
@app.route("/", methods = ['GET', 'POST'])
def home():
form = Search_Form()
ans = []
q_terms = []
if form.validate_on_submit():
query = form.search.data
q_terms = [term.lower() for term in query.strip().split()]
ans = calc_docs_sorted_order(q_terms)[:20]
if len(q_terms) != 0:
search_triggered = True
else:
search_triggered = False
return render_template('index.html', form = form, results = ans, search_triggered = search_triggered)