import csv
import datetime
import hashlib
import hmac
import os
import re

import cs50
import dateparser
import pytz
import requests
import tabula
import telegram
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from googlesearch import search  # performing Google searches
from markdown import markdown
from nltk import sent_tokenize
from speech_to_text import takeCommand
from text_to_speech import speak

load_dotenv()
SECRET_KEY = b'10a01dcf33762d3a204cb96429918ff6'
API_KEY = '38e8643fb0dc04e8d65b99994d3dafff'
token = os.getenv("TELE_TOKEN")
chat_id = os.getenv("TELE_ID")  # Telegram chat to notify
s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'
bot = telegram.Bot(token=token)
db = cs50.SQL("sqlite:///database/schedule.db")
# REGEX
REGEX_DATE = r"(3[01]|[12][0-9]|0?[1-9])[-\/:|](1[0-2]|0?[1-9])([-\/:|](2[0-1][0-9][0-9]))"
REGEX_DAY_MONTH = r"(3[01]|[12][0-9]|0?[1-9])[-\/:|](1[0-2]|0?[1-9])"
REGEX_MONTH_YEAR = r"(1[0-2]|0?[1-9])([-\/:|](2[0-1][0-9][0-9]))"
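# e.g. REGEX_DATE matches "25/12/2021" or "5-9-2020", REGEX_DAY_MONTH matches "25/12",
# and REGEX_MONTH_YEAR matches "12-2021"; years are limited to 2000-2199.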
def regex_date(msg, timezone="Asia/Ho_Chi_Minh"):
    '''Use regex to capture a date string in msg and parse it into a datetime.'''
    tz = pytz.timezone(timezone)
    now = datetime.datetime.now(tz=tz)
    date_str = []
    pattern = re.compile("(%s|%s|%s)" % (
        REGEX_DATE, REGEX_MONTH_YEAR, REGEX_DAY_MONTH), re.UNICODE)
    for match in pattern.finditer(msg):
        _dt = match.group(0).replace("/", "-").replace("|", "-").replace(":", "-")
        # Zero-pad single-digit parts; rebuild the string rather than using
        # str.replace, which would also touch other occurrences of the digit.
        parts = ["0" + p if len(p) == 1 else p for p in _dt.split("-")]
        if len(parts) == 2 and 0 < int(parts[0]) < 32 and 0 < int(parts[1]) < 13:
            # Day and month only: assume the current year
            parts.append(str(now.year))
        date_str.append("-".join(parts))
    if date_str:
        return dateparser.parse(date_str[0], date_formats=['%d-%m-%Y'])
    # No explicit date found: look for Vietnamese relative-day words
    # ("hôm qua" = yesterday, "hôm nay" = today, "ngày mai" = tomorrow).
    aliases = [("bữa nay", "hôm nay"), ("mơi", "ngày mai"),
               ("hôm qua", "hôm qua"), ("hôm nay", "hôm nay"), ("ngày mai", "ngày mai"),
               ("mai", "ngày mai"), ("qua", "hôm qua"), ("nay", "hôm nay")]
    for word, phrase in aliases:
        if word in msg:
            return dateparser.parse(phrase)
    return None
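# Example (hypothetical input): regex_date("lịch học ngày 5/9/2021") should return
# datetime(2021, 9, 5, 0, 0); regex_date("lịch học ngày mai") falls back to
# dateparser's relative parsing of "ngày mai" (tomorrow).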
def getFile(url, name):
    # Download a PDF file from url and save it as <name>.pdf
    r = requests.get(url, stream=True)
    with open(os.path.join(os.getcwd(), f'{name}.pdf'), 'wb') as f:
        f.write(r.content)
    return name
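# Example (hypothetical): getFile("https://docs.google.com/uc?id=<file-id>&export=download", "myfile")
# writes ./myfile.pdf and returns "myfile".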
def pdfToCsv():
    # Convert the downloaded timetable PDF to CSV (tabula-py requires a Java runtime)
    tabula.convert_into("myfile.pdf", "output.csv", output_format="csv", pages='all')
    # Known column names: STT = row no., Thứ = weekday, Ngày = date, Giờ = time,
    # Số tiết = periods, Phòng = room, SL = student count, CBGD = lecturer,
    # Mã MH = course code, Tên môn = course name, Nhóm = group, Lớp = class.
    header = ["STT", "Thứ", "Ngày", "Giờ", "Số tiết", "Phòng", "SL", "CBGD", "Mã MH", "Tên môn", "Nhóm", "Lớp"]
    with open("output.csv", 'r', encoding='utf-8', errors='ignore') as data_file:
        lines = data_file.readlines()
    lines[0] = ",".join(header) + "\n"  # replace the first line (the "header") with the known column names
    with open("output.csv", 'w', encoding='utf-8', errors='ignore') as out_data:
        out_data.writelines(lines)
    os.remove("myfile.pdf")
def findAndGetSchedule(week_number):
    url = "https://camau.bdu.edu.vn/chuyen-muc/sinh-vien/chinh-quy"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the link to the weekly schedule post for the requested ISO week
    for link in soup.find_all('a'):
        title = link.get("title")
        if title and "lịch học tuần" in title.lower():
            post_date = regex_date(title)
            if post_date and int(post_date.date().isocalendar()[1]) == week_number:
                response = requests.get(link.get('href'))
                soup = BeautifulSoup(response.content, "html.parser")
                iframe = soup.find('div', {'class': 'td-post-content'}).find('p').find('iframe')
                file_id = iframe.get('src').split('/')[-2]  # Google Drive file id
                URL = f"https://docs.google.com/uc?id={file_id}&export=download"
                getFile(URL, 'myfile')
                pdfToCsv()
                save_schedule('output')
                return True
    return False
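# Example (hypothetical): findAndGetSchedule(37) looks for the post whose title dates to
# ISO week 37, downloads its PDF timetable, and imports it into the database.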
def insert_schedule(id, date_time, room, lecturer, subject, name_class, time, week_number):
    db.execute("INSERT INTO schedule(id, date_time, room, lecturer, subject, class, time, week_number)"
               " VALUES(?, ?, ?, ?, ?, ?, ?, ?)", id, date_time, room, lecturer, subject, name_class, time, week_number)
def save_schedule(filename):
    with open(f"{filename}.csv", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        flag = True
        for row in reader:
            print(row)
            try:
                date = datetime.datetime.strptime(row["Ngày"], '%d/%m/%Y').date()
                if flag:
                    week_number = date.isocalendar()[1]
                    flag = False
                insert_schedule(row["STT"] + str(week_number), date, row["Phòng"], row["CBGD"],
                                row["Tên môn"], row["Lớp"], row["Giờ"], week_number)
            except Exception:
                continue
            try:
                # Overflow cells land under the None key; treat them as the class column
                if row[None]:
                    db.execute("UPDATE schedule SET class=? WHERE id=?",
                               str(row[None]), row["STT"] + str(week_number))
            except Exception:
                continue
    os.remove("output.csv")
def get_schedule(class_name, date=None, week_number=None):
    dayOfWeek = ["Thứ 2", "Thứ 3", "Thứ 4", "Thứ 5", "Thứ 6", "Thứ 7", "Chủ Nhật"]
    schedule = []
    # If a specific date was given
    if date:
        week = date.isocalendar()[1]
        rows = db.execute("SELECT * FROM schedule WHERE week_number = ?", week)
        if rows:
            for i in rows:
                if i["date_time"] == str(date.date()) and class_name in i["class"].lower():
                    schedule.append(i)
        else:
            # Week not in the database yet: scrape it, then query again
            if findAndGetSchedule(date.isocalendar()[1]):
                rows = db.execute("SELECT * FROM schedule WHERE date_time=? AND class LIKE ?",
                                  date.date(), "%" + class_name + "%")
                if rows:
                    schedule = rows
    if week_number:
        week_number = str(week_number)
        rows = db.execute("SELECT * FROM schedule WHERE week_number=? AND class LIKE ?",
                          week_number, "%" + class_name + "%")
        if rows:
            schedule = rows
        else:
            if findAndGetSchedule(int(week_number)):
                rows = db.execute("SELECT * FROM schedule WHERE week_number = ? AND class LIKE ?",
                                  week_number, "%" + class_name + "%")
                if rows:
                    schedule = rows
    if schedule:
        for row in schedule:
            day = datetime.datetime.strptime(row['date_time'], '%Y-%m-%d')
            day_of_week = dayOfWeek[day.weekday()]
            time = row['time'].split("h")
            # "7h00" is spoken as "7 giờ"; "7h30" as "7 giờ 30"
            minutes = time[1] if len(time) > 1 and time[1] != "00" else ""
            # Announce the slot ("You have a class on <weekday>, <date> at <time> in room <room>")
            speak(f"Bạn có lịch học vào {day_of_week}, "
                  f"ngày {day.day} tháng {day.month} vào lúc {time[0] + ' giờ ' + minutes} "
                  f"tại phòng {row['room']}")
            bot.sendMessage(chat_id=chat_id,
                            text=f"{day_of_week} \n"
                                 f"Ngày {day.day} Tháng {day.month}\nVào lúc: {row['time']}\n"
                                 f"Phòng: {row['room']}",
                            disable_notification=True)
            print(f"Bạn có lịch học vào ngày {day} vào lúc {row['time']} tại phòng {row['room']}")
    else:
        speak("Xin lỗi mình không tìm được lịch học của bạn")  # "Sorry, I couldn't find your schedule"
def getHash256(a):
    # SHA-256 hex digest of a bytes-like input
    m = hashlib.sha256()
    m.update(a)
    return m.hexdigest()
def getHmac512(msg, key):
    # HMAC-SHA512 hex digest; msg and key must be bytes-like
    h = hmac.new(key, msg=msg, digestmod=hashlib.sha512)
    return h.hexdigest()
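# Both helpers expect bytes, e.g. getHash256(b"abc") or getHmac512(b"data", SECRET_KEY).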
def predict_answer(model, question, contexts, seq_len=512, debug=False):
    # Split each context into seq_len-sized chunks so long pages fit the model
    split_context = []
    if not isinstance(contexts, list):
        contexts = [contexts]
    for context in contexts:
        for i in range(0, len(context), seq_len):
            split_context.append(context[i:i+seq_len])
    # Build SQuAD-format records, one question per chunk
    f_data = []
    for i, c in enumerate(split_context):
        f_data.append(
            {'qas':
                [{'question': question,
                  'id': i,
                  'answers': [{'text': ' ', 'answer_start': 0}],
                  'is_impossible': False}],
             'context': c
             })
    prediction = model.predict(f_data)
    if debug:
        for x in prediction[0]:
            print(x['answer'][0])
    preds = [x['answer'][0].lower().strip() for x in prediction[0] if x['answer'][0].strip() != '']
    if preds:
        # Majority vote across chunks
        return max(set(preds), key=preds.count)
    return 'No answer'
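# A minimal usage sketch, assuming `model` is a simpletransformers-style
# QuestionAnsweringModel whose predict() takes SQuAD-format dicts and returns
# (answers, probabilities); the model name here is a hypothetical choice:
#   from simpletransformers.question_answering import QuestionAnsweringModel
#   model = QuestionAnsweringModel("bert", "bert-base-multilingual-cased", use_cuda=False)
#   print(predict_answer(model, "Thủ đô của Việt Nam là gì?", query_to_text("thủ đô Việt Nam")))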
def query_pages(query, n=1):
    return list(search(query, num_results=n, lang="vi"))
# Source: https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
def markdown_to_text(markdown_string):
    """Converts a markdown string to plaintext."""
    # md -> html -> text, since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)
    # remove code snippets (DOTALL so multi-line blocks are matched too)
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)
    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.find_all(string=True))
    return text
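# Example: markdown_to_text("# Title\nSome *emphasised* text") returns roughly
# "Title\nSome emphasised text".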
def format_text(text):
    text = markdown_to_text(text)
    text = text.replace('\n', ' ')
    return text
def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]
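# Example: list(chunks("abcdef", 2)) == ["ab", "cd", "ef"]; used below to keep
# sent_tokenize inputs bounded.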
def getContent(url):
    try:
        html = requests.get(url, timeout=3)
        tree = BeautifulSoup(html.text, 'lxml')
        # Drop scripts and styles
        for invisible_elem in tree.find_all(['script', 'style']):
            invisible_elem.extract()
        paragraphs = [p.get_text() for p in tree.find_all("p")]
        for para in tree.find_all('p'):
            para.extract()
        for href in tree.find_all(['a', 'strong']):
            href.unwrap()
        tree = BeautifulSoup(str(tree.html), 'lxml')
        text = tree.get_text(separator='\n\n')
        text = re.sub('\n +\n', '\n\n', text)
        paragraphs += text.split('\n\n')
        paragraphs = [re.sub(' +', ' ', p.strip()) for p in paragraphs]
        # Keep only paragraphs with more than 10 words
        paragraphs = [p for p in paragraphs if len(p.split()) > 10]
        for i in range(0, len(paragraphs)):
            sents = []
            text_chunks = list(chunks(paragraphs[i], 100000))
            for chunk in text_chunks:
                sents += sent_tokenize(chunk)
            sents = [s for s in sents if len(s) > 2]
            paragraphs[i] = ' . '.join(sents)
        txt = '\n\n'.join(paragraphs)
        # Truncate to roughly 800-1000 characters, ending at a sentence boundary
        if len(txt) > 1000:
            for i in range(len(txt)):
                if i > 800 and txt[i] == ".":
                    return txt[:i + 1].replace(". .", ".")
        return txt.replace(". .", ".")
    except Exception:
        # print('Cannot read ' + url, str(sys.exc_info()[0]))
        return ''
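# Example (hypothetical): getContent("https://vi.wikipedia.org/wiki/Việt_Nam") returns up to
# ~1000 characters of cleaned paragraph text, or '' if the page cannot be fetched.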
def query_to_text(query):
    # Try the top search results until one yields readable text
    for link in query_pages(query, 2):
        print(link)
        txt = format_text(getContent(link))
        if txt:
            return txt
    return ""
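# Example (hypothetical): query_to_text("dân số Việt Nam") googles the query and returns
# cleaned text from the first result page that yields content.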
def getStudentClass(studentClass):
    if not studentClass:
        speak("Trước hết hãy cho mình biết bạn học lớp nào nhé")  # "First, tell me which class you are in"
        q = takeCommand().lower()
        q = "".join(q.split(" "))  # strip spaces so "20 tt" becomes "20tt"
        found = re.findall(r"\d\d\w\w", q)
        if found:
            return found[0]
        return getStudentClass(studentClass)  # ask again until a class code is heard
    return studentClass
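# Example flow: getStudentClass(None) asks for the class over the microphone and returns
# a code like "20tt" (hypothetical) once something matching \d\d\w\w is heard.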
#txt = query_to_text("Số thí sinh tham dự thi đại học năm nay")
#print(txt)
#save_schedule("output")