-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_data.py
76 lines (57 loc) · 2.6 KB
/
parse_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
from os import listdir
from bs4 import BeautifulSoup
import re
PATH = 'cache/'
course_names = {}
possible = set()
with open('class_names.csv', 'r') as f:
lines = f.read().splitlines()
for line in lines:
parsed = line.split(';')
course_names[parsed[0]] = parsed[1]
courses = []
for filename in listdir(PATH):
with open(PATH + filename, 'r') as f:
soup = BeautifulSoup(f.read())
title = soup.find('h1')
prof_time = soup.find('h2')
caption = soup.find('caption')
entries = soup.find_all('tr')[1:]
page_values = {}
for entry in entries:
entry = str(entry).replace('<tr>', '').replace('</tr>', '').replace('<td>', '').split('</td>')[:-1]
median = float(entry[-1])
label = entry[0].strip().lower()[:-1]
page_values[label] = median
possible.add(label)
parsed_caption = str(caption).replace(' surveyed ', '').replace(' enrolled</caption>', '').replace('"', '').split('\xc2\xa0')
if not parsed_caption[-3].isdigit() or parsed_caption[-1].isdigit():
continue
completed = str(int(parsed_caption[-3]))
total_entrolled = str(int(parsed_caption[-1]))
parsed_prof = str(prof_time).replace('<h2>', '').replace('</h2>', '').split('\xc2\xa0')
name = parsed_prof[0].strip().lower()
time = parsed_prof[-1].strip().lower()
parsed_title = str(title).replace('<h1>', '').replace('</h1>', '').replace('&', 'and').split()[:-1]
course_code = parsed_title[-1]
if not course_code.isdigit():
continue
try:
course_dep = re.match('^[a-zA-Z]+[0-9]{3}', filename).group(0)[:-3]
except AttributeError:
continue
course_dep_code = (course_dep + course_code).lower().replace(' ', '')
course_title = 'NULL'
if course_dep_code in course_names:
course_title = course_names[course_dep_code]
courses.append({'course_code' : course_code, 'ratings' : page_values, 'prof' : name, 'time' : time, 'course_dep' : course_dep, 'course_title' : course_title, 'completed' : completed, 'total_entrolled' : total_entrolled})
# Add completed and total enrolled
print ';'.join(['course_department', 'course_code', 'professor', 'time', 'course_title', 'completed', 'total_entrolled'] + list(possible))
for course in courses:
output = [course['course_dep'], course['course_code'], course['prof'], course['time'], course['course_title'], course['completed'], course['total_entrolled']]
for rating in possible:
if rating in course['ratings']:
output.append(str(course['ratings'][rating]))
else:
output.append('NULL')
print ';'.join(output)