-
Notifications
You must be signed in to change notification settings - Fork 3
/
tools.py
259 lines (198 loc) · 8.21 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import pandas as pd
from natsort import natsorted
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from statsmodels.stats.proportion import proportion_confint as ci
# CAPE results search page (sits behind UCSD single sign-on).
CAPEURL = 'https://cape.ucsd.edu/responses/Results.aspx'
# %2C is a URL-encoded comma; every instructor name is "Last, First", so
# searching for a bare comma matches (and dumps) the entire dataset.
CAPEDUMPURL = 'https://cape.ucsd.edu/responses/Results.aspx?Name=%2C'
# Page title that appears once SSO login has completed.
CAPETITLE = 'Course And Professor Evaluations (CAPE)'
def get_raw_cape_dataframe():
    """Scrape the full CAPE results table into a raw pandas DataFrame.

    Opens a Firefox window (requires Firefox + geckodriver), waits up to
    60 seconds for the user to complete UCSD SSO login, then loads the
    comma-search URL that returns every row of the dataset.

    Returns:
        pd.DataFrame: the first HTML table on the results page, unprocessed.

    Raises:
        selenium.common.exceptions.TimeoutException: if SSO is not
            completed within 60 seconds.
    """
    # launch browser using Selenium, need to have Firefox installed
    print('Opening a browser window...')
    driver = webdriver.Firefox()
    try:
        print('Browser window open, loading the page...')
        # get the page that lists all the data, first try
        driver.get(CAPEURL)
        print('Please enter credentials...')
        # block until SSO completes and the CAPE page title appears
        wait = WebDriverWait(driver, 60)
        wait.until(expected_conditions.title_contains(CAPETITLE))
        # get the page that lists all the data
        # (%2C is the comma, drops all the data since every professor
        # name has it)
        driver.get(CAPEDUMPURL)
        # read in the dataset from the html file
        df = pd.read_html(driver.page_source)[0]
        print('Dataset parsed, closing browser window.')
    finally:
        # always close the browser, even if the SSO wait times out or
        # parsing fails (the original leaked the window on any exception)
        driver.quit()
    return df
def get_clean_cape_dataframe(raw_cape_dataframe, terms):
df = raw_cape_dataframe
# only looking at evaluations from 15/16 and 16/17
df = df[df.Term.isin(terms)]
# subset the columns we need
df = df[['Instructor', 'Course', 'Term', 'Evals Made', 'Rcmnd Class',
'Rcmnd Instr', 'Study Hrs/wk', 'Avg Grade Expected',
'Avg Grade Received']]
# rename the columns for convenience
df = df.rename(columns={
'Instructor': 'instr', 'Course': 'course', 'Term': 'term',
'Evals Made': 'evals', 'Rcmnd Class': 'rcmnd_class',
'Rcmnd Instr': 'rcmnd_instr', 'Study Hrs/wk': 'time',
'Avg Grade Expected': 'grade_expected',
'Avg Grade Received': 'grade_actual'
})
# drop rows that have data missing
df = df.dropna()
# only need the courses which hade at least one evaluation made
df = df[df['evals'] != 0]
# split to get the dept + course code
df.loc[:, 'course'] = df.course.str.split(' - ').apply(lambda x: x[0])
# convert the recommendation percentages to float values
# and resize to be in the interval [0, 1]:
df.loc[:, 'rcmnd_instr'] = (df.rcmnd_instr
.str.rstrip(' %')
.astype('float')) / 100
df.loc[:, 'rcmnd_class'] = (df.rcmnd_class
.str.rstrip(' %')
.astype('float')) / 100
"""We create a "weighted evals" column which contains the recommendation
percentage multiplied by the number of evals, yielding the approximate
number of positive recommendations. We round them to obtain integer values.
The exact numbers are available for every course, but it would require
scraping a lot of pages. Maybe in the next iteration."""
df['class_weighted_evals'] = ((df.evals * df.rcmnd_class).round()
.astype('int'))
df['instr_weighted_evals'] = ((df.evals * df.rcmnd_instr).round()
.astype('int'))
df['letter_expected'] = (df.grade_expected.str.split('(')
.apply(lambda x: x[0]))
df['gpa_expected'] = (df.grade_expected.str.split('(')
.apply(lambda x: x[-1])
.str.rstrip(')')
.astype('float'))
df['letter_actual'] = (df.grade_actual.str.split('(')
.apply(lambda x: x[0]))
df['gpa_actual'] = (df.grade_actual.str.split('(')
.apply(lambda x: x[-1])
.str.rstrip(')')
.astype('float'))
df = df.drop(['grade_expected', 'grade_actual'], axis=1)
# set and reset index to build an incremental index that starts at 0
df = df.set_index('instr').reset_index()
return df
def get_prof_ranking_dictionary(df):
    """Rank instructors per course by the Wilson confidence lower bound.

    Aggregates evals and positive-recommendation counts per (course,
    instructor), computes a Wilson score interval on the recommendation
    proportion, and sorts instructors by the interval's lower bound —
    a conservative estimate that penalizes small sample sizes.

    Args:
        df (pd.DataFrame): cleaned CAPE data with columns instr, course,
            evals, instr_weighted_evals.

    Returns:
        dict: course -> list of instructor names, best-ranked first.
    """
    df = df[['instr', 'course', 'evals', 'instr_weighted_evals']]
    gb = df.groupby(['course', 'instr']).sum()
    gb.loc[:, 'lower'], gb.loc[:, 'upper'] = ci(gb.instr_weighted_evals,
                                                gb.evals, method='wilson')
    # populate the dictionary, sorting each course once
    # (the original re-sorted the same course once per instructor)
    ranking = {}
    for course in gb.index.get_level_values('course').unique():
        professors_sorted = gb.loc[course].sort_values(by='lower',
                                                       ascending=False)
        ranking[course] = list(professors_sorted.index)
    return ranking
def get_time_dictionary(df):
    """Build a per-course dictionary describing expected weekly study time.

    Each course is compared against the all-course average: more than one
    standard deviation above gets a red warning, more than one below gets
    a green "relax" note, everything else is black/normal.

    Args:
        df (pd.DataFrame): cleaned CAPE data with columns course, time.

    Returns:
        dict: course -> {'expected': str hours/week, 'statement': str,
        'color': str}.
    """
    # groupby to get average time per course, rounded to 2 decimal places
    gb = df[['time', 'course']].groupby('course').mean().round(2)
    # average and standard deviation of time spent across all courses;
    # computed on the Series (float(gb.mean()) on the DataFrame is
    # deprecated in modern pandas)
    average = float(gb['time'].mean())
    sd = float(gb['time'].std())
    # deviation of each course from the overall average
    gb['dev'] = gb['time'] - average
    # warning statements
    warning = (
        'This course will take more time outside of class than average.'
    )
    normal = (
        'This course will take an average amount of time outside of class.'
    )
    relax = (
        'This course might take less time outside of class than average.'
    )

    def get_statement_and_color(dev, sd):
        # one-sided thresholds; the original's `abs(dev) < sd` middle case
        # mislabeled dev == sd (a full SD *above* average) as "relax"
        if dev > sd:
            return warning, 'red'
        if dev < -sd:
            return relax, 'green'
        return normal, 'black'

    time = {}
    for course in gb.index:
        statement, color = get_statement_and_color(gb.loc[course, 'dev'], sd)
        time[course] = {'expected': str(float(gb.loc[course, 'time'])),
                        'statement': statement, 'color': color}
    return time
def get_grade_dictionary(df):
    """Build a per-course dictionary describing the grade outlook.

    Compares the mean received GPA against the mean expected GPA: a gap
    larger than 0.4 GPA points in either direction produces a colored
    warning/relax statement.

    Args:
        df (pd.DataFrame): cleaned CAPE data with columns course,
            gpa_expected, gpa_actual.

    Returns:
        dict: course -> {'expected': letter grade, 'statement': str,
        'color': str}.
    """
    # subset the columns we need
    df = df[['course', 'gpa_expected', 'gpa_actual']]
    # groupby to get the mean grade and round to 2 decimal places
    gb = df.groupby('course').mean().round(2)
    # positive dev: students received better grades than they expected
    gb['dev'] = gb.gpa_actual - gb.gpa_expected
    # warning statements
    warning = (
        'Students tend to get lower grades than they expect for this course.'
    )
    normal = (
        'Students tend to get the grade they expect for this course.'
    )
    relax = (
        'Students tend to get higher grades than they expect for this course.'
    )

    def GPA_val_to_grade(val):
        """Map a numeric GPA to its letter grade on the UCSD scale."""
        if val == 4.0:
            grade = 'A'
        elif val >= 3.7:
            grade = 'A-'
        elif val >= 3.3:
            grade = 'B+'
        elif val >= 3.0:
            grade = 'B'
        elif val >= 2.7:
            grade = 'B-'
        elif val >= 2.3:
            grade = 'C+'
        elif val >= 2.0:
            grade = 'C'
        elif val >= 1.7:
            grade = 'C-'
        elif val >= 1.0:
            grade = 'D'
        else:
            # the original had no fallback and raised UnboundLocalError
            # for any GPA below 1.0
            grade = 'F'
        return grade

    def get_statement_and_color(dev):
        if dev > 0.4:
            return relax, 'green'
        if dev < -0.4:
            return warning, 'red'
        return normal, 'black'

    grade = {}
    for course in gb.index:
        statement, color = get_statement_and_color(gb.loc[course, 'dev'])
        grade[course] = {
            # NOTE(review): 'expected' is computed from gpa_actual —
            # presumably past received grades predict what a future student
            # should expect; confirm this is intentional
            'expected': GPA_val_to_grade(gb.loc[course, 'gpa_actual']),
            'color': color,
            'statement': statement
        }
    return grade
def get_depts_and_courses_dictionary(df):
    """Map each department code to its naturally-sorted course numbers.

    Args:
        df (pd.DataFrame): cleaned CAPE data with a course column holding
            "DEPT NUM" strings (e.g. "CSE 100").

    Returns:
        dict: dept code -> naturally-sorted list of course numbers.
    """
    # split "DEPT NUM" into two columns and drop duplicate pairs
    pairs = df.course.str.split(expand=True)
    pairs = pairs.rename(columns={0: 'dept', 1: 'course'})
    pairs = pairs.drop_duplicates()
    # natural sort so e.g. "CSE 9" precedes "CSE 100"
    depts = natsorted(pairs.dept.unique())
    indexed = pairs.set_index(['dept', 'course']).sort_index()
    depts_and_courses = {}
    for dept in depts:
        depts_and_courses[dept] = natsorted(indexed.loc[dept].index)
    return depts_and_courses