-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_courses.py
162 lines (130 loc) · 5.57 KB
/
get_courses.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import requests
import csv
import pandas as pd
import os
from data.encryption import encrypt_file
from ast import literal_eval
from get_courses import current_semester, term
import re
import openai
import numpy as np
# Credentials come from the environment; os.environ.get yields None for any
# variable that is not set.
api_key = os.environ.get("api_key")
openai.api_key = os.environ.get("openai_api_key")

# Creates the lookup of all course areas offered at the 5Cs:
# area code -> human-readable area description.
course_areas = requests.get('http://jicsweb.pomona.edu/api/courseareas')
area_code_to_area_name = {
    area['Code']: area['Description'] for area in course_areas.json()
}
def get_area_course_info(code):
    """Fetch this semester's courses for a single course area.

    Builds the Courses API URL from the module-level ``current_semester`` and
    the given area ``code``, and sends the module-level ``api_key`` as a
    query parameter.

    Args:
        code: Course-area code used as the last URL path segment.

    Returns:
        The decoded JSON body when the request succeeds, or ``None`` when
        the status code is not 200 or the body cannot be decoded.
    """
    base_url = "https://jicsweb.pomona.edu/api/Courses/"
    api_url = base_url + "/".join((current_semester, code))
    r = requests.get(api_url, params={"api_key": api_key})

    # Anything other than a 200 is reported and treated as "no data".
    if r.status_code != 200:
        print(r)
        return None

    try:
        print(f"Getting courses for {code} this semester")
        return r.json()
    except Exception as e:
        # An undecodable body is treated as "nothing offered in this area".
        print(f"No courses offered in course area {code} this semester", e)
    return None
def get_course_prereqs(course_description):
    """Extract the prerequisite text from a course description.

    Looks for the marker ``"Prerequisite:"`` first and ``"Prerequisites:"``
    second. Everything after the first occurrence of the matching marker is
    returned unchanged (leading whitespace included).

    Args:
        course_description: Full description text of the course.

    Returns:
        The text following the marker, or the string ``'None'`` when neither
        marker appears.
    """
    # Order matters: the singular marker wins when both are present.
    for marker in ("Prerequisite:", "Prerequisites:"):
        _, found, remainder = course_description.partition(marker)
        if found:
            return remainder
    return 'None'
def get_course_faculty(course):
    """Return the instructor names listed for a course.

    Args:
        course: Course record with an ``"Instructors"`` entry holding a
            sequence of records that each have a ``"Name"``.

    Returns:
        A list of names in their original order; empty when the course has
        no instructors. The name ``', taff'`` is reported as ``'Staff'``.
    """
    instructors = course["Instructors"]
    if not instructors:
        return []
    return [
        'Staff' if person['Name'] == ', taff' else person['Name']
        for person in instructors
    ]
def get_course_sched(course):
    """Collect the schedule details of a course as three parallel lists.

    Args:
        course: Course record with a ``'Schedules'`` entry; every schedule
            provides ``'Campus'``, ``'MeetTime'`` and ``'Weekdays'``.

    Returns:
        A tuple ``(campus, meet_time, weekdays)`` of lists with one item per
        schedule, e.g. campus "SC Campus", time "02:45-05:30PM",
        weekdays "TR". All three lists are empty when there are no schedules.
    """
    rows = [
        (entry['Campus'], entry['MeetTime'], entry['Weekdays'])
        for entry in course['Schedules']
    ]
    campus = [row[0] for row in rows]
    meet_time = [row[1] for row in rows]
    weekdays = [row[2] for row in rows]
    return (campus, meet_time, weekdays)
# Writing data to CSV: one row per course, for every course area that
# returned data this semester.
header = ['CourseArea', 'CourseCode', 'Name', 'Description',
          'Faculty', 'Campus', 'MeetTime', 'Weekdays', 'Prereqs']
with open('data/all_courses.csv', 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # The file is opened in append mode, so the header is only written when
    # the file is still empty; otherwise every rerun would insert another
    # header row in the middle of the data.
    if f.tell() == 0:
        writer.writerow(header)
    for area_code, course_area in area_code_to_area_name.items():
        area_courses = get_area_course_info(area_code)
        if not area_courses:
            # Request failed or the area has nothing to offer.
            continue
        for course in area_courses:
            try:
                course_code = course['CourseCode']
                name = course['Name']
                description = str(course['Description'])
                prereqs = get_course_prereqs(description)
                faculty = get_course_faculty(course)
                campus, meet_time, weekdays = get_course_sched(course)
            except Exception as e:
                print("Expection:", e,
                      " \nInsufficient information on course ", course)
                # Skip the malformed record. Falling through would write a
                # row assembled from the previous course's values (or raise
                # NameError when the very first course is malformed).
                continue
            data = [course_area, course_code, name, description,
                    faculty, campus, meet_time, weekdays, prereqs]
            writer.writerow(data)
def get_embedding(list_str):
    """Obtain one embedding per input string from the OpenAI API.

    A separate ``openai.Embedding.create`` request using the
    ``text-embedding-ada-002`` model is made for every string.

    If a rate limit error occurs, break ``list_str`` into chunks and call
    this function incrementally.

    Args:
        list_str: Iterable of strings to embed.

    Returns:
        A list with the ``'embedding'`` entry of the first result of each
        response (the AI output embedding, a list of floats), in input order.
    """
    return [
        openai.Embedding.create(
            model="text-embedding-ada-002",
            input=text,
        ).data[0]['embedding']
        for text in list_str
    ]
def create_vector_file(all_courses: pd.DataFrame) -> pd.DataFrame:
    """Normalise column types and attach a name embedding to every course.

    The frame is modified in place and also returned. No file is written
    here; pickling the result is left to the caller.

    Args:
        all_courses: Course table with string columns ``CourseArea`` and
            ``Name`` and iterable-valued columns ``Weekdays`` and ``Campus``.

    Returns:
        The same DataFrame, with ``CourseArea`` and ``Campus`` converted to
        sets, ``Name`` whitespace-normalised, and the new columns
        ``Weekday_set`` and ``vector`` added.
    """
    # convert string of area requirements -> list -> set
    all_courses['CourseArea'] = all_courses['CourseArea'].apply(
        lambda x: set(x.split(", ")))

    # get rid of extra spaces and tabs in course name
    all_courses['Name'] = all_courses['Name'].apply(
        lambda x: re.sub(r"\s+", " ", x).strip())

    # set(x) removes duplicate weekday strings (['TR', 'TR'] => {'TR'}),
    # min() picks a single string out of it ({'TR'} => 'TR'), and the outer
    # set() splits that string into individual days ('TR' => {'T', 'R'}).
    # default="" keeps a course with no schedule (empty Weekdays list) from
    # raising ValueError; such a course gets an empty set of days.
    # NOTE(review): with several distinct weekday strings only the smallest
    # one by string ordering is kept - confirm this is intended.
    all_courses['Weekday_set'] = all_courses['Weekdays'].apply(
        lambda x: set(min(set(x), default="")))

    # convert list of campus -> set
    all_courses['Campus'] = all_courses['Campus'].apply(lambda x: set(x))

    # get word vectors for the course names, append to df
    courses = all_courses.Name.to_list()
    vector_list = get_embedding(courses)
    all_courses['vector'] = vector_list
    return all_courses
# converters ensure Faculty, Campus, MeetTime, Weekdays are lists, not a
# string of a list
# try decrypted_courses_sp24.csv if all_courses.csv fails
list_columns = ('Faculty', 'Campus', 'MeetTime', 'Weekdays')
df = pd.read_csv(
    'decrypted_all_courses.csv',
    converters=dict.fromkeys(list_columns, literal_eval),
)
vector_df = create_vector_file(df)

# pkl file to preserve data types (csv can not do this)
pickle_path = f'data/vectors_{term}_courses.pkl'
vector_df.to_pickle(pickle_path)
encrypt_file(pickle_path, f'data/encrypted_vectors_{term}_courses.pkl')