Skip to content

Commit

Permalink
Convert out-param to multiple return values. Tested to work on 202401.
Browse files Browse the repository at this point in the history
  • Loading branch information
powe97 committed Oct 16, 2023
1 parent f1cd4ca commit ba23962
Showing 1 changed file with 36 additions and 28 deletions.
64 changes: 36 additions & 28 deletions scrapers/sis_scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,22 +91,32 @@ async def get_section_information(section_url):
return section_dict


async def get_class_information(class_url, registration_dates):
async def get_class_information(class_url):
global session

sections = []
course_data = {}
registration_dates = ()

async with session.get(class_url) as data:
data = BeautifulSoup(await data.text())

# Iterate through each section in the course and retrieve appropriate data
section_data = data.findAll("th", {"class": "ddtitle", "scope": "colgroup"})
registration_dates[0] = (
re.search(r"Registration Dates: </span>(.*?)\n", str(data))
.group(1)
.split(" to ")
)

# Get registration start and end dates, unless they don't exist/are hidden for
# some reason (this happened with arch planning course)
dates_rgx= re.search(r"Registration Dates: </span>(.*?)\n", str(data))
if(dates_rgx):
registration_dates = tuple(
datetime.strptime(d.strip(), "%b %d, %Y")
for d in (
dates_rgx
.group(1)
.split(" to ")
)
)

meeting_times = data.findAll(
"table",
{
Expand Down Expand Up @@ -189,7 +199,7 @@ async def get_class_information(class_url, registration_dates):
course_data["crse"] = sections[0]["crse"]
course_data["id"] = f"{course_data['subj']}-{str(course_data['crse']).zfill(4)}"
course_data["sections"] = sections
return course_data
return course_data, registration_dates


def parse_semester_page(text):
Expand Down Expand Up @@ -217,24 +227,27 @@ async def get_classes_with_code(term, code):
return await request.text()


async def scrape_subject(term, name, code, registration_dates):
async def scrape_subject(term, name, code):
courses_data = {"name": name, "code": code}
courses_data["courses"] = list(
subj_data = list(
filter(
lambda x: x != None,
await asyncio.gather(
*[
get_class_information(
f"https://sis.rpi.edu{clazz}", registration_dates
)
get_class_information(f"https://sis.rpi.edu{clazz}")
for clazz in parse_semester_page(
await get_classes_with_code(term, code)
)
]
),
)
)
return courses_data
if not subj_data:
courses_data["courses"], registration_dates = subj_data, ()
else:
courses_data["courses"], registration_dates = zip(*subj_data)
registration_dates = tuple(filter(bool, registration_dates))
return courses_data, registration_dates


async def get_subjects_for_term(term):
Expand All @@ -249,25 +262,20 @@ async def get_subjects_for_term(term):


async def scrape_term(term):
# This is a list to simulate pass-by-reference, since adding
# another return value here would be a pain
registration_dates_list = [None]

print(f"Scraping {term}")
courses = await asyncio.gather(
*[
scrape_subject(term, *subj, registration_dates_list)
for subj in await get_subjects_for_term(term)
]

courses, registration_dates = zip(
*await asyncio.gather(
*[scrape_subject(term, *subj) for subj in await get_subjects_for_term(term)]
)
)
# Remove empty entries (these happen when a subject has no courses in a semester,
# e.g. ITWS over arch summer) and just get the first pair of registration dates (begin/end)
registration_dates = list(filter(bool, registration_dates))[0][0]

registration_dates = [
datetime.strptime(d.strip(), "%b %d, %Y").strftime("%Y-%m-%d")
for d in registration_dates_list[0]
]
registration_dates_json = {
"registration_opens": registration_dates[0],
"registration_closes": registration_dates[1],
"registration_opens": registration_dates[0].strftime("%Y-%m-%d"),
"registration_closes": registration_dates[1].strftime("%Y-%m-%d"),
}

# Filter any defunct / empty departments from the list
Expand Down

0 comments on commit ba23962

Please sign in to comment.