From 094a466ab2198cc58a80b34c566d7b24deb21b85 Mon Sep 17 00:00:00 2001 From: Anderson T Date: Wed, 19 Jun 2024 20:29:31 -0700 Subject: [PATCH 1/4] =?UTF-8?q?v3=20of=20the=20api=20=F0=9F=8E=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Controller.py | 435 +++++++++++++++++---------- api.py | 314 +++++++++++-------- requirements.txt | 1 + sdk/parsers/AttributesParser.py | 26 +- sdk/parsers/CatalogueParser.py | 27 +- sdk/parsers/SemesterParser.py | 67 +++-- sdk/parsers/TransferParser.py | 145 --------- sdk/schema/Attribute.py | 21 -- sdk/schema/BaseModels.py | 37 +++ sdk/schema/CourseAttribute.py | 37 +++ sdk/schema/CourseOutline.py | 25 ++ sdk/schema/CoursePage.py | 36 +++ sdk/schema/CourseSummary.py | 41 ++- sdk/schema/ScheduleEntry.py | 29 +- sdk/schema/Section.py | 40 ++- sdk/schema/Transfer.py | 50 ++- sdk/schema_built/ApiResponses.py | 17 +- sdk/schema_built/Course.py | 156 ---------- sdk/schema_built/CourseMax.py | 183 +++++++++++ sdk/schema_built/Semester.py | 48 --- sdk/scrapers/DownloadTransferInfo.py | 30 +- 21 files changed, 987 insertions(+), 778 deletions(-) delete mode 100644 sdk/parsers/TransferParser.py delete mode 100644 sdk/schema/Attribute.py create mode 100644 sdk/schema/BaseModels.py create mode 100644 sdk/schema/CourseAttribute.py create mode 100644 sdk/schema/CourseOutline.py create mode 100644 sdk/schema/CoursePage.py delete mode 100644 sdk/schema_built/Course.py create mode 100644 sdk/schema_built/CourseMax.py delete mode 100644 sdk/schema_built/Semester.py diff --git a/Controller.py b/Controller.py index d60fb2e..5530339 100644 --- a/Controller.py +++ b/Controller.py @@ -7,7 +7,8 @@ from main import PREBUILTS_DIRECTORY -from sdk.schema.Attribute import AttributeDB +from sdk.schema.BaseModels import Course, Semester +from sdk.schema.CourseAttribute import CourseAttributeDB from sdk.schema.CourseSummary import CourseSummaryDB from sdk.schema.Section import SectionAPI, SectionDB from 
sdk.schema.ScheduleEntry import ScheduleEntryDB @@ -17,7 +18,7 @@ from sqlalchemy.orm import selectinload from pydantic.json import pydantic_encoder -from sdk.schema_built.Course import CourseBase, CourseAPIBuild, CourseDB +from sdk.schema_built.CourseMax import CourseMax, CourseMaxDB from sdk.scrapers.DownloadLangaraInfo import fetchTermFromWeb from sdk.parsers.SemesterParser import parseSemesterHTML from sdk.parsers.CatalogueParser import parseCatalogueHTML @@ -30,6 +31,8 @@ def __init__(self, db_path="database/database.db", db_type="sqlite") -> None: connect_args = {"check_same_thread": False} self.engine = create_engine(f"{db_type}:///{db_path}", connect_args=connect_args) + self.existing_courses:dict[str, list[int]] = {} + # you should probably call this before doing anything def create_db_and_tables(self): # create db and tables if they don't already exist @@ -69,6 +72,47 @@ def updateLatestSemester(self, use_cache=False): # now = datetime.today().strftime('%Y-%m-%d %H:%M:%S') # print(f"[{now}] Fetched new data from Langara. {len(changes)} changes found.") return changes + + + # Build the entire database from scratch. + # Takes approximately 45 minutes from a live connection + def buildDatabase(self, use_cache=False): + print("Building database...\n") + start = time.time() + + # Download, parse and save Transfer Information + # Takes 20-30 minutes from live and 20 seconds from cache. 
+ print("=== FETCHING TRANSFER INFORMATION ===") + self.fetchParseSaveTransfers(use_cache) + timepoint1 = time.time() + print(f"Transfer information downloaded and parsed in {Controller.timeDeltaString(start, timepoint1)}") + print() + + # Download, parse and save Langara Information + # Takes 20-30 minutes from live and 10 - 5 minutes from cache + print("=== FETCHING SEMESTERLY INFORMATION ===") + + year, term = 1999, 20 # oldest records available on Banner + out = True + + while out: + out = self.updateSemester(year, term, use_cache) + year, term = Controller.incrementTerm(year, term) + + timepoint2 = time.time() + print(f"Langara information downloaded and parsed in {Controller.timeDeltaString(timepoint1, timepoint2)}") + print() + + # Takes approximately 3 minutes + print("=== GENERATING AGGREGATIONS & PREBUILTS ===") + self.genIndexesAndPreBuilts() + timepoint3 = time.time() + print(f"Database indexes built in {Controller.timeDeltaString(timepoint2, timepoint3)}") + print() + + + print(f"Database built in {Controller.timeDeltaString(start, timepoint3)}!") + @@ -76,12 +120,15 @@ class SemesterInternal(SQLModel): year: int = Field(description='Year of semester e.g. ```2024```.') term: int = Field(description='Term of semester e.g. 
```30```.') - attributes: list[AttributeDB] = Field(default=[]) + attributes: list[CourseAttributeDB] = Field(default=[]) courseSummaries: list[CourseSummaryDB] = Field(default=[]) sections: list[SectionDB] = Field(default=[], description='List of sections in the semester.') schedules: list[ScheduleEntryDB] = Field(default=[]) - def updateSemester(self, year, term, use_cache=False) -> bool | None: + # gets data that is by semester + # sections, catalogue, and attributes + # returns true when the semester is updated or None if it can't find data for the given semester + def updateSemester(self, year:int, term:int, use_cache:bool=False) -> bool | None: warehouse = Controller.SemesterInternal(year=year, term=term) @@ -117,11 +164,26 @@ def updateSemester(self, year, term, use_cache=False) -> bool | None: # how do you implement an UPSERT in SQLModel??? with Session(self.engine) as session: + # save Semester if it doesn't exist + statement = select(Semester).where(Semester.year==year, Semester.term==term) + results = session.exec(statement) + result = results.first() + + if result == None: + s = Semester( + id=f'SMTR-{year}-{term}', + year=year, + term=term + ) + session.add(s) + # TODO: move changes watcher to its own service for c in warehouse.sections: - result = session.get(SectionDB, c.id) + self.checkCourseExists(session, c.subject, c.course_code, c) + result = session.get(SectionDB, c.id) + # insert if it doesn't exist or update if it does exist if result == None: session.add(c) @@ -131,8 +193,8 @@ def updateSemester(self, year, term, use_cache=False) -> bool | None: session.add(result) - for s in warehouse.schedules: - result = session.get(ScheduleEntryDB, s.id) + for s in warehouse.schedules: + result = session.get(ScheduleEntryDB, s.id) # insert if it doesn't exist or update if it already exists if result == None: @@ -143,6 +205,8 @@ def updateSemester(self, year, term, use_cache=False) -> bool | None: session.add(result) for cs in warehouse.courseSummaries: + 
self.checkCourseExists(session, cs.subject, cs.course_code, cs) + result = session.get(CourseSummaryDB, cs.id) # insert if it doesn't exist or update if it already exists @@ -154,7 +218,8 @@ def updateSemester(self, year, term, use_cache=False) -> bool | None: session.add(result) for a in warehouse.attributes: - result = session.get(AttributeDB, a.id) + self.checkCourseExists(session, a.subject, a.course_code, a) + result = session.get(CourseAttributeDB, a.id) # insert if it doesn't exist or update if it already exists if result == None: @@ -175,26 +240,43 @@ def timeDeltaString(time1:float, time2:float) -> str: hours, rem = divmod(time2-time1, 3600) minutes, seconds = divmod(rem, 60) return "{:0>2}:{:0>2}:{:02d}".format(int(hours),int(minutes),int(seconds)) + + def checkCourseExists(self, session:Session, subject:str, course_code:int, obj) -> None: + if type(subject) != str: + print("BAD!!!!") + print(obj) + print(subject, course_code) + input() + # check in-memory index before going out to the database + # performance impact not tested but I/O is always slow + if subject in self.existing_courses and course_code in self.existing_courses["subject"]: + return + + statement = select(Course).where(Course.subject == subject, Course.course_code == course_code).limit(1) + results = session.exec(statement) + result = results.first() - - # Build the entire database from scratch. - # Takes approximately 45 minutes from a live connection - def buildDatabase(self, use_cache=False): - print("Building database...\n") - start = time.time() + if result == None: + # CRSE-ENGL-1123 + c = Course(id=f'CRSE-{subject}-{course_code}', subject=subject, course_code=course_code) + session.add(c) - # Download / Save Transfer Information - # Also takes 30 - 20 minutes from live and 22 seconds from cache. 
+ # save to index if course doesn't exist in index already + if subject in self.existing_courses: + self.existing_courses["subject"].append(course_code) + else: + self.existing_courses["subject"] = [course_code] + + + def fetchParseSaveTransfers(self, use_cache): transfers = getTransferInformation(use_cache=use_cache) with Session(self.engine) as session: for i, t in enumerate(transfers): - if i % 5000==0: - print(f"Storing transfer agreements... ({i}/{len(transfers)})") - statement = select(TransferDB).where(TransferDB.id == t.id).limit(1) - results = session.exec(statement) - result = results.first() + self.checkCourseExists(session, t.subject, t.course_code, t) + + result = session.get(TransferDB, t.id) # insert if it doesn't exist or update if it already exists if result == None: @@ -203,98 +285,75 @@ def buildDatabase(self, use_cache=False): new_data = t.model_dump() result.sqlmodel_update(new_data) session.add(result) + + if i % 5000==0: + print(f"Storing transfer agreements... ({i}/{len(transfers)})") session.commit() - - timepoint1 = time.time() - print(f"Transfer information downloaded and parsed in {Controller.timeDeltaString(start, timepoint1)}") - print() - - # Download / Save Langara Tnformation - # Takes 30 - 20 minutes from live and 10 - 5 minutes from cache - year, term = 1999, 20 # oldest records available on Banner - while True: - - out = self.updateSemester(year, term, use_cache) - - if out == None: # this means we've parsed all results - print(f"{year}{term} : No courses found!") - break - - year, term = Controller.incrementTerm(year, term) - - timepoint2 = time.time() - print(f"Langara information downloaded and parsed in {Controller.timeDeltaString(timepoint1, timepoint2)}") - print() - - # Takes approximately 3 minutes - print("Generating aggregated course data.") - self.genIndexesAndPreBuilts() - - timepoint3 = time.time() - print(f"Database indexes built in {Controller.timeDeltaString(timepoint2, timepoint3)}") - - - print(f"Database built 
in {Controller.timeDeltaString(start, timepoint3)}!") - + + def genIndexesAndPreBuilts(self) -> None: self._generateCourseIndexes() self._generatePreBuilds() - - def _generatePreBuilds(self) -> None: - - out = [] - - # get all courses for the given semester - with Session(self.engine) as session: - statement = select(SectionDB.subject, SectionDB.course_code).distinct() - results = session.exec(statement) - courses = results.all() - - for c in courses: - out.append(self.buildCourse(c[0], c[1], return_offerings=True)) - - with open(PREBUILTS_DIRECTORY + "allInfo.json", "w+") as fi: - fi.write(json.dumps(out, default=pydantic_encoder)) - + # generate the Course def _generateCourseIndexes(self) -> None: # get list of courses with Session(self.engine) as session: - statement = select(CourseSummaryDB.subject, CourseSummaryDB.course_code).distinct() - statement2 = select(SectionDB.subject, SectionDB.course_code).distinct() - - results = session.exec(union(statement, statement2)) + statement = select(Course.subject, Course.course_code).distinct() + results = session.exec(statement) courses = results.all() + # print(courses) + i = 0 for subject, course_code in courses: if i % 500 == 0: - print(f"Generating indices... ({i}/{len(courses)+1})") + print(f"Generating course summaries... ({i}/{len(courses)+1})") i+=1 - c = CourseDB( - id=f"CRS-{subject}-{course_code}", + c = CourseMaxDB( + id=f"CMAX-{subject}-{course_code}", subject=subject, course_code=course_code ) - - statement = select(AttributeDB).where( - AttributeDB.subject == subject, - AttributeDB.course_code == course_code - ).order_by(col(AttributeDB.year).desc(), col(AttributeDB.term).desc()).limit(1) - result = session.exec(statement).first() - if result: - c.latest_attribute_id = result.id + """ + The purpose of the following code is to get the freshest values where they exist + So we want the latest fees, the latest course description, etc. + This takes quite a bit of effort to build... 
+ """ - statement = select(CourseSummaryDB).where(CourseSummaryDB.subject == subject, CourseSummaryDB.course_code == course_code).order_by(col(CourseSummaryDB.year).desc(), col(CourseSummaryDB.term).desc()).limit(1) + # TODO: TO BE REPLACED BY CoursePage once scraper is implemented + statement = select(CourseSummaryDB).where( + CourseSummaryDB.subject == subject, + CourseSummaryDB.course_code == course_code + ).order_by(col(CourseSummaryDB.year).desc(), col(CourseSummaryDB.term).desc()).limit(1) results = session.exec(statement) - result = results.first() - if result: - c.latest_course_summary_id = result.id + r = session.exec(statement).first() + if r: + c.credits = r.credits + c.title = r.title + c.description = r.description + c.hours_lecture = r.hours_lecture + c.hours_seminar = r.hours_seminar + c.hours_lab = r.hours_lab + + + statement = select(CourseAttributeDB).where( + CourseAttributeDB.subject == subject, + CourseAttributeDB.course_code == course_code + ).order_by(col(CourseAttributeDB.year).desc(), col(CourseAttributeDB.term).desc()).limit(1) + r = session.exec(statement).first() + if r: + c.attr_ar = r.attr_ar + c.attr_hum = r.attr_hum + c.attr_lsc = r.attr_lsc + c.attr_sci = r.attr_sci + c.attr_soc = r.attr_soc + c.attr_ut = r.attr_ut statement = select(SectionDB).where( @@ -304,18 +363,56 @@ def _generateCourseIndexes(self) -> None: ).limit(1) results = session.exec(statement) - result = results.first() - # a course can have info out without a section being public yet - if result: - c.latest_section_id = result.id + r = session.exec(statement).first() + if r: + c.RP = r.RP + c.abbreviated_title = r.abbreviated_title + c.add_fees = r.add_fees + c.rpt_limit = r.rpt_limit + + + if c.title == None or c.credits == None: + statement = select(TransferDB).where( + TransferDB.subject == subject, + TransferDB.course_code == course_code, + ) + + results = session.exec(statement) + results = session.exec(statement).all() + for r in results: + if r.source_title != 
None and c.title == None: + c.title = r.source_title + if r.source_credits != None and c.credits == None: + c.credits = r.source_credits + + # generate availability + statement = select(SectionDB).where( + SectionDB.subject == subject, + SectionDB.course_code == course_code + ).order_by(col(SectionDB.year).desc(), col(SectionDB.term).desc() + ).limit(1) + r = session.exec(statement).first() + if r: + c.last_offered_year = r.year + c.last_offered_term = r.term + + statement = select(SectionDB).where( + SectionDB.subject == subject, + SectionDB.course_code == course_code + ).order_by(col(SectionDB.year).asc(), col(SectionDB.term).asc() + ).limit(1) + r = session.exec(statement).first() + + c.first_offered_year = r.year + c.first_offered_term = r.term + + # TODO: calculate availability here. + c.availability = None - # save - # print(c.id) - statement = select(CourseDB).where(CourseDB.id == c.id).limit(1) - results = session.exec(statement) - result = results.first() - # print(result) + # save CourseMax to the database once we are done + + result = session.get(CourseMaxDB, c.id) # insert if it doesn't exist or update if it already exists if result == None: session.add(c) @@ -326,88 +423,106 @@ def _generateCourseIndexes(self) -> None: session.commit() + + def _generatePreBuilds(self) -> None: + + out = [] + + # get all courses for the given semester + with Session(self.engine) as session: + statement = select(Course.subject, Course.course_code).distinct() + results = session.exec(statement) + courses = results.all() + + # for c in courses: + # out.append(self.buildCourse(c[0], c[1], return_offerings=True)) + + # with open(PREBUILTS_DIRECTORY + "allInfo.json", "w+") as fi: + # fi.write(json.dumps(out, default=pydantic_encoder)) - def buildCourse(self, subject, course_code, return_offerings=True) -> CourseAPIBuild | None: + + + # def buildCourse(self, subject, course_code, return_offerings=True) -> CourseAPIBuild | None: - with Session(self.engine) as session: + # with 
Session(self.engine) as session: - statement = select(CourseDB).where(CourseDB.subject == subject, CourseDB.course_code == course_code).limit(1) - sources = session.exec(statement).first() + # statement = select(CourseBuiltDB).where(CourseBuiltDB.subject == subject, CourseBuiltDB.course_code == course_code).limit(1) + # sources = session.exec(statement).first() - if sources == None: - return None + # if sources == None: + # return None - api_response = CourseAPIBuild( - id=sources.id, - subject=sources.subject, - course_code=sources.course_code - ) + # api_response = CourseAPIBuild( + # id=sources.id, + # subject=sources.subject, + # course_code=sources.course_code + # ) - if sources.latest_attribute_id: - result = session.get(AttributeDB, sources.latest_attribute_id) - api_response.sqlmodel_update(result) + # if sources.latest_attribute_id: + # result = session.get(CourseAttributeDB, sources.latest_attribute_id) + # api_response.sqlmodel_update(result) - if sources.latest_course_summary_id: - result = session.get(CourseSummaryDB, sources.latest_course_summary_id) - api_response.sqlmodel_update(result) + # if sources.latest_course_summary_id: + # result = session.get(CourseSummaryDB, sources.latest_course_summary_id) + # api_response.sqlmodel_update(result) - if sources.latest_section_id: - result = session.get(SectionDB, sources.latest_section_id) - wanted_attributes = { - "RP" : result.RP, - "abbreviated_title": result.abbreviated_title, - "add_fees" : result.add_fees, - "rpt_limit" : result.rpt_limit - } - api_response.sqlmodel_update(wanted_attributes) - api_response.last_offered_year = result.year - api_response.last_offered_term = result.term + # if sources.latest_section_id: + # result = session.get(SectionDB, sources.latest_section_id) + # wanted_attributes = { + # "RP" : result.RP, + # "abbreviated_title": result.abbreviated_title, + # "add_fees" : result.add_fees, + # "rpt_limit" : result.rpt_limit + # } + # api_response.sqlmodel_update(wanted_attributes) 
+ # api_response.last_offered_year = result.year + # api_response.last_offered_term = result.term - statement = select( - SectionDB.year, SectionDB.term - ).order_by( - col(SectionDB.year).asc(), - col(SectionDB.term).asc() - ).limit(1) - result = session.exec(statement).first() - api_response.first_offered_year = result[0] - api_response.first_offered_term = result[1] + # statement = select( + # SectionDB.year, SectionDB.term + # ).order_by( + # col(SectionDB.year).asc(), + # col(SectionDB.term).asc() + # ).limit(1) + # result = session.exec(statement).first() + # api_response.first_offered_year = result[0] + # api_response.first_offered_term = result[1] - # TODO: - # calculate availability - # extract prerequisites - # extract restriction + # # TODO: + # # calculate availability + # # extract prerequisites + # # extract restriction - # get transfers - id = f"CRS-{subject}-{course_code}" - statement = select(TransferDB).where(TransferDB.course_id == id) - result = session.exec(statement).all() - api_response.transfers = result + # # get transfers + # id = f"CRS-{subject}-{course_code}" + # statement = select(TransferDB).where(TransferDB.course_id == id) + # result = session.exec(statement).all() + # api_response.transfers = result - # Get all sections and their schedules in one go using eager loading - # this is dark sqlalchemy magic that was invoked by chatgpt, don't ask me how it works - if return_offerings: + # # Get all sections and their schedules in one go using eager loading + # # this is dark sqlalchemy magic that was invoked by chatgpt, don't ask me how it works + # if return_offerings: - statement = select( - SectionDB - ).where(SectionDB.subject == subject, - SectionDB.course_code == course_code - ).options(selectinload(SectionDB.schedule) - ).order_by(SectionDB.year.asc(), SectionDB.term.asc()) + # statement = select( + # SectionDB + # ).where(SectionDB.subject == subject, + # SectionDB.course_code == course_code + # 
).options(selectinload(SectionDB.schedule) + # ).order_by(SectionDB.year.asc(), SectionDB.term.asc()) - results = session.exec(statement).unique() - sections = results.all() + # results = session.exec(statement).unique() + # sections = results.all() - api_response.offerings = sections + # api_response.offerings = sections - # reset the unique id because it gets overwritten - api_response.id = f"CRS-{subject}-{course_code}" + # # reset the unique id because it gets overwritten + # api_response.id = f"CRS-{subject}-{course_code}" - return api_response + # return api_response if __name__ == "__main__": diff --git a/api.py b/api.py index b9c8e52..b78ffcc 100644 --- a/api.py +++ b/api.py @@ -6,6 +6,9 @@ from fastapi import FastAPI, HTTPException, Query from fastapi.responses import FileResponse, HTMLResponse +from fastapi import Depends, FastAPI, HTTPException, Query +from sqlmodel import Field, Relationship, Session, SQLModel, create_engine, select + from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.gzip import GZipMiddleware @@ -17,16 +20,20 @@ from Controller import Controller -from sdk.schema.Attribute import Attribute, AttributeDB +# DATABASE STUFF +from sdk.schema.CourseAttribute import CourseAttributeDB +from sdk.schema.CourseOutline import CourseOutlineDB +from sdk.schema.CoursePage import CoursePage from sdk.schema.CourseSummary import CourseSummaryDB - +from sdk.schema.ScheduleEntry import ScheduleEntryDB from sdk.schema.Section import SectionDB, SectionAPI -from sdk.schema.ScheduleEntry import ScheduleEntry, ScheduleEntryDB, ScheduleEntryAPI -from sdk.schema.Transfer import Transfer +from sdk.schema.Transfer import TransferDB + +from sdk.schema.BaseModels import Course, Semester -from sdk.schema_built.ApiResponses import IndexCourseList, IndexSemester, IndexSemesterList -from sdk.schema_built.Course import CourseDB, CourseAPI, CourseAPIExt, CourseBase, CourseAPIBuild -from sdk.schema_built.Semester import Semester, SemesterCourses, 
SemesterSections +# RESPONSE STUFF +from sdk.schema_built.ApiResponses import IndexCourseList, IndexSemesterList +from sdk.schema_built.CourseMax import CourseMax, CourseMaxAPI, CourseMaxAPIOnlyTransfers, CourseMaxDB from main import ARCHIVES_DIRECTORY, DB_LOCATION, PREBUILTS_DIRECTORY @@ -38,7 +45,21 @@ # database controller controller = Controller() +def get_session(): + with Session(controller.engine) as session: + yield session + +# === STARTUP STUFF === + +@repeat(every(60).minutes) +def hourly(use_cache: bool = False): + controller.updateLatestSemester(use_cache) + +@repeat(every(24).hours) +def daily(use_cache: bool = False): + controller.buildDatabase(use_cache) + if not os.path.exists("database/"): os.mkdir("database") @@ -51,39 +72,32 @@ if not os.path.exists(ARCHIVES_DIRECTORY): os.mkdir(ARCHIVES_DIRECTORY) + if (os.path.exists(DB_LOCATION)): print("Database found.") + controller.create_db_and_tables() + hourly(use_cache=True) else: controller.create_db_and_tables() print("Database not found. Building database from scratch.") # save results to cache if cache doesn't exist controller.buildDatabase(use_cache=True) -@repeat(every(60).minutes) -def hourly(use_cache:bool = False): - controller.updateLatestSemester(use_cache) -@repeat(every(24).hours) -def daily(use_cache:bool = False): - controller.buildDatabase(use_cache) - -def startup(): - controller.create_db_and_tables() - hourly(True) +# controller.create_db_and_tables() +# hourly(use_cache=False) + +# controller.create_db_and_tables() +# controller.buildDatabase(use_cache=True) # startup() -# daily(True) +# daily(use_cache=True) +# === FASTAPI STARTUP STUFF === @asynccontextmanager async def lifespan(app: FastAPI): yield - -origins = [ - "*", -] - -# better api stuff description = "Gets course data from the Langara website. Data refreshes hourly. All data belongs to Langara College or BC Transfer Guide and is summarized here in order to help students. Pull requests welcome!" 
app = FastAPI( @@ -94,7 +108,12 @@ async def lifespan(app: FastAPI): lifespan=lifespan ) -app.add_middleware(GZipMiddleware, minimum_size=500) # only gzip responses above 500 bytes +# gzip responses above 500 bytes +app.add_middleware(GZipMiddleware, minimum_size=500) + +origins = [ + "*", +] app.add_middleware( CORSMiddleware, @@ -111,18 +130,18 @@ async def lifespan(app: FastAPI): "/index/latest_semester", summary="Latest semester.", description="Returns the latest semester from which data is available", - response_model=IndexSemester + response_model=Semester ) -async def semesters_all() -> dict[str, int]: - with Session(controller.engine) as session: - statement = select(CourseSummaryDB.year, CourseSummaryDB.term).order_by(col(CourseSummaryDB.year).desc(), col(CourseSummaryDB.term).desc()).distinct().limit(1) - results = session.exec(statement) - result = results.all() - - return { - "year": result[0][0], - "term": result[0][1] - } +async def index_latest_semester( + *, + session: Session = Depends(get_session), +): + + statement = select(Semester).order_by(col(Semester.year).desc(), col(Semester.term).desc()).distinct().limit(1) + results = session.exec(statement) + result = results.first() + + return result @app.get( "/index/semesters", @@ -130,16 +149,21 @@ async def semesters_all() -> dict[str, int]: description="Returns all semesters from which data is available", response_model=IndexSemesterList ) -async def semesters_all() -> list[str]: - with Session(controller.engine) as session: - statement = select(AttributeDB.year, AttributeDB.term).order_by(col(AttributeDB.year).desc(), col(AttributeDB.term).desc()).distinct() - results = session.exec(statement) - result = results.all() - - return IndexSemesterList( - count = len(result), - semesters = result - ) +async def index_semesters( + *, + session: Session = Depends(get_session), +): + + statement = select(Semester + ).order_by( col(Semester.year).desc(), col(Semester.term).desc() + ).distinct() + results = 
session.exec(statement) + result = results.all() + + return IndexSemesterList( + count = len(result), + semesters = result + ) @app.get( @@ -148,89 +172,104 @@ async def semesters_all() -> list[str]: description="Returns all known subjects and their courses.", response_model=IndexCourseList ) -async def courses() -> IndexCourseList: +async def index_courses( + *, + session: Session = Depends(get_session), +) -> IndexCourseList: - with Session(controller.engine) as session: - statement = select(CourseSummaryDB.subject, CourseSummaryDB.course_code).order_by(col(CourseSummaryDB.subject).asc()).distinct() - results = session.exec(statement) - result = results.all() - - subjects:dict[str, list[int]] = {} - for r in result: - if r[0] not in subjects: - subjects[r[0]] = [] - subjects[r[0]].append(r[1]) - - return IndexCourseList( - subject_count = len(subjects), - course_code_count= len(result), - subjects = subjects - ) + statement = select(Course.subject, Course.course_code).order_by(col(Course.subject).asc(), col(Course.course_code).asc()).distinct() + results = session.exec(statement) + result = results.all() + + subjects:dict[str, list[int]] = {} + for r in result: + if r[0] not in subjects: + subjects[r[0]] = [] + subjects[r[0]].append(r[1]) + + return IndexCourseList( + subject_count = len(subjects), + course_code_count= len(result), + subjects = subjects + ) + - @app.get( - "/semester/courses/{year}/{term}", + "/semester/{year}/{term}/courses", summary="Semester course data.", description="Returns all courses for a semester" ) -async def semester(year:int, term:int) -> list[CourseAPI]: - # TODO: check that year/term exist +async def semester( + *, + session: Session = Depends(get_session), + year: int, + term: int +) -> list[CourseMaxAPIOnlyTransfers]: - api_response = [] - - # get all courses for the given semester - with Session(controller.engine) as session: - statement = select(SectionDB.subject, SectionDB.course_code).where(SectionDB.year == year, 
SectionDB.term == term).distinct() - results = session.exec(statement) - courses = results.all() + # TODO: Move this to a link table instead of calculating it on the fly + + statement = select(SectionDB.subject, SectionDB.course_code).where(SectionDB.year == year, SectionDB.term == term).distinct() + results = session.exec(statement) + courses = results.all() + + out = [] for c in courses: - api_response.append(controller.buildCourse(c[0], c[1], return_offerings=False)) + result = session.get(CourseMaxDB, f'CMAX-{c[0]}-{c[1]}') + assert result != None + out.append(result) - return api_response + return out @app.get( - "/semester/sections/{year}/{term}", + "/semester/{year}/{term}/sections", summary="Semester section data.", description="Returns all sections of a semester", response_model=list[SectionAPI] ) -async def semester(year:int, term:int) -> list[SectionAPI]: +async def semester( + *, + session: Session = Depends(get_session), + year: int, + term: int +) -> list[SectionAPI]: - with Session(controller.engine) as session: - - statement = select( - SectionDB - ).where(SectionDB.year == year, + + statement = select(SectionDB).where( + SectionDB.year == year, SectionDB.term == term - ).options(selectinload(SectionDB.schedule) - ).order_by(SectionDB.year.asc(), SectionDB.term.asc()) - - results = session.exec(statement).unique() - sections = results.all() - - return sections + ).options(selectinload(SectionDB.schedule) + ).order_by(SectionDB.year.asc(), SectionDB.term.asc()) + + results = session.exec(statement).unique() + sections = results.all() + + return sections @app.get( "/course/{subject}/{course_code}", summary="Course information.", description="Get all available information for a given course.", - response_model=CourseAPIExt, - + response_model=CourseMaxAPI, ) -async def semesterCoursesInfo(subject: str, course_code:int): +async def semesterCoursesInfo( + *, + session: Session = Depends(get_session), + subject: str, + course_code:int +): subject = 
subject.upper() - c = controller.buildCourse(subject, course_code, True) + result = session.get(CourseMaxDB, f"CMAX-{subject}-{course_code}") - if c == None: - raise HTTPException(status_code=404, detail="Course not found") + if result == None: + raise HTTPException(status_code=404, detail="Course not found.") - return c + return result @app.get( @@ -239,26 +278,31 @@ async def semesterCoursesInfo(subject: str, course_code:int): description="Get all available information for a given section.", response_model=SectionAPI ) -async def semesterSectionsInfo(year: int, term:int, crn: int): - with Session(controller.engine) as session: - statement = select(SectionDB).where(SectionDB.year == year, SectionDB.term == term, SectionDB.crn == crn) - results = session.exec(statement) - section = results.first() - - if section == None: - return 404 - - statement = select(ScheduleEntryDB).where(ScheduleEntryDB.year == year, ScheduleEntryDB.term == term, ScheduleEntryDB.crn == crn) - results = session.exec(statement) - schedules = results.all() - - out = section.model_dump() - out["schedule"] = [] - - for s in schedules: - out["schedule"].append(s.model_dump()) - - return out +async def semesterSectionsInfo( + *, + session: Session = Depends(get_session), + year: int, + term: int, + crn: int +): + statement = select(SectionDB).where(SectionDB.year == year, SectionDB.term == term, SectionDB.crn == crn) + results = session.exec(statement) + section = results.first() + + if section == None: + return 404 + + statement = select(ScheduleEntryDB).where(ScheduleEntryDB.year == year, ScheduleEntryDB.term == term, ScheduleEntryDB.crn == crn) + results = session.exec(statement) + schedules = results.all() + + out = section.model_dump() + out["schedule"] = [] + + for s in schedules: + out["schedule"].append(s.model_dump()) + + return out # my wares are too powerful for you, traveller @@ -266,25 +310,29 @@ async def semesterSectionsInfo(year: int, term:int, crn: int): "/export/all", 
summary="All information.", description="Get all available information. You probably shouldn't use this route...", - response_model=list[CourseAPIExt] + response_model=list[CourseMaxAPI] ) -async def allCourses(): +async def allCourses( + *, + session: Session = Depends(get_session), +): - with open(PREBUILTS_DIRECTORY + "allInfo.json", "r") as fi: - data = json.load(fi) + statement = select(CourseMaxDB) + results = session.exec(statement) + courses = results.all() - return data + return courses # Yes, this is not a secure method for passing an authentication token # This is extremely easy to call from firefox and it really shouldn't be called at all -@app.get( - "/admin/regenerateDatabase", - summary="Generate the database.", - description="Downloads new information and builds a database.", - # include_in_schema=False -) -async def genDB(API_KEY: str) -> None: - if API_KEY == os.getenv("API_KEY") or os.getenv("DEBUG") == True: - controller.buildDatabase() - else: - return False \ No newline at end of file +# @app.get( +# "/admin/regenerateDatabase", +# summary="Generate the database.", +# description="Downloads new information and builds a database.", +# # include_in_schema=False +# ) +# async def genDB(API_KEY: str) -> None: +# if API_KEY == os.getenv("API_KEY") or os.getenv("DEBUG") == True: +# controller.buildDatabase() +# else: +# return False \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index da0cb28..8edeb04 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,5 +13,6 @@ requests-cache beautifulsoup4 lxml +cchardet playwright \ No newline at end of file diff --git a/sdk/parsers/AttributesParser.py b/sdk/parsers/AttributesParser.py index 2477c08..cb7aeb7 100644 --- a/sdk/parsers/AttributesParser.py +++ b/sdk/parsers/AttributesParser.py @@ -1,14 +1,16 @@ # https://swing.langara.bc.ca/prod/hzgkcald.P_DisplayCatalog from bs4 import BeautifulSoup, element +import lxml +import cchardet -from sdk.schema.Attribute import 
AttributeDB +from sdk.schema.CourseAttribute import CourseAttributeDB ''' Parses the Langara Course attributes into json https://swing.langara.bc.ca/prod/hzgkcald.P_DispCrseAttr#A ''' -def parseAttributesHTML(html, year, term) -> list[AttributeDB]: +def parseAttributesHTML(html, year, term) -> list[CourseAttributeDB]: soup = BeautifulSoup(html, 'lxml') @@ -25,17 +27,20 @@ def parseAttributesHTML(html, year, term) -> list[AttributeDB]: elif table_items[i] == " " or table_items[i].isspace(): table_items[i] = False - attributes: list[AttributeDB] = [] + attributes: list[CourseAttributeDB] = [] i = 0 while i < len(table_items): - a = AttributeDB( - # ATR-year-term-subject-course_code - id=f"ATR-{year}-{term}-{table_items[i].split(' ')[0]}-{table_items[i].split(' ')[1]}", + subject = table_items[i].split(" ")[0] + course_code = table_items[i].split(" ")[1] + + a = CourseAttributeDB( + + # ATRB-subj-code-year-term + # ATRB-ENGL-1123-2024-30 + id=f"ATRB-{subject}-{course_code}-{year}-{term}", - subject = table_items[i].split(" ")[0], - course_code = table_items[i].split(" ")[1], attr_ar=table_items[i+1], attr_sc=table_items[i+2], attr_hum=table_items[i+3], @@ -43,8 +48,11 @@ def parseAttributesHTML(html, year, term) -> list[AttributeDB]: attr_sci=table_items[i+5], attr_soc=table_items[i+6], attr_ut=table_items[i+7], + + subject = subject, + course_code = course_code, year=year, - term=term + term=term, ) attributes.append(a) diff --git a/sdk/parsers/CatalogueParser.py b/sdk/parsers/CatalogueParser.py index 36785f5..1323d21 100644 --- a/sdk/parsers/CatalogueParser.py +++ b/sdk/parsers/CatalogueParser.py @@ -1,4 +1,6 @@ from bs4 import BeautifulSoup, element +import lxml +import cchardet from sdk.schema.CourseSummary import CourseSummaryDB @@ -40,23 +42,34 @@ def __parseCatalogueHTML(html, year, term) -> list[CourseSummaryDB]: description = e.text.strip() break + # print(h2) + h2 = h2.split() # h2 = ['ABST', '1100', '(3', 'credits)', '(3:0:0)'] hours = h2[4].replace("(", 
"").replace(")", "").split(":") + subject = h2[0] + course_code = int(h2[1]) + c = CourseSummaryDB( - # CAT-year-term-subject-course_code - id=f"CAT-{year}-{term}-{h2[0]}-{int(h2[1])}", - subject=h2[0], - course_code=int(h2[1]), + + # CSMR-subj-code-year-term + # CSMR-ENGL-1123-2024-30 + id = f'CSMR-{subject}-{course_code}-{year}-{term}', + + title=title, + description=description, credits=float(h2[2].replace("(", "")), hours_lecture=float(hours[0]), hours_seminar=float(hours[1]), hours_lab=float(hours[2]), - title=title, - description=description, + + subject=subject, + course_code=course_code, year=year, - term=term + term=term, + id_course=f'CRSE-{subject}-{course_code}', + id_semester=f'SMTR-{year}-{term}', ) summaries.append(c) diff --git a/sdk/parsers/SemesterParser.py b/sdk/parsers/SemesterParser.py index 47e2dd9..73d6077 100644 --- a/sdk/parsers/SemesterParser.py +++ b/sdk/parsers/SemesterParser.py @@ -1,4 +1,6 @@ from bs4 import BeautifulSoup +import lxml +import cchardet import unicodedata import datetime @@ -10,16 +12,14 @@ """ Parses a page and returns all of the information contained therein. -Naturally there are a few caveats" +Naturally there are a few caveats: 1) If they ever change the course search interface, this will break horribly 2) For a few years, they had a course-code note that applied to all sections of a course. Instead of storing that properly, we simply append that note to the end of all sections of a course. 
""" # TODO: refactor this method to make it quicker -def parseSemesterHTML(html) -> tuple[list[SectionDB], list[ScheduleEntryDB]]: - courses_first_day = None - courses_last_day = None +def parseSemesterHTML(html:str) -> tuple[list[SectionDB], list[ScheduleEntryDB]]: # use BeautifulSoup to change html to Python friendly format soup = BeautifulSoup(html, 'lxml') @@ -107,29 +107,37 @@ def parseSemesterHTML(html) -> tuple[list[SectionDB], list[ScheduleEntryDB]]: rpt = formatProp(rawdata[i+11]) if rpt == "-": rpt = None + + subject = rawdata[i+5] + course_code = formatProp(rawdata[i+6]) + crn = formatProp(rawdata[i+4]) current_course = SectionDB( - # i hate sqlmodel with a burning passion - # SEC-year-term-crn - id = f"SEC-{year}-{term}-{formatProp(rawdata[i+4])}", + + # SECT-subj-code-year-term-crn + # SECT-ENGL-1123-2024-30-31005 + id = f"SECT-{subject}-{course_code}-{year}-{term}-{crn}", RP = formatProp(rawdata[i]), seats = formatProp(rawdata[i+1]), waitlist = formatProp(rawdata[i+2]), # skip the select column - crn = formatProp(rawdata[i+4]), - subject = rawdata[i+5], - course_code = formatProp(rawdata[i+6]), + crn = crn, section = rawdata[i+7], credits = formatProp(rawdata[i+8]), abbreviated_title = rawdata[i+9], add_fees = fee, rpt_limit = rpt, + notes = None, + + id_course=f'CRSE-{subject}-{course_code}', + id_semester=f'SMTR-{year}-{term}', + id_course_max=f'CMAX-{subject}-{course_code}', - notes = None, - # schedule = [], + subject = subject, + course_code = course_code, year=year, - term=term + term=term, ) if sectionNotes != None: @@ -142,7 +150,7 @@ def parseSemesterHTML(html) -> tuple[list[SectionDB], list[ScheduleEntryDB]]: sections.append(current_course) i += 12 - section_count = 0 + schedule_count = 0 while True: @@ -151,27 +159,34 @@ def parseSemesterHTML(html) -> tuple[list[SectionDB], list[ScheduleEntryDB]]: raise Exception(f"Parsing error: unexpected course type found: {rawdata[i]}. 
{current_course} in course {current_course.toJSON()}") c = ScheduleEntryDB( - section_id = current_course.id, - # SCH-year-term-crn-section_# - id = f"SCH-{year}-{term}-{current_course.crn}-{section_count}", + # SCHD-subj-code-year-term-crn-section_number + # SCHD-ENGL-1123-2024-30-31005-1 + id = f'SCHD-{subject}-{course_code}-{year}-{term}-{crn}-{schedule_count}', + subject = subject, + course_code= course_code, year = year, term = term, + crn = current_course.crn, type = rawdata[i], days = rawdata[i+1], time = rawdata[i+2], - start = formatDate(rawdata[i+3]), - end = formatDate(rawdata[i+4]), + start = formatDate(rawdata[i+3], year), + end = formatDate(rawdata[i+4], year), room = rawdata[i+5], instructor = rawdata[i+6], + + id_course=f'CRSE-{subject}-{course_code}', + id_semester=f'SMTR-{year}-{term}', + id_section=f'SECT-{subject}-{course_code}-{year}-{term}-{crn}' ) - section_count += 1 + schedule_count += 1 if c.start.isspace(): - c.start = courses_first_day + c.start = None if c.end.isspace(): - c.end = courses_last_day + c.end = None schedules.append(c) i += 7 @@ -222,7 +237,7 @@ def formatProp(s:str) -> str | int | float: # converts date from "11-Apr-23" to "2023-04-11" (ISO 8601) -def formatDate(date:str) -> datetime.date: +def formatDate(date:str, year:int) -> datetime.date: if date == None: return None @@ -236,6 +251,10 @@ def formatDate(date:str) -> datetime.date: if month <= 9: month = "0" + str(month) - out = f"20{date[2]}-{month}-{date[0]}" + # oh no, this will break when 2100 comes around! 
+ if year <= 1999: + out = f"19{date[2]}-{month}-{date[0]}" + else: + out = f"20{date[2]}-{month}-{date[0]}" return out \ No newline at end of file diff --git a/sdk/parsers/TransferParser.py b/sdk/parsers/TransferParser.py deleted file mode 100644 index 8752e3c..0000000 --- a/sdk/parsers/TransferParser.py +++ /dev/null @@ -1,145 +0,0 @@ -import pdfquery -import os - -from schema.Transfer import Transfer - -class TransferParser: - - - # TODO: use PyMuPDF to speed this up - def parseTransferPDFs() -> list[Transfer]: - pdfs = os.listdir("downloads/") - - assert len(pdfs) > 0, f"No PDFs to parse in {dir}." - - transfers: list[Transfer] = [] - - for p in pdfs: - - pdf = pdfquery.PDFQuery("downloads/" + p) - pdf.load() - - # save xml - #pdf.tree.write("pain.xml", pretty_print=True) - - pyquery = pdf.pq("LTTextBoxHorizontal") - - stuff:list[str] = [] - - for i in pyquery.contents(): - - # for some reason some elements become lxml.etree._ElementUnicodeResult - # and others become pdfquery.pdfquery.LayoutElement - # ??? - # TODO: make this not terrible - - - try: - stuff.append(i.text.strip()) - except: - try: - stuff.append(str(i).strip()) - except: - print(f"Could not save {i} {type(i)}") - - # don't save empty ones (idk why there are empty ones) - # WHY DOESNT THIS WORK - if stuff[-1].isspace(): - stuff.pop(-1) - - while "" in stuff: - stuff.remove("") - - ''' - Remove the following: - Course Search Result from "Course Loads" - 217 agreements found for 15 courses at 17 institutions - Generated Apr 9, 2023 - 1 of 23 - From - To - Transfer Credit - Effective Date - ''' - - ''' - Parsing something like this: - LANG ABST 1100 - (there may or may not be a 1 or 2 line description here) - Credits: 3 - Langara College (BC) - CAPU - CAPU HIST 209 (3) - May/03 to - present (sometimes present is on the same line as above) - ''' - print(f"Parsed {p} - {stuff[1]}.") - #print(stuff[0:50]) - - # sometimes the 1 of 23 pagecount doesn't show up???? 
- if "of" in stuff[3]: - stuff = stuff[8:] - else: - stuff = stuff[7:] - - i = 0 - while i < len(stuff): - - title = stuff[i].split(" ") - i += 1 - - while "Credits:" not in stuff[i]: - description = stuff[i] - i += 1 - - # we don't need the # of credits - # credit = float(stuff[i].split(":")[-1]) - i += 1 - - i += 1 # skip Langara College (BC) - - dest = stuff[i] - i += 1 - - - #print("Getting transfer info:") - #print(stuff[i]) - transfer = stuff[i] - i += 1 - - while stuff[i][6:9] != " to" or (not stuff[i][4:6].isnumeric() and not stuff[i][3] == "/"): - #print(stuff[i]) - - transfer += " " + stuff[i] - i += 1 - - validity = stuff[i].split("to") - start = validity[0].strip() - i += 1 - - - if len(validity) == 2 and validity[1] != "": - end = validity[1].strip() - else: - # if there is a second line - end = stuff[i].strip() - i += 1 - - - transfers.append(Transfer( - subject = title[1], - course_code = title[2], - source=title[0], - destination=dest, - credit=transfer, - effective_start=start, - effective_end=end, - )) - - # why is 8 of 23 here??? what about 1-7 of 23??? - # i don't know why only some of the page numbers show up :sob: - while i < len(stuff) and " of " in stuff[i]: - i += 1 - - return transfers - diff --git a/sdk/schema/Attribute.py b/sdk/schema/Attribute.py deleted file mode 100644 index 0a651b9..0000000 --- a/sdk/schema/Attribute.py +++ /dev/null @@ -1,21 +0,0 @@ -from enum import Enum -from sqlmodel import Field, SQLModel - - -class Attribute(SQLModel): - subject: str = Field(description="Subject area e.g. ```CPSC```.") - course_code: int = Field(description="Course code e.g. 
```1050```.") - - attr_ar: bool =Field(default=False, description="Second year arts course.") - attr_sc: bool =Field(default=False, description="Second year science course.") - attr_hum: bool =Field(default=False, description="Humanities course.") - attr_lsc: bool =Field(default=False, description="Lab science course.") - attr_sci: bool =Field(default=False, description="Science course.") - attr_soc: bool =Field(default=False, description="SOC course.") - attr_ut: bool =Field(default=False, description="University transferrable course.") - - -class AttributeDB(Attribute, table=True): - id: str = Field(primary_key=True, description="Unique identifier for each Attribute.") - year: int = Field(description='Year e.g. ```2024```.') - term: int = Field(description='Term e.g. ```30```') \ No newline at end of file diff --git a/sdk/schema/BaseModels.py b/sdk/schema/BaseModels.py new file mode 100644 index 0000000..7a3c1c6 --- /dev/null +++ b/sdk/schema/BaseModels.py @@ -0,0 +1,37 @@ +from requests_cache import Optional +from sqlmodel import Field, Relationship, SQLModel + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from sdk.schema.CourseAttribute import CourseAttributeDB + from sdk.schema.CourseOutline import CourseOutlineDB + from sdk.schema.CoursePage import CoursePage + from sdk.schema.CourseSummary import CourseSummaryDB + from sdk.schema.Section import SectionDB + from sdk.schema.Transfer import TransferDB + + +class Course(SQLModel, table=True): + id: str = Field(primary_key=True, description="Internal primary key (e.g. CRSE-ENGL-1123).") + subject: str = Field(description="Subject area e.g. ```CPSC```.") + course_code: int = Field(description="Course code e.g. 
```1050```.") + + # attributes: list["CourseAttributeDB"] = Relationship(back_populates="course") + # outlines: list["CourseOutlineDB"] = Relationship(back_populates="course") + # page: "CoursePage" = Relationship(back_populates="course") + # summaries: list["CourseSummaryDB"] = Relationship(back_populates="course") + # sections: list["SectionDB"] = Relationship(back_populates="course") + + # transfers: list["TransferDB"] = Relationship(back_populates="course") + +class Semester(SQLModel, table=True): + id: str = Field(primary_key=True, description="Internal primary key (e.g. SMTR-2024-30).") + + year: int = Field(description='Year e.g. ```2024```.') + term: int = Field(description='Term e.g. ```30```') + + courses_first_day: Optional[str] = Field(default=None, description="First day of normal classes.") + courses_last_day: Optional[str] = Field(default=None, description="Last day of normal classes.") + + # sections: list["SectionDB"] = Relationship() \ No newline at end of file diff --git a/sdk/schema/CourseAttribute.py b/sdk/schema/CourseAttribute.py new file mode 100644 index 0000000..70b0a6d --- /dev/null +++ b/sdk/schema/CourseAttribute.py @@ -0,0 +1,37 @@ +from sqlmodel import Field, Relationship, SQLModel + +from sdk.schema.BaseModels import Course + +""" +Stores the attributes of all courses +Data is also available by term so we store that as well + +Source: https://langara.ca/programs-and-courses/courses/course-attributes.html +""" +class CourseAttribute(SQLModel): + attr_ar: bool = Field(default=False, description="Meets second-year arts requirement (2AR).") + attr_sc: bool = Field(default=False, description="Meets second-year science requirement (2SC).") + attr_hum: bool = Field(default=False, description="Meets humanities requirement (HUM).") + attr_lsc: bool = Field(default=False, description="Meets lab-science requirement (LSC).") + attr_sci: bool = Field(default=False, description="Meets science requirement (SCI).") + attr_soc: bool = 
Field(default=False, description="Meets social science requirement (SOC).") + attr_ut: bool = Field(default=False, description='Meets "university-transferable" requirements. Course transfers to at least one of UBC, UBCO, SFU, UVIC, and UNBC (UT).') + + +class CourseAttributeDB(CourseAttribute, table=True): + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. ATRB-ENGL-1123-2024-30).") + + # 1:many relationship with course + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + year: int = Field(index=True, foreign_key="semester.year") + term: int = Field(index=True, foreign_key="semester.term") + + # id_course: str = Field(index=True, foreign_key="course.id") + # id_semester: str = Field(index=True, foreign_key="semester.id") + + course: Course = Relationship( + sa_relationship_kwargs={"primaryjoin": "CourseAttributeDB.subject==Course.subject and CourseAttributeDB.course_code==Course.course_code", "lazy": "joined"} + ) + + \ No newline at end of file diff --git a/sdk/schema/CourseOutline.py b/sdk/schema/CourseOutline.py new file mode 100644 index 0000000..2fc9e2c --- /dev/null +++ b/sdk/schema/CourseOutline.py @@ -0,0 +1,25 @@ +from sqlmodel import Field, Relationship, SQLModel + +from sdk.schema.BaseModels import Course + +""" +The course outlines which are available on the Langara course pages + +Source: https://langara.ca/programs-and-courses/courses/ENGL/1123.html +""" +class CourseOutline(SQLModel): + url: str = Field(description="URL to the pdf of the course outline.") + url_name: str = Field(description="Text that links to the course outline e.g. `CPSC 1150 - Summer 2021 (v. 1)`.") + +class CourseOutlineDB(CourseOutline, table=True): + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. 
OUTL-ENGL-1123-1).") + + # 1:many relationship with course + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + + id_course: str = Field(index=True, foreign_key="course.id") + + course: Course = Relationship( + sa_relationship_kwargs={"primaryjoin": "CourseOutlineDB.id_course==Course.id", "lazy": "joined"} + ) \ No newline at end of file diff --git a/sdk/schema/CoursePage.py b/sdk/schema/CoursePage.py new file mode 100644 index 0000000..f31e235 --- /dev/null +++ b/sdk/schema/CoursePage.py @@ -0,0 +1,36 @@ +from typing import Optional +from sqlmodel import Field, Relationship, SQLModel + +from sdk.schema.BaseModels import Course + +""" +All the data contained in the course page on the langara website +(except for the course outlines which get their own table) + +Source: https://langara.ca/programs-and-courses/courses/ENGL/1123.html +""" +class CoursePage(SQLModel): + title: str = Field(description="*Unabbreviated* title of the course e.g. 
```Intro to Computer Science```.") + description: Optional[str] = Field(description="Description of course.") + + credits: float = Field(description="Credits of the course.") + hours_lecture: float = Field(description="Lecture hours of the course.") + hours_seminar: float = Field(description="Seminar hours of the course.") + hours_lab: float = Field(description="Lab hours of the course.") + + description: str = Field(description="Summary of the course.") + desc_duplicate_credit: Optional[str] = Field(description="If the credits for this course exclude credits from another course.") + desc_registration_restriction: Optional[str] = Field(description="If a course is restricted or has priority registration it will say here.") + desc_prerequisite: Optional[str] = Field(description="Prerequisites of the course are stated here.") + +class CoursePageDB(CoursePage, table=True): + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. CPGE-ENGL-1123).") + # 1:1 relationship with course + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + + id_course: str = Field(index=True, foreign_key="course.id") + + course: Course = Relationship( + sa_relationship_kwargs={"primaryjoin": "CoursePageDB.subject==Course.subject", "lazy": "joined"} + ) \ No newline at end of file diff --git a/sdk/schema/CourseSummary.py b/sdk/schema/CourseSummary.py index d0ad1fb..4574f0c 100644 --- a/sdk/schema/CourseSummary.py +++ b/sdk/schema/CourseSummary.py @@ -1,23 +1,38 @@ -from enum import Enum from typing import Optional -from sqlmodel import Field, SQLModel +from sqlmodel import Field, Relationship, SQLModel +from sdk.schema.BaseModels import Course +""" +Stores information taken from the course catalogue +Frankly I'm not sure if this page is even supposed to be public +Data is also available by semester +But pages before 2011 are in a different format + +Source: 
https://swing.langara.bc.ca/prod/hzgkcald.P_DisplayCatalog +""" class CourseSummary(SQLModel): - subject: str = Field(description="Subject area e.g. ```CPSC```.") - course_code: int = Field(description="Course code e.g. ```1050```.") - - credits: float = Field(description="Credits the course is worth.") - title: str = Field(description="*Unabbreviated* title of the course e.g. ```Intro to Computer Science```.") description: Optional[str] = Field(description="Description of course.") - hours_lecture: float = Field(default=False, description="Lecture hours of the course.") - hours_seminar: float = Field(default=False, description="Lecture hours of the course.") - hours_lab: float = Field(default=False, description="Lecture hours of the course.") + credits: float = Field(description="Credits of the course.") + hours_lecture: float = Field(default=False, description="Lecture hours of the course.") + hours_seminar: float = Field(default=False, description="Seminar hours of the course.") + hours_lab: float = Field(default=False, description="Lab hours of the course.") class CourseSummaryDB(CourseSummary, table=True): - id: str = Field(primary_key=True, description="Unique identifier for each CourseSummary.") - year: int = Field(description='Year e.g. ```2024```.') - term: int = Field(description='Term e.g. ```30```') \ No newline at end of file + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. 
`CSMR-ENGL-1123-2024-30`).") + + # 1:many relationship with course + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + year: int = Field(index=True, foreign_key="semester.year") + term: int = Field(index=True, foreign_key="semester.term") + + id_course: str = Field(index=True, foreign_key="course.id") + id_semester: str = Field(index=True, foreign_key="semester.id") + + course: Course = Relationship( + sa_relationship_kwargs={"primaryjoin": "CourseSummaryDB.id_course == Course.id", "lazy": "joined"} + ) \ No newline at end of file diff --git a/sdk/schema/ScheduleEntry.py b/sdk/schema/ScheduleEntry.py index 00b3592..c6727bd 100644 --- a/sdk/schema/ScheduleEntry.py +++ b/sdk/schema/ScheduleEntry.py @@ -1,10 +1,8 @@ - from enum import Enum - from typing import Optional, TYPE_CHECKING from sqlmodel import Field, Relationship, SQLModel -# from sdk.schema.Section import SectionDB +from sdk.schema.BaseModels import Course if TYPE_CHECKING: from sdk.schema.Section import SectionDB @@ -52,14 +50,29 @@ class Config: } class ScheduleEntryDB(ScheduleEntry, table=True): - id: str = Field(primary_key=True, description="Unique identifier for a ScheduleEntry.") + # 1:many relationship with course + # 1:many relationship with section + # 1:many relationship with semester + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. 
SCHD-ENGL-1123-2024-30-31005-1).") + crn: int = Field(index=True) # foreign key commented out here to not conflict with id_section + + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + year: int = Field(index=True, foreign_key="semester.year") + term: int = Field(index=True, foreign_key="semester.term") + + id_course: str = Field(index=True, foreign_key="course.id") + id_semester: str = Field(index=True, foreign_key="semester.id") + id_section: str = Field(index=True, foreign_key="sectiondb.id") + + + # course: Course = Relationship( + # sa_relationship_kwargs={"primaryjoin": "ScheduleEntryDB.id_course == Course.id", "lazy": "joined"} + # ) - section_id: Optional[str] = Field(default=None, index=True, foreign_key="sectiondb.id") section: Optional["SectionDB"] = Relationship(back_populates="schedule") - year: int = Field(index=True) - term: int = Field(index=True) - crn: int = Field(index=True) + class ScheduleEntryAPI(ScheduleEntry): id: str \ No newline at end of file diff --git a/sdk/schema/Section.py b/sdk/schema/Section.py index 49ae989..0cf1c3b 100644 --- a/sdk/schema/Section.py +++ b/sdk/schema/Section.py @@ -1,15 +1,13 @@ from enum import Enum -from typing import List, Optional, Union, TYPE_CHECKING +from typing import List, Optional, TYPE_CHECKING from sqlmodel import Field, Relationship, SQLModel +from sdk.schema.BaseModels import Course from sdk.schema.ScheduleEntry import ScheduleEntryAPI - -# from sdk.schema.ScheduleEntry import ScheduleEntryDB - if TYPE_CHECKING: from sdk.schema.ScheduleEntry import ScheduleEntryDB - from sdk.schema_built.Course import CourseAPIBuild + from sdk.schema_built.CourseMax import CourseAPIBuild @@ -19,13 +17,14 @@ class RPEnum(Enum): RP = "RP" class SectionBase(SQLModel): - crn: int = Field(index=True, description="Always 5 digits long.") + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. 
SECT-ENGL-1123-2024-30-31005).") + crn: int = Field(index=True, description="Always 5 digits long.") RP : Optional["RPEnum"] = Field(default=None, description='Prerequisites of the course.') seats: Optional[str] = Field(default=None, description='```"Inact"``` means registration isn\'t open yet. \n\n```"Cancel"``` means that the course is cancelled.') waitlist: Optional[str] = Field(default=None, description='```null``` means that the course has no waitlist (ie MATH 1183 & MATH 1283). \n\n```"N/A"``` means the course does not have a waitlist.') - subject: str = Field(default=None, index=True, description="Subject area e.g. ```CPSC```.") - course_code: int = Field(default=None, index=True, description="Course code e.g. ```1050```.") + # subject: str = Field(default=None, index=True, description="Subject area e.g. ```CPSC```.") + # course_code: int = Field(default=None, index=True, description="Course code e.g. ```1050```.") section: Optional[str] = Field(default=None, description="Section e.g. ```001```, ```W01```, ```M01```.") credits: float = Field(default=0, description="Credits the course is worth.") abbreviated_title: Optional[str]= Field(default=None, description="Abbreviated title of the course e.g. 
```Algrthms & Data Strctrs I```.") @@ -35,16 +34,31 @@ class SectionBase(SQLModel): class SectionDB(SectionBase, table=True): - id:str = Field(primary_key=True, description="Unique identifier for a section.") - year: int = Field(index=True) - term: int = Field(index=True) + # 1:many relationship with course + # 1:many relationship with semester + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + year: int = Field(index=True, foreign_key="semester.year") + term: int = Field(index=True, foreign_key="semester.term") + + id_course: str = Field(index=True, foreign_key="course.id") + id_semester: str = Field(index=True, foreign_key="semester.id") + id_course_max : str = Field(index=True, foreign_key="coursemaxdb.id") + + # course: Course = Relationship( + # sa_relationship_kwargs={"primaryjoin": "SectionDB.id_course==Course.id", "lazy": "joined"} + # ) schedule: List["ScheduleEntryDB"] = Relationship(back_populates="section") -class SectionAPI(SectionBase): +class SectionAPI(SectionBase): + subject: str + course_code: int + year: int + term: int + schedule: List["ScheduleEntryAPI"] = [] - id: str = Field() # course_id: Optional[str] = Field(default=None, foreign_key="sectiondb.id") # course: Optional["CourseAPIExt"] = Relationship(back_populates="schedule") \ No newline at end of file diff --git a/sdk/schema/Transfer.py b/sdk/schema/Transfer.py index 538c53e..28aee72 100644 --- a/sdk/schema/Transfer.py +++ b/sdk/schema/Transfer.py @@ -1,16 +1,26 @@ from requests_cache import Optional -from sqlmodel import Field, SQLModel +from sqlmodel import Field, Relationship, SQLModel +from sdk.schema.BaseModels import Course -class Transfer(SQLModel): - subject: str = Field(index=True, description="Subject area e.g. ```CPSC```.") - course_code: int = Field(index=True, description="Course code e.g. ```1050```.") - source: str = Field(description="Source institution e.g. 
````LANG```.") - destination: str = Field(description="Destination instituation e.g. ```SFU```.") - credit: str = Field(description="How many credits at the destination.") - condition: Optional[str] = Field() - effective_start: str = Field(description="When this transfer agreement began.") - effective_end: Optional[str] = Field(description="When the transfer agreement ended.") + +class Transfer(SQLModel): + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. TNFR-ENGL-1123-UBCV-309967).") + + transfer_guide_id: int = Field(index=True, description="Internal id that BCTransferGuide uses for transfer agreements") + + source: str = Field(description="Source institution code e.g. ````LANG```.") + source_credits: Optional[float] = Field(description="Credits at the source institution.") + source_title : Optional[str] = Field(description="Course title at the source institution.") + + destination: str = Field(description="Destination institution code e.g. ```SFU```.") + destination_name: str = Field(description="Destination institution full name e.g. 
```Simon Fraser University```.") + + credit: str = Field(description="How many credits is the course worth at the source institution.") + condition: Optional[str] = Field(description="Additional conditions that apply to the credit transfer.") + + effective_start: str = Field(description="When this transfer agreement began.") + effective_end: Optional[str] = Field(description="When the transfer agreement ended.") class Config: @@ -22,7 +32,7 @@ class Config: "destination": "ALEX", "credit": "ALEX CPSC 1XX (3)", "effective_start": "Sep/15", - "effective_end": "present" + "effective_end": None }, "example2": { "subject": "CPSC", @@ -31,15 +41,25 @@ class Config: "destination": "AU", "credit": "AU COMP 2XX (3)", "effective_start": "May/15", - "effective_end": "present" + "effective_end": None } } class TransferDB(Transfer, table=True): - id: str = Field(primary_key=True, description="Unique identifier for each transfer.") - course_id:str = Field(primary_key=True, description="Unique identifier for each Course.") + # 1:many relationship with course + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + + id_course: str = Field(index=True, foreign_key="course.id") + id_course_max : str = Field(index=True, foreign_key="coursemaxdb.id") + # course: Course = Relationship( + # sa_relationship_kwargs={"primaryjoin": "TransferDB.subject==Course.subject and TransferDB.course_code==Course.course_code", "lazy": "joined"} + # ) + + class TransferAPI(Transfer): - id: str + subject: str + course_code: int \ No newline at end of file diff --git a/sdk/schema_built/ApiResponses.py b/sdk/schema_built/ApiResponses.py index 62e9e90..ee6c10c 100644 --- a/sdk/schema_built/ApiResponses.py +++ b/sdk/schema_built/ApiResponses.py @@ -1,24 +1,11 @@ from sqlmodel import SQLModel +from sdk.schema.BaseModels import Semester -class IndexSemester(SQLModel): - year:int - term:int - - model_config = { - 
"json_schema_extra": { - "examples": [ - { - "year": 2023, - "term": 20 - } - ] - } - } class IndexSemesterList(SQLModel): count: int - semesters: list[IndexSemester] + semesters: list[Semester] model_config = { "json_schema_extra": { diff --git a/sdk/schema_built/Course.py b/sdk/schema_built/Course.py deleted file mode 100644 index e5151f5..0000000 --- a/sdk/schema_built/Course.py +++ /dev/null @@ -1,156 +0,0 @@ -from enum import Enum - -from typing import Optional -from sqlmodel import Field, SQLModel, Relationship - -from sdk.schema.Section import RPEnum, SectionAPI, SectionDB -from sdk.schema.Transfer import Transfer, TransferAPI, TransferDB - - - -class availabilitiesEnum(Enum): - spring = "Spring" - summer = "Summer" - fall = "Fall" - springsummer = "Spring & Summer" - springfall = "Spring & Fall" - summerfall = "Summer & Fall" - all = "All Semesters" - unknown = "Unknown" - discontinued = "Discontinued" - -class PrereqEnum(Enum): - ALL_OF = "ALL OF" - ONE_OF = "ONE OF" - COREQ = "COREQ" - REQ = "REQ" - -# probably needs its own class once implemented -class Prerequisite(SQLModel): - type : PrereqEnum - course : str - grade : Optional[str] - - -# TODO: fill in all attributes from all possible sources - - - - -class CourseBase(SQLModel): - id:str = Field(primary_key=True, description="Unique identifier for each Course.") - - # GENERAL INFO - subject: str = Field(index=True, description="Subject area e.g. ```CPSC```.") - course_code: int = Field(index=True, description="Course code e.g. ```1050```.") - - # FROM CourseSummary.py - credits: Optional[float] = Field(default=None, description="Credits the course is worth.") - - title: str = Field(default="", description="*Unabbreviated* title of the course e.g. 
```Intro to Computer Science```.") - description: Optional[str] = Field(default=None, description="Description of course.") - - hours_lecture: Optional[float] = Field(default=None, description="Lecture hours of the course.") - hours_seminar: Optional[float] = Field(default=None, description="Lecture hours of the course.") - hours_lab: Optional[float] = Field(default=None, description="Lecture hours of the course.") - - # TODO: Not implemented (needs another scraper ._.) - # course_outline_url: Optional[str] = Field(default=None, description="Link to course outline (if available).") - - # Generated from Section.py (uses the most recent section) - RP : Optional[RPEnum] = Field(default=None, description='Prerequisites of the course.') - abbreviated_title: Optional[str] = Field(default=None, description="Abbreviated title of the course e.g. ```Algrthms & Data Strctrs I```.") - add_fees: Optional[float] = Field(default=None, description="Additional fees (in dollars).") - rpt_limit: Optional[int] = Field(default=None, description="Repeat limit. 
There may be other repeat limits not listed here you should keep in mind.") - - # FROM Attribute.py - attr_ar: Optional[bool] =Field(default=None, description="Second year arts course.") - attr_sc: Optional[bool] =Field(default=None, description="Second year science course.") - attr_hum: Optional[bool] =Field(default=None, description="Humanities course.") - attr_lsc: Optional[bool] =Field(default=None, description="Lab science course.") - attr_sci: Optional[bool] =Field(default=None, description="Science course.") - attr_soc: Optional[bool] =Field(default=None, description="SOC course.") - attr_ut: Optional[bool] =Field(default=None, description="University transferrable course.") - - # Derived from Section.py (uses aggregate data from all sections) - # average_seats: Optional[float] = Field(default=None) - # average_waitlist: Optional[float] = Field(default=None) - # maximum_seats: Optional[int] = Field(default=None) - - last_offered_year: Optional[int] = Field(default=None, description="The last year the course was offered e.g. ```2023```.") - last_offered_term: Optional[int] = Field(default=None, description="The last term the course was offered e.g. ```10```.") - first_offered_year: Optional[int] = Field(default=None, description="The first year the course was offered e.g. ```2013```.") - first_offered_term: Optional[int] = Field(default=None, description="The first term the course was offered e.g. ```30```.") - - # Derived from multiple sources - # availability: availabilitiesEnum = Field(default=None, description="(NOT IMPLEMENTED) Availability of course. Extracted automatically - may not be correct. 
Consult Langara advisors if in doubt.") - # prerequisites: Optional[list[Prerequisite]] = Field(default=[], description="(NOT IMPLEMENTED) Prerequisites for the course.") - - # restriction: Optional[str] = Field(default=None, description="(NOT IMPLEMENTED) Program you must be in to register for this course.") - - # THE MOST IMPORTANT PART - -class CourseDB(CourseBase, table=True): - id:str = Field(primary_key=True, description="Unique identifier for each Course.") - - # this only changes when we run the course search, so we should - # prefill the data instead of running a query live - latest_course_summary_id: Optional[str] = Field(foreign_key="coursesummarydb.id") - latest_section_id: Optional[str] = Field(foreign_key="sectiondb.id") - latest_attribute_id: Optional[str] = Field(foreign_key="attributedb.id") - - - -class CourseAPIBuild(CourseBase): - - # all of these will be removed once the course is returned - year: int = Field(default=0) - term: int = Field(default=0) - - - offerings: list[SectionAPI] = Field(default=[], description="All past offerings of the course") - transfers: list[TransferDB] = Field(default=[], description="Information on how the course transfers.") - -class CourseAPIExt(CourseBase): - offerings: list[SectionAPI] = Field(default=[], description="All past offerings of the course") - transfers: list[TransferAPI] = Field(default=[], description="Information on how the course transfers.") - -class CourseAPI(CourseBase): - # offerings: list[SectionAPI] = Field(default=[], description="All past offerings of the course") - transfers: list[TransferAPI] = Field(default=[], description="Information on how the course transfers.") - - - - # class Config: - # json_schema_extra = { - # "example": { - # "RP" : None, - # "subject" : "CPSC", - # "course_code" : 1050, - # "credits" : 3.0, - # "title": "Introduction to Computer Science", - # "description" : "Offers a broad overview of the computer science discipline. 
Provides students with an appreciation for and an understanding of the many different aspects of the discipline. Topics include information and data representation; introduction to computer hardware and programming; networks; applications (e.g., spreadsheet, database); social networking; ethics; and history. Intended for both students expecting to continue in computer science as well as for those taking it for general interest.", - # "hours": { - # "lecture": 4, - # "seminar": 0, - # "lab": 2 - # }, - # "add_fees" : 34., - # "rpt_limit" : 2, - # # TODO: fix attributes - # # "attributes" : { - # # "2AR" : False, - # # "2SC" : False, - # # "HUM" : False, - # # "LSC" : False, - # # "SCI" : True, - # # "SOC" : False, - # # "UT" : True, - # # }, - # "transfer" : [ - # Transfer.Config.json_schema_extra["example1"], - # Transfer.Config.json_schema_extra["example2"] - # ], - # } - # } - \ No newline at end of file diff --git a/sdk/schema_built/CourseMax.py b/sdk/schema_built/CourseMax.py new file mode 100644 index 0000000..849d2d6 --- /dev/null +++ b/sdk/schema_built/CourseMax.py @@ -0,0 +1,183 @@ +from enum import Enum + +from typing import Optional +from sqlmodel import Field, SQLModel, Relationship + +from sqlalchemy.orm import RelationshipProperty + +from sdk.schema.BaseModels import Course +from sdk.schema.CourseOutline import CourseOutline +from sdk.schema.Section import RPEnum, SectionAPI, SectionDB +from sdk.schema.Transfer import Transfer, TransferAPI, TransferDB + + + +class availabilitiesEnum(Enum): + spring = "Spring" + summer = "Summer" + fall = "Fall" + springsummer = "Spring & Summer" + springfall = "Spring & Fall" + summerfall = "Summer & Fall" + all = "All Semesters" + unknown = "Unknown" + discontinued = "Discontinued" + +class PrereqEnum(Enum): + ALL_OF = "ALL OF" + ONE_OF = "ONE OF" + COREQ = "COREQ" + REQ = "REQ" + +# probably needs its own class once implemented +class Prerequisite(SQLModel): + type : PrereqEnum + course : str + grade : Optional[str] 
+ + +# TODO: fill in all attributes from all possible sources + + + + +class CourseMax(SQLModel): + + subject: str = Field(index=True, foreign_key="course.subject") + course_code: int = Field(index=True, foreign_key="course.course_code") + + + # FROM CourseSummary.py + credits: Optional[float] = Field(default=None, description="Credits that the course is worth.") + + title: Optional[str] = Field(default=None, description="*Unabbreviated* title of the course e.g. ```Intro to Computer Science```.") + + description: Optional[str] = Field(description="Summary of the course.") + desc_duplicate_credit: Optional[str] = Field(description="If the credits for this course exclude credits from another course.") + desc_registration_restriction: Optional[str] = Field(description="If a course is restricted or has priority registration it will say here.") + desc_prerequisite: Optional[str] = Field(description="Prerequisites of the course are stated here.") + + hours_lecture: Optional[float] = Field(default=None, description="Lecture hours of the course.") + hours_seminar: Optional[float] = Field(default=None, description="Lecture hours of the course.") + hours_lab: Optional[float] = Field(default=None, description="Lecture hours of the course.") + + + # FROM Section.py (uses the most recent section) + RP : Optional[RPEnum] = Field(default=None, description='Prerequisites of the course.') + abbreviated_title: Optional[str] = Field(default=None, description="Abbreviated title of the course e.g. ```Algrthms & Data Strctrs I```.") + add_fees: Optional[float] = Field(default=None, description="Additional fees (in dollars).") + rpt_limit: Optional[int] = Field(default=None, description="Repeat limit. 
There may be other repeat limits not listed here you should keep in mind.") + + # FROM Attribute.py + attr_ar: Optional[bool] =Field(default=None, description="Second year arts course.") + attr_sc: Optional[bool] =Field(default=None, description="Second year science course.") + attr_hum: Optional[bool] =Field(default=None, description="Humanities course.") + attr_lsc: Optional[bool] =Field(default=None, description="Lab science course.") + attr_sci: Optional[bool] =Field(default=None, description="Science course.") + attr_soc: Optional[bool] =Field(default=None, description="SOC course.") + attr_ut: Optional[bool] =Field(default=None, description="University transferrable course.") + + # Calculated from Section + first_offered_year: Optional[int] = Field(default=None, description="The first year the course was offered e.g. ```2013```.") + first_offered_term: Optional[int] = Field(default=None, description="The first term the course was offered e.g. ```30```.") + last_offered_year: Optional[int] = Field(default=None, description="The last year the course was offered e.g. ```2023```.") + last_offered_term: Optional[int] = Field(default=None, description="The last term the course was offered e.g. ```10```.") + + # Derived from multiple sources + availability: Optional[availabilitiesEnum] = Field(default=None, description="(NOT IMPLEMENTED) Availability of course. Extracted automatically - may not be correct. Consult Langara advisors if in doubt.") + + # Funny SQLModel relationships that ARE NOT database relationships + # course_outlines: list["CourseOutline"] = Relationship() # description="TODO: Course outlines for the course if available." + # transfers: list["TransferDB"] = Relationship() # description="All transfers for the course." 
+ # page: "CoursePage" = Relationship(back_populates="course") + # summaries: list["CourseSummaryDB"] = Relationship(back_populates="course") + # sections: list["SectionDB"] = Relationship(back_populates="course") + + + + +class CourseMaxDB(CourseMax, table=True): + id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. CMAX-ENGL-1123).") + + transfers: list["TransferDB"] = Relationship() + + offerings: list["SectionDB"] = Relationship() + + # id_course: str = Field(index=True, foreign_key="course.id") + # course: Course = Relationship( + # sa_relationship_kwargs={"primaryjoin": "CourseMaxDB.subject==Course.subject and CourseMaxDB.course_code==Course.course_code", "lazy": "joined"} + # ) + +class CourseMaxAPI(CourseMax): + id: str + + transfers: list["TransferAPI"] = [] + offerings: list["SectionDB"] = [] + +class CourseMaxAPIOnlyTransfers(CourseMax): + id: str + + transfers: list["TransferAPI"] = [] + + + + +# class CourseBuiltDB(CourseBase, table=True): +# subject: str = Field(primary_key=True, foreign_key="course.subject") +# course_code: int = Field(primary_key=True, foreign_key="course.course_code") + + + +# class CourseAPIBuild(CourseBase): + +# # all of these will be removed once the course is returned +# year: int = Field(default=0) +# term: int = Field(default=0) + + +# offerings: list[SectionAPI] = Field(default=[], description="All past offerings of the course") +# transfers: list[TransferDB] = Field(default=[], description="Information on how the course transfers.") + +# class CourseAPIExt(CourseBase): +# offerings: list[SectionAPI] = Field(default=[], description="All past offerings of the course") +# transfers: list[TransferAPI] = Field(default=[], description="Information on how the course transfers.") + +# class CourseAPI(CourseBase): +# # offerings: list[SectionAPI] = Field(default=[], description="All past offerings of the course") +# transfers: list[TransferAPI] = Field(default=[], description="Information on how the 
course transfers.") + + + + # class Config: + # json_schema_extra = { + # "example": { + # "RP" : None, + # "subject" : "CPSC", + # "course_code" : 1050, + # "credits" : 3.0, + # "title": "Introduction to Computer Science", + # "description" : "Offers a broad overview of the computer science discipline. Provides students with an appreciation for and an understanding of the many different aspects of the discipline. Topics include information and data representation; introduction to computer hardware and programming; networks; applications (e.g., spreadsheet, database); social networking; ethics; and history. Intended for both students expecting to continue in computer science as well as for those taking it for general interest.", + # "hours": { + # "lecture": 4, + # "seminar": 0, + # "lab": 2 + # }, + # "add_fees" : 34., + # "rpt_limit" : 2, + # # TODO: fix attributes + # # "attributes" : { + # # "2AR" : False, + # # "2SC" : False, + # # "HUM" : False, + # # "LSC" : False, + # # "SCI" : True, + # # "SOC" : False, + # # "UT" : True, + # # }, + # "transfer" : [ + # Transfer.Config.json_schema_extra["example1"], + # Transfer.Config.json_schema_extra["example2"] + # ], + # } + # } + \ No newline at end of file diff --git a/sdk/schema_built/Semester.py b/sdk/schema_built/Semester.py deleted file mode 100644 index 3aa0a68..0000000 --- a/sdk/schema_built/Semester.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging - -from enum import Enum -from typing import Optional -from sqlmodel import Field, SQLModel - -from sdk.schema.Section import SectionAPI, SectionDB -from sdk.schema.ScheduleEntry import ScheduleEntryDB -from sdk.schema.Attribute import AttributeDB -from sdk.schema.CourseSummary import CourseSummaryDB - -from sdk.schema_built.Course import CourseBase, CourseAPI - -class Semesters(Enum): - spring = 10 - summer = 20 - fall = 30 - - -class Semester(SQLModel): - year: int = Field(description='Year of semester e.g. 
```2024```.') - term: int = Field(description='Term of semester e.g. ```30```.') - - -class SemesterCourses(Semester): - courses: list[CourseAPI] = Field(default=[]) - -class SemesterSections(Semester): - sections: list[SectionAPI] = Field(default=[]) - - # attributes: list[AttributeDB] = Field(default=[]) - # courseSummaries: list[CourseSummaryDB] = Field(default=[]) - # sections: list[SectionDB] = Field(default=[], description='List of sections in the semester.') - # schedules: list[ScheduleEntryDB] = Field(default=[]) - - - # class Config: - # json_schema_extra = { - # "example": { - # # "datetime_retrieved" : "2023-04-04", - # "year": 2023, - # "term" : Semesters.spring, - # # "courses_first_day" : "2023-5-08", - # # "courses_last_day" : "2023-8-31", - # # "courses" : [CourseEnhanced.Config.json_schema_extra[0]] - # } - # } - \ No newline at end of file diff --git a/sdk/scrapers/DownloadTransferInfo.py b/sdk/scrapers/DownloadTransferInfo.py index 555417d..3692f87 100644 --- a/sdk/scrapers/DownloadTransferInfo.py +++ b/sdk/scrapers/DownloadTransferInfo.py @@ -145,7 +145,7 @@ class PageResponse(SQLModel): def parsePageRequest(data:dict, current_subject=None, current_course_code=None, current_i=0) -> PageResponse: - + r = PageResponse( current_page=data["currentPage"], total_pages=data["totalPages"], @@ -172,24 +172,31 @@ def parsePageRequest(data:dict, current_subject=None, current_course_code=None, transfer = TransferDB( - # TRA-ABST-1100-CAPU-1 - id = f'TRA-{subject}-{course_code}-{r.current_i}', - course_id=f'CRS-{subject}-{course_code}', - subject = subject, - course_code = course_code, + # TRAN-ENGL-1123-UBCV-309967 + id = f'TRAN-{subject}-{course_code}-{t["Id"]}', + + transfer_guide_id=t["Id"], + + source_credits = t["SndrCourseCredit"], + source_title = t["SndrCourseTitle"], source= t["SndrInstitutionCode"], # source_name = t["SndrInstitutionName"], - destination = t["RcvrInstitutionCode"], - # destination_name = t["RcvrInstitutionName"], + 
destination_name = t["RcvrInstitutionName"], credit = t["Detail"], condition = t["Condition"], effective_start= t["StartDate"], - effective_end = t["EndDate"] + effective_end = t["EndDate"], + + subject = subject, + course_code = course_code, + + id_course=f'CRSE-{subject}-{course_code}', + id_course_max=f'CMAX-{subject}-{course_code}' ) r.current_i += 1 @@ -231,8 +238,9 @@ def getTransferInformation(use_cache:bool, institution="LANG", institution_id:in # WOW THAT WAS PAINFUL async def getWPNonce(use_cache: bool=False, url='https://www.bctransferguide.ca/transfer-options/search-courses/') -> str | None: - if use_cache: - return "CACHE_NONCE" + # this breaks on a clean run of the code + # if use_cache: + # return "CACHE_NONCE" nonce_container = {'nonce': None} From e2261de1c9f6dd4f74b2f38d859b7a498f7e51f9 Mon Sep 17 00:00:00 2001 From: Anderson T Date: Thu, 20 Jun 2024 01:09:26 -0700 Subject: [PATCH 2/4] implement course page scraper --- Controller.py | 86 ++++++++- api.py | 4 +- sdk/schema/CourseOutline.py | 9 +- sdk/schema/CoursePage.py | 7 +- sdk/schema_built/CourseMax.py | 10 +- sdk/scrapers/LangaraCourseIndex.py | 292 +++++++++++++++++++++++++++++ 6 files changed, 394 insertions(+), 14 deletions(-) create mode 100644 sdk/scrapers/LangaraCourseIndex.py diff --git a/Controller.py b/Controller.py index 5530339..048c805 100644 --- a/Controller.py +++ b/Controller.py @@ -9,6 +9,8 @@ from sdk.schema.BaseModels import Course, Semester from sdk.schema.CourseAttribute import CourseAttributeDB +from sdk.schema.CourseOutline import CourseOutlineDB +from sdk.schema.CoursePage import CoursePageDB from sdk.schema.CourseSummary import CourseSummaryDB from sdk.schema.Section import SectionAPI, SectionDB from sdk.schema.ScheduleEntry import ScheduleEntryDB @@ -24,6 +26,8 @@ from sdk.parsers.CatalogueParser import parseCatalogueHTML from sdk.parsers.AttributesParser import parseAttributesHTML from sdk.scrapers.DownloadTransferInfo import getTransferInformation +from 
sdk.scrapers.LangaraCourseIndex import getCoursePageInfo +from sdk.scrapers.ScraperUtilities import createSession class Controller(): @@ -80,6 +84,8 @@ def buildDatabase(self, use_cache=False): print("Building database...\n") start = time.time() + + # Download, parse and save Transfer Information # Takes 20-30 minutes from live and 20 seconds from cache. print("=== FETCHING TRANSFER INFORMATION ===") @@ -88,6 +94,14 @@ def buildDatabase(self, use_cache=False): print(f"Transfer information downloaded and parsed in {Controller.timeDeltaString(start, timepoint1)}") print() + # DPS course pages from the main langara website + # Takes ?? from live and about a minute from cache. + print("=== FETCHING COURSE PAGES INFORMATION ===") + self.fetchParseSaveCoursePages(use_cache) + timepoint2 = time.time() + print(f"Langara course page information downloaded and parsed in {Controller.timeDeltaString(timepoint1, timepoint2)}") + + # Download, parse and save Langara Tnformation # Takes 20-30 minutes from live and 10 - 5 minutes from cache print("=== FETCHING SEMESTERLY INFORMATION ===") @@ -99,21 +113,52 @@ def buildDatabase(self, use_cache=False): out = self.updateSemester(year, term, use_cache) year, term = Controller.incrementTerm(year, term) - timepoint2 = time.time() - print(f"Langara information downloaded and parsed in {Controller.timeDeltaString(timepoint1, timepoint2)}") + timepoint3 = time.time() + print(f"Langara sections downloaded and parsed in {Controller.timeDeltaString(timepoint2, timepoint3)}") print() # Takes approximately 3 minutes print("=== GENERATING AGGREGATIONS & PREBUILTS ===") self.genIndexesAndPreBuilts() - timepoint3 = time.time() - print(f"Database indexes built in {Controller.timeDeltaString(timepoint2, timepoint3)}") + timepoint4 = time.time() + print(f"Database indexes built in {Controller.timeDeltaString(timepoint3, timepoint4)}") print() - print(f"Database built in {Controller.timeDeltaString(start, timepoint3)}!") - + print(f"Database built in 
{Controller.timeDeltaString(start, timepoint4)}!") + def fetchParseSaveCoursePages(self, use_cache): + web_session = createSession("database/cache/cache.db", use_cache) + courses, outlines = getCoursePageInfo(web_session) + + with Session(self.engine) as session: + for c in courses: + self.checkCourseExists(session, c.subject, c.course_code, c) + + result = session.get(CoursePageDB, c.id) + + # insert if it doesn't exist or update if it does exist + if result == None: + session.add(c) + else: + new_data = c.model_dump() + result.sqlmodel_update(new_data) + session.add(result) + + for o in outlines: + result = session.get(CourseOutlineDB, o.id) + + # insert if it doesn't exist or update if it does exist + if result == None: + session.add(o) + else: + new_data = o.model_dump() + result.sqlmodel_update(new_data) + session.add(result) + + session.commit() + + print(f"Saved {len(courses)} courses to the database.") class SemesterInternal(SQLModel): @@ -234,7 +279,7 @@ def updateSemester(self, year:int, term:int, use_cache:bool=False) -> bool | Non print(f"{year}{term} : Finished DB update.") return True - + def timeDeltaString(time1:float, time2:float) -> str: hours, rem = divmod(time2-time1, 3600) @@ -326,7 +371,6 @@ def _generateCourseIndexes(self) -> None: This takes quite a bit of effort to build... 
""" - # TODO: TO BE REPLACED BY CoursePage once scraper is implemented statement = select(CourseSummaryDB).where( CourseSummaryDB.subject == subject, CourseSummaryDB.course_code == course_code @@ -341,6 +385,32 @@ def _generateCourseIndexes(self) -> None: c.hours_seminar = r.hours_seminar c.hours_lab = r.hours_lab + # CoursePage + # We replace the attributes from CourseSummary because + # CourseSummary has information for some discontinued courses + statement = select(CoursePageDB).where( + CoursePageDB.subject == subject, + CoursePageDB.course_code == course_code + ).limit(1) + results = session.exec(statement) + r = session.exec(statement).first() + if r: + c.title = r.title + c.description = r.description + c.desc_duplicate_credit = r.desc_duplicate_credit + c.desc_registration_restriction = r.desc_registration_restriction + c.desc_prerequisite = r.desc_prerequisite + + c.credits = r.credits + c.hours_lecture = r.hours_lecture + c.hours_seminar = r.hours_seminar + c.hours_lab = r.hours_lab + + # c.university_transferrable = r.university_transferrable + c.offered_online = r.offered_online + c.preparatory_course = r.preparatory_course + + statement = select(CourseAttributeDB).where( CourseAttributeDB.subject == subject, diff --git a/api.py b/api.py index b78ffcc..00349fa 100644 --- a/api.py +++ b/api.py @@ -75,8 +75,8 @@ def daily(use_cache: bool = False): if (os.path.exists(DB_LOCATION)): print("Database found.") - controller.create_db_and_tables() - hourly(use_cache=True) + # controller.create_db_and_tables() + # hourly(use_cache=True) else: controller.create_db_and_tables() print("Database not found. 
Building database from scratch.") diff --git a/sdk/schema/CourseOutline.py b/sdk/schema/CourseOutline.py index 2fc9e2c..89218c5 100644 --- a/sdk/schema/CourseOutline.py +++ b/sdk/schema/CourseOutline.py @@ -9,7 +9,7 @@ """ class CourseOutline(SQLModel): url: str = Field(description="URL to the pdf of the course outline.") - url_name: str = Field(description="Text that links to the course outline e.g. `CPSC 1150 - Summer 2021 (v. 1)`.") + file_name: str = Field(description="Text that links to the course outline e.g. `CPSC 1150 - Summer 2021 (v. 1)`.") class CourseOutlineDB(CourseOutline, table=True): id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. OUTL-ENGL-1123-1).") @@ -19,7 +19,12 @@ class CourseOutlineDB(CourseOutline, table=True): course_code: int = Field(index=True, foreign_key="course.course_code") id_course: str = Field(index=True, foreign_key="course.id") + id_course_max : str = Field(index=True, foreign_key="coursemaxdb.id") course: Course = Relationship( sa_relationship_kwargs={"primaryjoin": "CourseOutlineDB.id_course==Course.id", "lazy": "joined"} - ) \ No newline at end of file + ) + +class CourseOutlineAPI(CourseOutline): + id: str + \ No newline at end of file diff --git a/sdk/schema/CoursePage.py b/sdk/schema/CoursePage.py index f31e235..39b0b5f 100644 --- a/sdk/schema/CoursePage.py +++ b/sdk/schema/CoursePage.py @@ -18,11 +18,16 @@ class CoursePage(SQLModel): hours_seminar: float = Field(description="Seminar hours of the course.") hours_lab: float = Field(description="Lab hours of the course.") - description: str = Field(description="Summary of the course.") + description: Optional[str] = Field(description="Summary of the course.") desc_duplicate_credit: Optional[str] = Field(description="If the credits for this course exclude credits from another course.") desc_registration_restriction: Optional[str] = Field(description="If a course is restricted or has priority registration it will say here.") 
desc_prerequisite: Optional[str] = Field(description="Prerequisites of the course are stated here.") + university_transferrable: bool = Field(description="If the course is university transferrable.") + offered_online: bool = Field(description="If there are online offerings for the course.") + preparatory_course: bool = Field(description="If the course is prepatory (ie does not offer credits.)") + + class CoursePageDB(CoursePage, table=True): id: str = Field(primary_key=True, description="Internal primary and unique key (e.g. CPGE-ENGL-1123).") # 1:1 relationship with course diff --git a/sdk/schema_built/CourseMax.py b/sdk/schema_built/CourseMax.py index 849d2d6..d2f16e6 100644 --- a/sdk/schema_built/CourseMax.py +++ b/sdk/schema_built/CourseMax.py @@ -6,7 +6,7 @@ from sqlalchemy.orm import RelationshipProperty from sdk.schema.BaseModels import Course -from sdk.schema.CourseOutline import CourseOutline +from sdk.schema.CourseOutline import CourseOutline, CourseOutlineAPI, CourseOutlineDB from sdk.schema.Section import RPEnum, SectionAPI, SectionDB from sdk.schema.Transfer import Transfer, TransferAPI, TransferDB @@ -52,6 +52,7 @@ class CourseMax(SQLModel): title: Optional[str] = Field(default=None, description="*Unabbreviated* title of the course e.g. 
```Intro to Computer Science```.") + # FROM CoursePage.py description: Optional[str] = Field(description="Summary of the course.") desc_duplicate_credit: Optional[str] = Field(description="If the credits for this course exclude credits from another course.") desc_registration_restriction: Optional[str] = Field(description="If a course is restricted or has priority registration it will say here.") @@ -61,6 +62,10 @@ class CourseMax(SQLModel): hours_seminar: Optional[float] = Field(default=None, description="Lecture hours of the course.") hours_lab: Optional[float] = Field(default=None, description="Lecture hours of the course.") + # university_transferrable: Optional[bool] = Field(description="If the course is university transferrable.") + offered_online: Optional[bool] = Field(default=None, description="If there are online offerings for the course.") + preparatory_course: Optional[bool] = Field(default=None, description="If the course is prepatory (ie does not offer credits.)") + # FROM Section.py (uses the most recent section) RP : Optional[RPEnum] = Field(default=None, description='Prerequisites of the course.') @@ -103,6 +108,8 @@ class CourseMaxDB(CourseMax, table=True): offerings: list["SectionDB"] = Relationship() + outlines: list["CourseOutlineDB"] = Relationship() + # id_course: str = Field(index=True, foreign_key="course.id") # course: Course = Relationship( # sa_relationship_kwargs={"primaryjoin": "CourseMaxDB.subject==Course.subject and CourseMaxDB.course_code==Course.course_code", "lazy": "joined"} @@ -111,6 +118,7 @@ class CourseMaxDB(CourseMax, table=True): class CourseMaxAPI(CourseMax): id: str + outlines: list["CourseOutlineAPI"] = [] transfers: list["TransferAPI"] = [] offerings: list["SectionDB"] = [] diff --git a/sdk/scrapers/LangaraCourseIndex.py b/sdk/scrapers/LangaraCourseIndex.py new file mode 100644 index 0000000..b500a64 --- /dev/null +++ b/sdk/scrapers/LangaraCourseIndex.py @@ -0,0 +1,292 @@ +import requests +from bs4 import BeautifulSoup 
+import lxml +import cchardet + +import requests_cache +from sqlmodel import Field, SQLModel + +from sdk.schema.CourseOutline import CourseOutlineDB +from sdk.schema.CoursePage import CoursePage, CoursePageDB + +from sdk.scrapers.ScraperUtilities import createSession + + + +# from typing import TYPE_CHECKING + +# if TYPE_CHECKING: +# from main import CACHE_DB_LOCATION + +class _PageSubject(SQLModel): + subject_name: str + subject_code: str + href : str + +class _PageCourse(SQLModel): + subject: str + course_code: int + href: str + + university_transferable: bool = Field(description="If the course is university transferrable.") + offered_online: bool = Field(description="If there are online offerings for the course.") + preparatory_course: bool = Field(description="If the course is prepatory (ie does not offer credits.)") + + + + +def getPageSubjectLinks(session) -> list[_PageSubject]: + # get links to the course index pages of all subjects + + url = f"https://langara.ca/programs-and-courses/courses/index.html" + + response = session.get(url) + + soup = BeautifulSoup(response.text, features="lxml") + + # Find all the
<li> tags within the container + li_tags = soup.select('div.category-column ul.grid li a') + + # Extract course names and URLs + subjects = [] + for li in li_tags: + course_name = li.get_text(strip=True) + course_url = li['href'] + subject_code = course_url.split('/')[0] # Assuming the subject code is the first part of the URL + + course = _PageSubject( + subject_name=course_name, + subject_code=subject_code, + href=course_url + ) + + subjects.append(course) + + return subjects + + +def getCoursesFromSubjectPage( + session: requests_cache.CachedSession | requests.Session, + page:_PageSubject +) -> list[_PageCourse]: + + courses = [] + + # Get the page of the subject + url = f'https://langara.ca/programs-and-courses/courses/{page.href}' + response = session.get(url) + soup = BeautifulSoup(response.text, 'lxml') + + # Find all <tr> tags + tr_tags = soup.find_all('tr')[1:] + + courses = [] + + for tr in tr_tags: + + a_tag = tr.find('a') + + # bandaid fixes for bad selecting + if a_tag == None: + continue + if 'href' not in getattr(a_tag, 'attrs', {}): + continue + + url = a_tag['href'] + + if url == "#": + continue + + full_code = a_tag.string.strip() + subject, code = full_code.split() + + # Check glyph statuses + university_transferable = 'icon-u-transfer-active' in tr.find('span', class_='icon-u-transfer')['class'] + offered_online = 'icon-online-active' in tr.find('span', class_='icon-online')['class'] + preparatory_course = 'icon-preparatory-active' in tr.find('span', class_='icon-preparatory')['class'] + + course = _PageCourse( + subject=subject, + course_code=code, + href=url, + + university_transferable=university_transferable, + offered_online=offered_online, + preparatory_course=preparatory_course) + courses.append(course) + + return courses + +def getInformationFromCoursePage( + session: requests_cache.CachedSession | requests.Session, + course:_PageCourse +) -> tuple[CoursePageDB, list[CourseOutlineDB] | None]: + + url = f'https://langara.ca{course.href}' +
response = session.get(url) + soup = BeautifulSoup(response.text, 'lxml') + + all_section_inner_divs = soup.find_all('div', class_='section-inner') + + # Iterate through each div and find the one with a child div with class 'section-inner' + section = None + for div in all_section_inner_divs: + if div.find('div', class_='section-inner'): + section = div + break + + assert section != None + + # print(section) + # input() + + # Extract the course title, subject, and code + h2_tag = section.find('h2') + full_title = h2_tag.string.strip() + subject_code, title = full_title.split(': ', 1) + subject, course_code = subject_code.split() + + # Extract the course format details + table = section.find('table', class_='table-course-detail') + rows = table.find_all('tr') + hours_lecture, hours_seminar, hours_lab = 0.0, 0.0, 0.0 + credits = 0.0 + + for row in rows: + + header = row.find('td').string.strip() + value = row.find_all('td')[1].string.strip() + + if header == "Course Format": + hours_lecture = float(value.split('Lecture ')[1].split(' h')[0]) + hours_seminar = float(value.split('Seminar ')[1].split(' h')[0]) + hours_lab = float(value.split('Lab. 
')[1].split(' h')[0]) + elif header == "Credits": + credits = float(value) + + # this breaks sometimes (AHIS 1110), bandaid fix for that + # if lecture_hours == None: + # lecture_hours = 0 + # if seminar_hours == None: + # seminar_hours = 0 + # if lab_hours == None: + # lab_hours = 0 + + description = "" + duplicate_credits = None + registration_restrictions = None + prerequisites = None + + # Extract the course description + if section.find('h3', string='Course Description') == None: + description_tag = None + else: + description_tag = section.find('h3', string='Course Description').find_next('p') + + # coding is painful sometimes + for content in description_tag: + if isinstance(content, str): + if 'registration in this course' in content: + registration_restrictions = content.strip() + elif 'receive credit' in content: + duplicate_credits = content.strip() + elif 'Prerequisite(s)' in content: + prerequisites = content.strip() + else: + if description != "": + description += "\n" + description += content.strip() + + # Extract course outlines + outlines = [] + i_outline = 0 + + outline_section = section.find('h3', text='Course Outline') + if outline_section: + ul_tag = outline_section.find_next('ul') + if ul_tag: + for li_tag in ul_tag.find_all('li'): + a_tag = li_tag.find('a') + if a_tag: + + link:str = a_tag['href'].strip() + link = link.replace("../", "") + url=f'https://langara.ca/programs-and-courses/courses/{link}' + + o = CourseOutlineDB( + url=url, + file_name=a_tag.text.strip(), + + # OUTL-ENGL-1123-1 + id=f'OUTL-{subject}-{course_code}-{i_outline}', + + subject=subject, + course_code=course_code, + id_course=f'CRSE-{subject}-{course_code}', + id_course_max=f'CMAX-{subject}-{course_code}' + ) + i_outline+=1 + outlines.append(o) + + if outlines == []: + outlines = None + + + # print(description) + # input() + + c = CoursePageDB( + # CPGE-ENGL-1123 + id=f'CPGE-{subject}-{course_code}', + subject=subject, + course_code=course_code, + title=title, + + 
credits=credits, + hours_lecture=hours_lecture, + hours_seminar=hours_seminar, + hours_lab=hours_lab, + + description=description, + desc_duplicate_credits=duplicate_credits, + desc_registration_restriction=registration_restrictions, + desc_prerequisite=prerequisites, + + university_transferrable=course.university_transferable, + offered_online=course.offered_online, + preparatory_course=course.preparatory_course, + + id_course=f'CRSE-{subject}-{course_code}' + ) + + return (c, outlines) + +# THE FUNCTION YOU SHOULD CALL IF YOU WANT COURSE PAGES +def getCoursePageInfo( + session: requests_cache.CachedSession | requests.Session +) -> tuple[list[CoursePageDB], list[CourseOutlineDB]]: + + subjects = getPageSubjectLinks(session) + courses:list[CoursePageDB] = [] + outlines: list[CourseOutlineDB] = [] + + for s in subjects: + print(f"{s.subject_code} ({s.subject_name}): Fetching course pages.") + + course_links = getCoursesFromSubjectPage(session, s) + + i=0 + for c in course_links: + c_page, c_outlines = getInformationFromCoursePage(session, c) + courses.append(c_page) + if c_outlines != None: + outlines += c_outlines + i+=1 + + print(f"{s.subject_code} ({s.subject_name}): Fetched and parsed {i} courses.") + + return (courses, outlines) + +if __name__ == "__main__": + session = createSession("database/cache/cache.db", use_cache=True) + courses, outlines = getCoursePageInfo(session) From e2bf5fb8f123d262cf162d038380990f01d3da1a Mon Sep 17 00:00:00 2001 From: Anderson T Date: Thu, 20 Jun 2024 01:18:15 -0700 Subject: [PATCH 3/4] add active bool in lieu of availability --- Controller.py | 5 ++++- sdk/schema_built/CourseMax.py | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Controller.py b/Controller.py index 048c805..ad5a0d0 100644 --- a/Controller.py +++ b/Controller.py @@ -417,7 +417,10 @@ def _generateCourseIndexes(self) -> None: CourseAttributeDB.course_code == course_code ).order_by(col(CourseAttributeDB.year).desc(), 
col(CourseAttributeDB.term).desc()).limit(1) r = session.exec(statement).first() - if r: + if not r: + c.active = False + else: + c.active = True c.attr_ar = r.attr_ar c.attr_hum = r.attr_hum c.attr_lsc = r.attr_lsc diff --git a/sdk/schema_built/CourseMax.py b/sdk/schema_built/CourseMax.py index d2f16e6..acfb711 100644 --- a/sdk/schema_built/CourseMax.py +++ b/sdk/schema_built/CourseMax.py @@ -21,6 +21,7 @@ class availabilitiesEnum(Enum): summerfall = "Summer & Fall" all = "All Semesters" unknown = "Unknown" + not_offered = "Not Offered" discontinued = "Discontinued" class PrereqEnum(Enum): @@ -89,7 +90,10 @@ class CourseMax(SQLModel): last_offered_term: Optional[int] = Field(default=None, description="The last term the course was offered e.g. ```10```.") # Derived from multiple sources - availability: Optional[availabilitiesEnum] = Field(default=None, description="(NOT IMPLEMENTED) Availability of course. Extracted automatically - may not be correct. Consult Langara advisors if in doubt.") + # NOT IMPLEMENTED BECAUSE IT SEEMS LIKE A VALUE JUDGEMENT + # availability: Optional[availabilitiesEnum] = Field(default=None, description="(NOT IMPLEMENTED) Availability of course. Extracted automatically - may not be correct. Consult Langara advisors if in doubt.") + active: Optional[bool] = Field(default=None, description="Whether a page for this course is active on the Langara website. This is not a guarantee that a course is being actively offered.") + # Funny SQLModel relationships that ARE NOT database relationships # course_outlines: list["CourseOutline"] = Relationship() # description="TODO: Course outlines for the course if available." From cef884e802eda2ac3c94a2287cad8d05a03414dc Mon Sep 17 00:00:00 2001 From: Anderson T Date: Thu, 20 Jun 2024 01:59:47 -0700 Subject: [PATCH 4/4] gah. 
--- Controller.py | 16 ++++++---------- api.py | 6 +++--- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/Controller.py b/Controller.py index ad5a0d0..09e761c 100644 --- a/Controller.py +++ b/Controller.py @@ -394,7 +394,10 @@ def _generateCourseIndexes(self) -> None: ).limit(1) results = session.exec(statement) r = session.exec(statement).first() - if r: + if r == None: + c.active = False + else: + c.active = True c.title = r.title c.description = r.description c.desc_duplicate_credit = r.desc_duplicate_credit @@ -410,17 +413,13 @@ def _generateCourseIndexes(self) -> None: c.offered_online = r.offered_online c.preparatory_course = r.preparatory_course - statement = select(CourseAttributeDB).where( CourseAttributeDB.subject == subject, CourseAttributeDB.course_code == course_code ).order_by(col(CourseAttributeDB.year).desc(), col(CourseAttributeDB.term).desc()).limit(1) r = session.exec(statement).first() - if not r: - c.active = False - else: - c.active = True + if r: c.attr_ar = r.attr_ar c.attr_hum = r.attr_hum c.attr_lsc = r.attr_lsc @@ -458,7 +457,7 @@ def _generateCourseIndexes(self) -> None: if r.source_credits != None and c.credits == None: c.credits = r.source_credits - # generate availability + # generate some aggregate values statement = select(SectionDB).where( SectionDB.subject == subject, SectionDB.course_code == course_code @@ -478,9 +477,6 @@ def _generateCourseIndexes(self) -> None: c.first_offered_year = r.year c.first_offered_term = r.term - - # TODO: calculate availability here. - c.availability = None # save CourseMax to the database once we are done diff --git a/api.py b/api.py index 00349fa..d1920a3 100644 --- a/api.py +++ b/api.py @@ -75,12 +75,12 @@ def daily(use_cache: bool = False): if (os.path.exists(DB_LOCATION)): print("Database found.") - # controller.create_db_and_tables() - # hourly(use_cache=True) -else: controller.create_db_and_tables() + hourly(use_cache=True) +else: print("Database not found. 
Building database from scratch.") # save results to cache if cache doesn't exist + controller.create_db_and_tables() controller.buildDatabase(use_cache=True)