This repository has been archived by the owner on Dec 10, 2017. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from compserv/brian/scraper
Brian/scraper
- Loading branch information
Showing
2 changed files
with
198 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
course_info_*.csv | ||
crosslists_*.txt | ||
errors_*.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
#!/usr/bin/env ruby | ||
|
||
# This script constructs a csv for scheduling course surveys from the EECS | ||
# schedule. | ||
# + http://www.eecs.berkeley.edu/Scheduling/EE/schedule.html | ||
# + http://www.eecs.berkeley.edu/Scheduling/CS/schedule.html | ||
# | ||
# It also scrapes enrollment data from osoc.berkeley.edu. They aren't real big | ||
# fans of people doing that, but I think if we limit ourselves to not real-time | ||
# stats, we might be okay. If the API ever works, we should try to use that. | ||
# + http://osoc.berkeley.edu/OSOC/osoc | ||
# | ||
# In theory, this could break at any time, but it's probably okay since all | ||
# those pages are auto-generated. It should take <10 seconds to run; if it takes | ||
# longer, OSOC probably doesn't like us. | ||
# - brian | ||
|
||
require 'nokogiri' | ||
require 'open-uri' | ||
require 'csv' | ||
require 'set' | ||
|
||
EE_URL = 'http://www.eecs.berkeley.edu/Scheduling/EE/schedule.html' | ||
CS_URL = 'http://www.eecs.berkeley.edu/Scheduling/CS/schedule.html' | ||
OSOC_URL = 'http://osoc.berkeley.edu/OSOC/osoc' | ||
|
||
# Use Nokogiri to actually scrape the pages | ||
|
||
def parse_schedule(url) | ||
# Turn class schedule into an array of arrays. | ||
page = Nokogiri::HTML(open(url)) | ||
schedule = page.search('table')[5] | ||
rows = schedule.search('tr') | ||
rows.map { |row| parse_row(row) } | ||
end | ||
|
||
def parse_row(row) | ||
# Return an array of [CCN, Course, Section, Type, Title, Instructor, Day/Time, | ||
# Location, Units, Exam Group, is_main?] | ||
cells = row.search('th', 'td').map { |cell| cell.content } | ||
cells << (!row.search('strong').empty?) # if class is main section | ||
end | ||
|
||
def update_enrollment(courses, term = "SP") | ||
# Mutate the Course objects and return a list of courses that are on the | ||
# Berkeley schedule, but not on the EECS schedule. | ||
ee_enroll = parse_enrollment('Electrical+Engineering', term) | ||
cs_enroll = parse_enrollment('Computer+Science', term) | ||
|
||
errors = [] | ||
(ee_enroll + cs_enroll).each do |course, enrolled| | ||
if courses.key?(course) | ||
courses[course].enrollment = enrolled | ||
else | ||
errors << "#{course} (#{enrolled} enrolled)" | ||
end | ||
end | ||
errors | ||
end | ||
|
||
def parse_enrollment(dept, term) | ||
# Return array of [:course, enrollment] from all OSOC pages. | ||
url = OSOC_URL + "?p_term=#{term}&p_deptname=#{dept}" | ||
row = 1 | ||
pages = [] | ||
while true | ||
puts "Begging #{url} for page #{row/100}" | ||
pages << Nokogiri::HTML(open(url + "&p_start_row=#{row}")) | ||
first_label = pages.last.search('label')[0].text | ||
first_label == 'see next results' ? row += 100 : break | ||
end | ||
pages.map { |page| parse_osoc(page) }.flatten(1) | ||
end | ||
|
||
def parse_osoc(osoc_page) | ||
# Return array of [:course, enrollment] from one OSOC page. | ||
result = [] | ||
osoc_page.search('table')[1..-2].each do |section| | ||
cells = section.search('td') | ||
name = cells[2].text.split | ||
course = "#{name[0][0]}#{name[1][0]}#{name[2]}-#{name[-2].gsub(/^0*/, '')}" | ||
units = cells[14].text | ||
enrollment = cells[-2].text.split[1].delete('Enrolled:') | ||
if !units.empty? # To avoid discussion/lab sections | ||
result << [course.downcase.to_sym, enrollment] | ||
end | ||
end | ||
result | ||
end | ||
|
||
# Turn the scraped information into a hash of Course objects | ||
|
||
class Course | ||
attr_reader :name, :section, :prof_list, :time, :place | ||
attr_accessor :ta_list, :enrollment | ||
|
||
def initialize(row) | ||
@ccn, @name, @section, _, @title, @prof_list, @time, @place = *row | ||
@ta_list = Set.new | ||
@enrollment = 'UNKNOWN' | ||
|
||
# for csv formatting | ||
@prof_list = @prof_list.split('; ') | ||
@prof_list << '' while @prof_list.size < 2 | ||
@time = @time.gsub(/;/, ',') | ||
end | ||
|
||
def add_tas(row) | ||
@ta_list.merge(row[5].split('; ')) | ||
end | ||
|
||
def to_sym | ||
"#{@name}-#{@section}".delete(' ').downcase.to_sym | ||
end | ||
|
||
def to_a | ||
[@name, @section, @enrollment, *@prof_list, @time, @place, *@ta_list] | ||
end | ||
end | ||
|
||
def make_courses(rows) | ||
# Turn table into hash of {:courses => <Courses>} | ||
courses = {} | ||
next_course = [] | ||
rows.each do |row| | ||
if row.last && !next_course.empty? # this row is main section | ||
courses.update(make_course(next_course)) | ||
next_course = [] | ||
end | ||
next_course << row | ||
end | ||
courses.update(make_course(next_course)) # don't forget last class | ||
end | ||
|
||
def make_course(course_rows) | ||
# Turn rows for a course into a hash of {:course => <Course>} | ||
course = Course.new(course_rows.first) | ||
course_rows.drop(1).each { |row| course.add_tas(row) } | ||
{course.to_sym => course} | ||
end | ||
|
||
def write_course_info(courses, date) | ||
CSV.open("course_info_#{date}.csv", 'wb') do |csv| | ||
courses.each { |course| csv << course.to_a } | ||
end | ||
end | ||
|
||
def find_overlap(info, block) | ||
# Return a map sorting elements of info into buckets, according to block. | ||
# Only keep buckets with more than one element. | ||
map = Hash.new { |h, k| h[k] = [] } | ||
info.each { |elem| map[block[elem]] << elem } | ||
map.delete_if { |_, v| v.size == 1 } | ||
end | ||
|
||
def write_crosslists(courses, date) | ||
File.open("crosslists_#{date}.txt", 'w') do |file| | ||
file.puts("The following courses *might* be cross-listed.", "\n") | ||
courses.select! { |course| course.time != 'UNSCHED' } | ||
by_times = find_overlap(courses, ->(course) { course.time }) | ||
by_times.each do |time, courses| | ||
by_prof = find_overlap(courses, ->(course) { course.prof_list.join('; ') }) | ||
by_prof.each do |profs, courses| | ||
file.puts(time, profs, courses.map { |course| course.name }.join(', '), "\n") | ||
end | ||
end | ||
end | ||
end | ||
|
||
def write_errors(errors, date) | ||
File.open("errors_#{date}.txt", 'w') do |file| | ||
file.puts "The following courses may be missing from course_info_#{date}.csv" | ||
file.puts errors | ||
end | ||
end | ||
|
||
def main | ||
ee, cs = parse_schedule(EE_URL), parse_schedule(CS_URL) | ||
all_classes = ee.drop(1) + cs.drop(1) | ||
courses = make_courses(all_classes) | ||
osoc_errors = update_enrollment(courses) | ||
|
||
date = Time.now.strftime('%Y%m%d') | ||
head = [%w(Course Sec Enrolled Instructor(1) Instructor(2) Time Place TA's)] | ||
write_course_info(head + courses.values, date) | ||
write_crosslists(courses.values, date) | ||
puts "Written to course_info_#{date}.csv and crosslists_#{date}.txt." | ||
|
||
if !osoc_errors.empty? | ||
write_errors(osoc_errors, date) | ||
puts "Errors written to errors_#{date}.txt" | ||
end | ||
end | ||
|
||
main |