WIP: Add Scraper/ScraperRun classes #7

Open · wants to merge 8 commits into master
scraper.rb: 81 changes (67 additions, 14 deletions)
@@ -48,10 +48,6 @@ class MemberRow < Scraped::HTML
     area_data.first
   end
 
-  field :term do
-    2012
-  end
-
   field :source do
     tds[1].css('a/@href').text
   end
@@ -105,17 +101,74 @@ def box
   end
 end
 
-def scrape(h)
-  url, klass = h.to_a.first
-  klass.new(response: Scraped::Request.new(url: url).response)
-end
+module EveryPolitician
+  class ScraperRun
+    def initialize(id: SecureRandom.uuid, table: 'data', index_fields: nil, default_index_fields: %i(id term))
+      @run_data = { id: id, started: Time.now }
+      @table = table
+      @index_fields = index_fields
+      @default_index_fields = default_index_fields
+      ScraperWiki.save_sqlite(%i(id), run_data, 'runs')
+      ScraperWiki.sqliteexecute('DELETE FROM %s' % table) rescue nil
+    end
+
+    def save_all(data, debugging: ENV['MORPH_PRINT_DATA'])
+      data.each { |r| puts r.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h } if debugging
+      ScraperWiki.save_sqlite(index_fields_from(data), data, table)
+      ScraperWiki.save_sqlite(%i(id), run_data.merge(ended: Time.now), 'runs')
+    end
+
+    def error(e)
+      ScraperWiki.save_sqlite(%i(id), run_data.merge(errored: Time.now), 'runs')
+      # TODO: do something better with the error
+      raise e
+    end
+
+    private
+
+    attr_reader :run_data, :table, :index_fields, :default_index_fields
+
+    def index_fields_from(data)
+      index_fields || (data.first.keys & default_index_fields)
+    end
+  end
 
-start = 'http://www.cdep.ro/pls/parlam/structura2015.de?leg=2012&idl=2'
-data = scrape(start => MembersPage).members.map do |mem|
-  mem.to_h.merge(scrape(mem.source => MemberPage).to_h)
+  class Scraper
+    def initialize(url:, default_data: {})
+      @url = url
+      @default_data = default_data
+    end
+
+    def run
+      scraper_run.save_all(data)
+    rescue => e
+      scraper_run.error(e)
+    end
+
+    private
+
+    attr_reader :url, :default_data
+
+    def scraper_run
+      @scraper_run ||= EveryPolitician::ScraperRun.new
+    end
+
+    def scrape(h)
+      url, klass = h.to_a.first
+      klass.new(response: Scraped::Request.new(url: url).response)
+    end
+  end
 end
 
-# puts data.map { |r| r.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h }
+class RomanianParliamentScraper < EveryPolitician::Scraper
+  def data
+    scrape(url => MembersPage).members.map do |mem|
+      default_data.merge(mem.to_h).merge(scrape(mem.source => MemberPage).to_h)
+    end
+  end
+end
 
-ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil
-ScraperWiki.save_sqlite(%i(id term), data)
+RomanianParliamentScraper.new(
+  url: 'http://www.cdep.ro/pls/parlam/structura2015.de?leg=2012&idl=2',
+  default_data: { term: 2012 }
+).run