From 598e59be9d89305438d45275372a606524dcd47f Mon Sep 17 00:00:00 2001 From: Tony Bowden Date: Mon, 6 Feb 2017 11:40:34 +0000 Subject: [PATCH 1/7] WIP: Add Scraper/ScraperRun classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First stab at factoring out a Scraper class and a ScraperRun class. The IndexToMembers class is probably overkill for now — we can find the higher level abstractions like that after we nail down the basics — but it's probably useful to include in this initial WIP which is more for feedback than expected to actually merge yet. --- scraper.rb | 90 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 15 deletions(-) diff --git a/scraper.rb b/scraper.rb index 4cb6c9d6a11..9ae27054382 100644 --- a/scraper.rb +++ b/scraper.rb @@ -48,10 +48,6 @@ class MemberRow < Scraped::HTML area_data.first end - field :term do - 2012 - end - field :source do tds[1].css('a/@href').text end @@ -105,17 +101,81 @@ def box end end -def scrape(h) - url, klass = h.to_a.first - klass.new(response: Scraped::Request.new(url: url).response) -end +module EveryPolitician + class ScraperRun + def initialize(id: SecureRandom.uuid, table: 'data', index_fields: nil, default_index_fields: %i(id term start_date)) + @run_data = { id: id, started: Time.now } + @table = table + @index_fields = index_fields + @default_index_fields = default_index_fields + ScraperWiki.save_sqlite(%i(id), run_data, 'runs') + ScraperWiki.sqliteexecute('DELETE FROM %s' % table) rescue nil + end -start = 'http://www.cdep.ro/pls/parlam/structura2015.de?leg=2012&idl=2' -data = scrape(start => MembersPage).members.map do |mem| - mem.to_h.merge(scrape(mem.source => MemberPage).to_h) -end + def index_fields_from(data) + index_fields || (data.keys & default_index_fields) + end + + def save_all(data, debugging: ENV['MORPH_PRINT_DATA']) + data.each { |r| puts r.reject { |_, v| v.to_s.empty? 
}.sort_by { |k, _| k }.to_h } if debugging + ScraperWiki.save_sqlite(index_fields_from(data), data, table) + ScraperWiki.save_sqlite(%i(id), run_data.merge(ended: Time.now), 'runs') + end + + def error(e) + # TODO: do something better with the error + warn e + ScraperWiki.save_sqlite(%i(id), run_data.merge(errored: Time.now), 'runs') + end + + private + + attr_reader :run_data, :table, :index_fields, :default_index_fields + end -# puts data.map { |r| r.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h } + class Scraper + def initialize + @scraper_run = EveryPolitician::ScraperRun.new + end + + def run + scraper_run.save_all(data) + rescue => e + scraper_run.error(e) + end + + private + + def scrape(h) + url, klass = h.to_a.first + klass.new(response: Scraped::Request.new(url: url).response) + end + + class IndexToMembers < Scraper + def initialize(url:, members_class:, member_class:, default_data: {}) + @url = url + @members_class = members_class + @member_class = member_class + @default_data = default_data + super() + end + + def data + scrape(url => members_class).members.map do |mem| + default_data.merge(mem.to_h).merge(scrape(mem.source => member_class).to_h) + end + end + + private + + attr_reader :scraper_run, :url, :members_class, :member_class, :default_data + end + end +end -ScraperWiki.sqliteexecute('DELETE FROM data') rescue nil -ScraperWiki.save_sqlite(%i(id term), data) +EveryPolitician::Scraper::IndexToMembers.new( + url: 'http://www.cdep.ro/pls/parlam/structura2015.de?leg=2012&idl=2', + members_class: MembersPage, + member_class: MemberPage, + default_data: { term: 2012 } +).run From 0cfe17097d215765e5c048e041d3975ec26a9d9d Mon Sep 17 00:00:00 2001 From: Chris Mytton Date: Mon, 13 Feb 2017 13:30:10 +0100 Subject: [PATCH 2/7] Raise error after saving to runs table This gives us the full stack trace and ensures that the program actually exits after an error. 
--- scraper.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper.rb b/scraper.rb index 9ae27054382..a2ddc551f80 100644 --- a/scraper.rb +++ b/scraper.rb @@ -123,9 +123,9 @@ def save_all(data, debugging: ENV['MORPH_PRINT_DATA']) end def error(e) - # TODO: do something better with the error - warn e ScraperWiki.save_sqlite(%i(id), run_data.merge(errored: Time.now), 'runs') + # TODO: do something better with the error + raise e end private From 98737bf158af480e76b7fa7aacd17f061b28b24f Mon Sep 17 00:00:00 2001 From: Chris Mytton Date: Mon, 13 Feb 2017 13:31:14 +0100 Subject: [PATCH 3/7] Fix bug in index_fields_from This was trying to call `#keys` on an array, which was raising an error. For now just use the first item from the data array. --- scraper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper.rb b/scraper.rb index a2ddc551f80..e23dc035acf 100644 --- a/scraper.rb +++ b/scraper.rb @@ -113,7 +113,7 @@ def initialize(id: SecureRandom.uuid, table: 'data', index_fields: nil, default_ end def index_fields_from(data) - index_fields || (data.keys & default_index_fields) + index_fields || (data.first.keys & default_index_fields) end def save_all(data, debugging: ENV['MORPH_PRINT_DATA']) From 1a084ee54956738e81c19e10e79c66d7c2618497 Mon Sep 17 00:00:00 2001 From: Chris Mytton Date: Mon, 13 Feb 2017 13:31:54 +0100 Subject: [PATCH 4/7] Remove start_date from default_index_fields Not all records seem to have a start date, so removing this from the list of default index fields for now. 
--- scraper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper.rb b/scraper.rb index e23dc035acf..9d8ab8f233e 100644 --- a/scraper.rb +++ b/scraper.rb @@ -103,7 +103,7 @@ def box module EveryPolitician class ScraperRun - def initialize(id: SecureRandom.uuid, table: 'data', index_fields: nil, default_index_fields: %i(id term start_date)) + def initialize(id: SecureRandom.uuid, table: 'data', index_fields: nil, default_index_fields: %i(id term)) @run_data = { id: id, started: Time.now } @table = table @index_fields = index_fields From 119a86d748fd9daaece7ca1f6a9ea4dd7f5272b4 Mon Sep 17 00:00:00 2001 From: Chris Mytton Date: Mon, 13 Feb 2017 18:28:15 +0100 Subject: [PATCH 5/7] Make index_fields_from method private This doesn't need to be a public method as it's only used internally. --- scraper.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scraper.rb b/scraper.rb index 9d8ab8f233e..1f88dd257c7 100644 --- a/scraper.rb +++ b/scraper.rb @@ -112,10 +112,6 @@ def initialize(id: SecureRandom.uuid, table: 'data', index_fields: nil, default_ ScraperWiki.sqliteexecute('DELETE FROM %s' % table) rescue nil end - def index_fields_from(data) - index_fields || (data.first.keys & default_index_fields) - end - def save_all(data, debugging: ENV['MORPH_PRINT_DATA']) data.each { |r| puts r.reject { |_, v| v.to_s.empty? 
}.sort_by { |k, _| k }.to_h } if debugging ScraperWiki.save_sqlite(index_fields_from(data), data, table) @@ -131,6 +127,10 @@ def error(e) private attr_reader :run_data, :table, :index_fields, :default_index_fields + + def index_fields_from(data) + index_fields || (data.first.keys & default_index_fields) + end end class Scraper From ccfe9704d144bf423e066846194fb3c57da5e0e4 Mon Sep 17 00:00:00 2001 From: Chris Mytton Date: Mon, 13 Feb 2017 18:29:18 +0100 Subject: [PATCH 6/7] Replace IndexToMembers class with RomanianParliament class Rather than trying to find the correct generic abstraction this forces the scrapers to have a class that encapsulates navigating a website to get the correct content. In the future we can potentially wrap these up in `IndexToMembers` type classes, but for now it seems easier to just do scraper specific ones until we find the correct abstraction. --- scraper.rb | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/scraper.rb b/scraper.rb index 1f88dd257c7..a3705529cc6 100644 --- a/scraper.rb +++ b/scraper.rb @@ -134,8 +134,9 @@ def index_fields_from(data) end class Scraper - def initialize - @scraper_run = EveryPolitician::ScraperRun.new + def initialize(url:, default_data: {}) + @url = url + @default_data = default_data end def run @@ -146,36 +147,28 @@ def run private + attr_reader :url, :default_data + + def scraper_run + @scraper_run = EveryPolitician::ScraperRun.new + end + def scrape(h) url, klass = h.to_a.first klass.new(response: Scraped::Request.new(url: url).response) end + end +end - class IndexToMembers < Scraper - def initialize(url:, members_class:, member_class:, default_data: {}) - @url = url - @members_class = members_class - @member_class = member_class - @default_data = default_data - super() - end - - def data - scrape(url => members_class).members.map do |mem| - default_data.merge(mem.to_h).merge(scrape(mem.source => member_class).to_h) - end - end - - private 
- - attr_reader :scraper_run, :url, :members_class, :member_class, :default_data +class RomanianParliament < EveryPolitician::Scraper + def data + scrape(url => MembersPage).members.map do |mem| + default_data.merge(mem.to_h).merge(scrape(mem.source => MemberPage).to_h) end end end -EveryPolitician::Scraper::IndexToMembers.new( - url: 'http://www.cdep.ro/pls/parlam/structura2015.de?leg=2012&idl=2', - members_class: MembersPage, - member_class: MemberPage, - default_data: { term: 2012 } +RomanianParliament.new( + url: 'http://www.cdep.ro/pls/parlam/structura2015.de?leg=2012&idl=2', + default_data: { term: 2012 } ).run From a1b813f72eeef7d743b35b8d123c36b088122488 Mon Sep 17 00:00:00 2001 From: Chris Mytton Date: Mon, 13 Feb 2017 19:34:27 +0100 Subject: [PATCH 7/7] Better name for scraper class --- scraper.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scraper.rb b/scraper.rb index a3705529cc6..a9a84ad7ac6 100644 --- a/scraper.rb +++ b/scraper.rb @@ -160,7 +160,7 @@ def scrape(h) end end -class RomanianParliament < EveryPolitician::Scraper +class RomanianParliamentScraper < EveryPolitician::Scraper def data scrape(url => MembersPage).members.map do |mem| default_data.merge(mem.to_h).merge(scrape(mem.source => MemberPage).to_h) @@ -168,7 +168,7 @@ def data end end -RomanianParliament.new( +RomanianParliamentScraper.new( url: 'http://www.cdep.ro/pls/parlam/structura2015.de?leg=2012&idl=2', default_data: { term: 2012 } ).run