From f0fdca3428885df70ae73f3918adda8a8b90f072 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Fri, 12 May 2017 13:02:53 +0100 Subject: [PATCH 1/4] Update start url The previous URL now redirects to http://www.tucamarapr.org/dnncamara/web/Inicio.aspx --- scraper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper.rb b/scraper.rb index e71defa5..198de855 100644 --- a/scraper.rb +++ b/scraper.rb @@ -58,7 +58,7 @@ class MembersPage < Scraped::HTML end end -start = 'http://www.tucamarapr.org/dnncamara/web/composiciondelacamara.aspx' +start = 'http://www.tucamarapr.org/dnncamara/web/ComposiciondelaCamara/Biografia.aspx' page = MembersPage.new(response: Scraped::Request.new(url: start).response) data = page.members.map(&:to_h) data.each { |mem| puts mem.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h } if ENV['MORPH_DEBUG'] From 2e093bb5826bb4a8fdd38567b988ab9a526ccbcd Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Fri, 12 May 2017 13:15:01 +0100 Subject: [PATCH 2/4] Update css selectors for individual member rows --- scraper.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper.rb b/scraper.rb index 198de855..c5d623fc 100644 --- a/scraper.rb +++ b/scraper.rb @@ -52,7 +52,7 @@ class MembersPage < Scraped::HTML decorator Scraped::Response::Decorator::CleanUrls field :members do - noko.css('div.info-block div.info-wrap').map do |div| + noko.css('.list-article .selectionRep').map do |div| fragment div => MemberDiv end end From dca9f6883d3bed3d584dae5285c53f32c4240215 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Fri, 12 May 2017 14:30:56 +0100 Subject: [PATCH 3/4] Update MemberDiv for new site layout Phone, fax, party and contact information are no longer listed in individual member rows. Contact form urls are no longer listed in either member rows or on member pages. Phone, fax and party info will be captured in MemberPage which will be added in a forthcoming commit. --- lib/member_div.rb | 31 +++++++++++++++++++++++++++++++ scraper.rb | 37 +------------------------------------ 2 files changed, 32 insertions(+), 36 deletions(-) create mode 100644 lib/member_div.rb diff --git a/lib/member_div.rb b/lib/member_div.rb new file mode 100644 index 00000000..4ebbad71 --- /dev/null +++ b/lib/member_div.rb @@ -0,0 +1,31 @@ +# frozen_string_literal: true + +require 'scraped' + +class MemberDiv < Scraped::HTML + field :id do + source.split('=').last + end + + field :name do + bio.first.sub('Hon. ', '') + end + + field :area do + bio.last.split(/del|por/).last.tidy + end + + field :image do + noko.at_css('img @src').text + end + + field :source do + noko.at_css('a @href').text + end + + private + + def bio + noko.at_css('.biodiv').text.split("\n").map(&:tidy).reject(&:empty?) + end +end diff --git a/scraper.rb b/scraper.rb index c5d623fc..9df5b767 100644 --- a/scraper.rb +++ b/scraper.rb @@ -10,43 +10,8 @@ # OpenURI::Cache.cache_path = '.cache' require 'scraped_page_archive/open-uri' -class MemberDiv < Scraped::HTML - field :id do - noko.css('a.more-info/@href').text[/rep=(\d+)/, 1] - end - - field :name do - noko.xpath('.//span[@class="info"]//span[@class="name"]/text()').text.split(' - ').first.tidy.sub('Hon. ', '') - end - - field :party do - noko.css('.info .party').text.tidy - end - - field :area do - noko.css('.info .district').text.tidy - end - - field :image do - noko.css('.identity img/@src').text - end +require_rel 'lib' - field :phone do - noko.xpath('.//span[@class="data-type" and contains(.,"Tel:")]').map { |n| n.text.sub('Tel:', '').tidy }.reject(&:empty?).join(' / ') - end - - field :fax do - noko.xpath('.//span[@class="data-type" and contains(.,"Fax:")]').map { |n| n.text.sub('Fax:', '').tidy }.reject(&:empty?).join(' / ') - end - - field :contact_form do - noko.css('a.mail/@href').text - end - - field :source do - noko.css('a.more-info/@href').text - end -end class MembersPage < Scraped::HTML decorator Scraped::Response::Decorator::CleanUrls From 60de3147c3d626d842b6371e731ca271bd606248 Mon Sep 17 00:00:00 2001 From: Oliver Denman Date: Fri, 12 May 2017 14:37:59 +0100 Subject: [PATCH 4/4] Add MemberPage --- lib/member_page.rb | 33 +++++++++++++++++++++++++++++++++ scraper.rb | 13 ++++++++----- 2 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 lib/member_page.rb diff --git a/lib/member_page.rb b/lib/member_page.rb new file mode 100644 index 00000000..4c5c29bb --- /dev/null +++ b/lib/member_page.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require 'scraped' + +class MemberPage < Scraped::HTML + field :party do + noko.at_css('.partyBio').text.tidy + end + + field :phone do + contact_numbers_for('Tel') + end + + field :fax do + contact_numbers_for('Fax') + end + + field :tty do + contact_numbers_for('TTY') + end + + private + + def contact_numbers + noko.xpath('.//span[@class="data-type"]') + end + + def contact_numbers_for(str) + contact_numbers.xpath("text()[contains(.,'#{str}')]").map do |n| + n.text.gsub("#{str}.", '').tidy + end.reject(&:empty?).join(';') + end +end diff --git a/scraper.rb b/scraper.rb index 9df5b767..ef796f78 100644 --- a/scraper.rb +++ b/scraper.rb @@ -12,6 +12,10 @@ require_rel 'lib' +def scrape(h) + url, klass = h.to_a.first + klass.new(response: Scraped::Request.new(url: url).response) +end class MembersPage < Scraped::HTML decorator Scraped::Response::Decorator::CleanUrls @@ -24,12 +28,11 @@ class MembersPage < Scraped::HTML end start = 'http://www.tucamarapr.org/dnncamara/web/ComposiciondelaCamara/Biografia.aspx' -page = MembersPage.new(response: Scraped::Request.new(url: start).response) -data = page.members.map(&:to_h) +page = scrape start => MembersPage +data = page.members.map do |member| + member.to_h.merge((scrape member.source => MemberPage).to_h) +end data.each { |mem| puts mem.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h } if ENV['MORPH_DEBUG'] ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil ScraperWiki.save_sqlite(%i[id party area], data) - -# visit each 'source' page to archive it -data.each { |p| open(p[:source]).read }