Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Site layout changes #8

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions lib/member_div.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true

require 'scraped'

# One member entry from the members listing. Every field is read from the
# entry's own markup (`noko`).
class MemberDiv < Scraped::HTML
  # Last '='-separated piece of the member's source link.
  field :id do
    pieces = source.split('=')
    pieces.last
  end

  # First non-empty bio line, with the first 'Hon. ' removed.
  field :name do
    heading = bio_lines.first
    heading.sub('Hon. ', '')
  end

  # Last piece of the final bio line once it is split on 'del' / 'por'.
  field :area do
    pieces = bio_lines.last.split(/del|por/)
    pieces.last.tidy
  end

  # Text of the first node matching 'img @src'.
  field :image do
    src = noko.at_css('img @src')
    src.text
  end

  # Text of the first node matching 'a @href'.
  field :source do
    href = noko.at_css('a @href')
    href.text
  end

  private

  # Tidied, non-empty lines of the `.biodiv` element's text.
  def bio_lines
    raw_lines = noko.at_css('.biodiv').text.split("\n")
    raw_lines.map(&:tidy).reject(&:empty?)
  end
end
33 changes: 33 additions & 0 deletions lib/member_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# frozen_string_literal: true

require 'scraped'

# Detail page for a single member: party affiliation plus contact numbers.
class MemberPage < Scraped::HTML
  # Tidied text of the `.partyBio` element.
  field :party do
    party_node = noko.at_css('.partyBio')
    party_node.text.tidy
  end

  # :phone, :fax and :tty differ only in the label they look for, so they are
  # generated from one table (declared in the same order as before).
  { phone: 'Tel', fax: 'Fax', tty: 'TTY' }.each do |field_name, label|
    field(field_name) { contact_numbers_for(label) }
  end

  private

  # All `span` elements whose class attribute is exactly "data-type".
  def contact_numbers
    noko.xpath('.//span[@class="data-type"]')
  end

  # Collects the text nodes under the data-type spans that contain +label+,
  # strips the "<label>." marker from each, tidies them, drops empties and
  # joins what is left with ';'.
  def contact_numbers_for(label)
    text_nodes = contact_numbers.xpath("text()[contains(.,'#{label}')]")
    values = text_nodes.map { |node| node.text.gsub("#{label}.", '').tidy }
    values.reject(&:empty?).join(';')
  end
end
52 changes: 10 additions & 42 deletions scraper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,61 +10,29 @@
# OpenURI::Cache.cache_path = '.cache'
require 'scraped_page_archive/open-uri'

# NOTE(review): this appears to be the previous inline MemberDiv that the diff
# removes from scraper.rb; removed and added lines are shown interleaved, so
# no closing `end` for the class is visible in this view — confirm against the
# real file before relying on it.
class MemberDiv < Scraped::HTML
# Digits following `rep=` in the href matched by `a.more-info/@href`.
field :id do
noko.css('a.more-info/@href').text[/rep=(\d+)/, 1]
end

# Text of the name span up to the first ' - ', tidied, with the first
# 'Hon. ' removed.
field :name do
noko.xpath('.//span[@class="info"]//span[@class="name"]/text()').text.split(' - ').first.tidy.sub('Hon. ', '')
end

# Tidied text of `.info .party`.
field :party do
noko.css('.info .party').text.tidy
end

# Tidied text of `.info .district`.
field :area do
noko.css('.info .district').text.tidy
end

# Text of the `src` attribute matched by `.identity img/@src`.
field :image do
noko.css('.identity img/@src').text
end
# NOTE(review): presumably loads the files under lib/ (the MemberDiv and
# MemberPage classes added by this change) — confirm.
require_rel 'lib'

# Values of the data-type spans containing "Tel:", with the first "Tel:"
# removed, tidied, empties dropped, joined with ' / '.
field :phone do
noko.xpath('.//span[@class="data-type" and contains(.,"Tel:")]').map { |n| n.text.sub('Tel:', '').tidy }.reject(&:empty?).join(' / ')
end

# Same as :phone, for spans containing "Fax:".
field :fax do
noko.xpath('.//span[@class="data-type" and contains(.,"Fax:")]').map { |n| n.text.sub('Fax:', '').tidy }.reject(&:empty?).join(' / ')
end

# Text of the href matched by `a.mail/@href`.
field :contact_form do
noko.css('a.mail/@href').text
end

# Text of the href matched by `a.more-info/@href` (the attribute :id parses).
field :source do
noko.css('a.more-info/@href').text
end
# Fetches a URL and wraps the response in a page class.
#
# @param h [Hash] a single `url => page_class` pair; only the first pair is used
# @return [Object] `page_class.new(response: ...)` built from the fetched response
def scrape(h)
  url, klass = h.to_a.first
  response = Scraped::Request.new(url: url).response
  klass.new(response: response)
end

# Members listing page: exposes one MemberDiv fragment per member entry.
class MembersPage < Scraped::HTML
# NOTE(review): presumably rewrites URLs in the response before the fields
# read them — confirm against the scraped gem's CleanUrls decorator.
decorator Scraped::Response::Decorator::CleanUrls

# Maps each matched member container to a MemberDiv fragment.
field :members do
# NOTE(review): the next two lines look like the old and the new selector for
# the same loop, shown together by the diff view; only one is expected in the
# real file — confirm.
noko.css('div.info-block div.info-wrap').map do |div|
noko.css('.list-article .selectionRep').map do |div|
fragment div => MemberDiv
end
end
end

# NOTE(review): old and new statements appear interleaved by the diff view —
# the first three assignments look like the removed version and the next
# three like their replacements; confirm against the real file.
start = 'http://www.tucamarapr.org/dnncamara/web/composiciondelacamara.aspx'
page = MembersPage.new(response: Scraped::Request.new(url: start).response)
data = page.members.map(&:to_h)
start = 'http://www.tucamarapr.org/dnncamara/web/ComposiciondelaCamara/Biografia.aspx'
page = scrape start => MembersPage
# Merge each listing entry with the fields scraped from its own source page.
data = page.members.map do |member|
member.to_h.merge((scrape member.source => MemberPage).to_h)
end
# With MORPH_DEBUG set, print each record's non-empty fields sorted by key.
data.each { |mem| puts mem.reject { |_, v| v.to_s.empty? }.sort_by { |k, _| k }.to_h } if ENV['MORPH_DEBUG']

# Drop the existing table; `rescue nil` swallows any StandardError from the drop.
ScraperWiki.sqliteexecute('DROP TABLE data') rescue nil
# Save the records, passing id/party/area as the first argument
# (presumably the unique keys — confirm against the ScraperWiki API).
ScraperWiki.save_sqlite(%i[id party area], data)

# visit each 'source' page to archive it
data.each { |p| open(p[:source]).read }