From d40f9420d889bfca0ecb1659ea5c32270c85afc8 Mon Sep 17 00:00:00 2001 From: watsy0007 Date: Fri, 21 Oct 2016 13:18:48 +0800 Subject: [PATCH 1/5] add cookies support --- lib/wombat.rb | 6 +++++- lib/wombat/processing/parser.rb | 21 +++++++++++++++------ spec/wombat_spec.rb | 2 ++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/lib/wombat.rb b/lib/wombat.rb index 59fc2eb..f9544b4 100644 --- a/lib/wombat.rb +++ b/lib/wombat.rb @@ -5,7 +5,7 @@ module Wombat class << self - attr_reader :proxy_args, :user_agent, :user_agent_alias + attr_reader :proxy_args, :user_agent, :user_agent_alias, :cookies def crawl(&block) klass = Class.new @@ -29,6 +29,10 @@ def set_user_agent_alias(user_agent_alias) @user_agent_alias = user_agent_alias end + def set_cookies(cookies) + @cookies = cookies + end + alias_method :scrape, :crawl end end diff --git a/lib/wombat/processing/parser.rb b/lib/wombat/processing/parser.rb index b80f23f..eb46074 100644 --- a/lib/wombat/processing/parser.rb +++ b/lib/wombat/processing/parser.rb @@ -3,6 +3,7 @@ require 'wombat/processing/node_selector' require 'mechanize' require 'restclient' +require 'uri' module Nokogiri module XML @@ -42,28 +43,36 @@ def initialize def parse(metadata, url=nil) @context = parser_for(metadata, url) - + return nil if @context.nil? Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize) end private + def parser_for(metadata, url) url ||= "#{metadata[:base_url]}#{metadata[:path]}" - page = nil - parser = nil - _method = method_from(metadata[:http_method]) + return if url.nil? + method = method_from(metadata[:http_method]) data = metadata[:data] args = [url, data].compact begin @page = metadata[:page] if metadata[:document_format] == :html - @page = @mechanize.public_send(_method, *args) unless @page + unless (Wombat.cookies.nil? || Wombat.cookies.empty?) 
+ Wombat.cookies.each do |k, v| + cookie = Mechanize::Cookie.new(k.to_s, v.to_s) + cookie.domain = URI.parse(url).host + cookie.path = '/' + @mechanize.cookie_jar << cookie + end + end + @page = @mechanize.public_send(method, *args) unless @page parser = @page.parser # Nokogiri::HTML::Document parser.mechanize_page = @page # Mechanize::Page parser.headers = @page.header else - @page = RestClient.public_send(_method, *args) unless @page + @page = RestClient.public_send(method, *args) unless @page parser = Nokogiri::XML @page parser.headers = @page.headers end diff --git a/spec/wombat_spec.rb b/spec/wombat_spec.rb index 646c40b..bbe05e6 100644 --- a/spec/wombat_spec.rb +++ b/spec/wombat_spec.rb @@ -22,10 +22,12 @@ config.set_proxy "10.0.0.1", 8080 config.set_user_agent "Wombat" config.set_user_agent_alias 'Mac Safari' + config.set_cookies expired: Time.now end Wombat.proxy_args.should == ["10.0.0.1", 8080] Wombat.user_agent.should == 'Wombat' Wombat.user_agent_alias.should == 'Mac Safari' + Wombat.cookies.keys.should == [:expired] end it 'should accept regular properties (non-selectors)' do From 05a17858c438654b3851669b94b2ce589a3b5285 Mon Sep 17 00:00:00 2001 From: watsy0007 Date: Fri, 21 Oct 2016 13:59:30 +0800 Subject: [PATCH 2/5] crawl instance proxy & user agent support --- lib/wombat.rb | 6 +---- lib/wombat/crawler.rb | 6 ++--- lib/wombat/processing/parser.rb | 40 +++++++++++++++++++++------------ spec/crawler_spec.rb | 9 ++++---- spec/wombat_spec.rb | 2 -- 5 files changed, 35 insertions(+), 28 deletions(-) diff --git a/lib/wombat.rb b/lib/wombat.rb index f9544b4..59fc2eb 100644 --- a/lib/wombat.rb +++ b/lib/wombat.rb @@ -5,7 +5,7 @@ module Wombat class << self - attr_reader :proxy_args, :user_agent, :user_agent_alias, :cookies + attr_reader :proxy_args, :user_agent, :user_agent_alias def crawl(&block) klass = Class.new @@ -29,10 +29,6 @@ def set_user_agent_alias(user_agent_alias) @user_agent_alias = user_agent_alias end - def set_cookies(cookies) - @cookies = 
cookies - end - alias_method :scrape, :crawl end end diff --git a/lib/wombat/crawler.rb b/lib/wombat/crawler.rb index 4d154d5..5a16f0c 100644 --- a/lib/wombat/crawler.rb +++ b/lib/wombat/crawler.rb @@ -17,7 +17,7 @@ class << self self.metadata = DSL::Metadata.new end - def crawl(url = nil, &block) + def crawl(url = nil, options = {}, &block) if block @metadata_dup = self.class.send(:metadata).clone instance_eval do @@ -27,14 +27,14 @@ def method_missing method, *args, &block end end self.instance_eval &block - parsed = parse(@metadata_dup, url) + parsed = parse(@metadata_dup, url, options) instance_eval do alias :method_missing :old_method_missing remove_instance_variable :@metadata_dup end parsed else - parse(self.class.send(:metadata), url) + parse(self.class.send(:metadata), url, options) end end diff --git a/lib/wombat/processing/parser.rb b/lib/wombat/processing/parser.rb index eb46074..d4bb423 100644 --- a/lib/wombat/processing/parser.rb +++ b/lib/wombat/processing/parser.rb @@ -41,15 +41,21 @@ def initialize @mechanize.user_agent_alias = Wombat.user_agent_alias if Wombat.user_agent_alias end - def parse(metadata, url=nil) - @context = parser_for(metadata, url) + def parse(metadata, url = nil, options = {}) + unless options.empty? + options = options.reduce({}).each do |memo, (k, v)| + memo[k.to_sym] = v + memo + end + end + @context = parser_for(metadata, url, options) return nil if @context.nil? Wombat::Property::Locators::Factory.locator_for(metadata).locate(@context, @mechanize) end private - def parser_for(metadata, url) + def parser_for(metadata, url, options = {}) url ||= "#{metadata[:base_url]}#{metadata[:path]}" return if url.nil? method = method_from(metadata[:http_method]) @@ -59,14 +65,10 @@ def parser_for(metadata, url) @page = metadata[:page] if metadata[:document_format] == :html - unless (Wombat.cookies.nil? || Wombat.cookies.empty?) 
- Wombat.cookies.each do |k, v| - cookie = Mechanize::Cookie.new(k.to_s, v.to_s) - cookie.domain = URI.parse(url).host - cookie.path = '/' - @mechanize.cookie_jar << cookie - end - end + @mechanize.set_proxy(*options[:proxy_args]) if options[:proxy_args] + @mechanize.user_agent = options[:user_agent] if options[:user_agent] + @mechanize.user_agent_alias = options[:user_agent_alias] if options[:user_agent_alias] + update_cookies(url, options[:cookies]) if options[:cookies] @page = @mechanize.public_send(method, *args) unless @page parser = @page.parser # Nokogiri::HTML::Document parser.mechanize_page = @page # Mechanize::Page @@ -88,9 +90,19 @@ def parser_for(metadata, url) end end - def method_from(_method) - return :get if _method.nil? - HTTP_METHODS.detect(->{:get}){ |i| i == _method.downcase.to_sym } + def update_cookies(url, cookies) + domain = URI.parse(url).host + cookies.each do |k, v| + cookie = Mechanize::Cookie.new(k.to_s, v.to_s) + cookie.domain = domain + cookie.path = '/' + @mechanize.cookie_jar << cookie + end + end + + def method_from(method) + return :get if method.nil? 
+ HTTP_METHODS.detect(-> {:get}){ |i| i == method.downcase.to_sym } end end end diff --git a/spec/crawler_spec.rb b/spec/crawler_spec.rb index 7d6d6ee..14a1bfe 100644 --- a/spec/crawler_spec.rb +++ b/spec/crawler_spec.rb @@ -134,14 +134,15 @@ it 'should crawl with url and block' do url = 'http://danielinc.com/itens' - expect(@crawler_instance).to receive(:parse).with(anything, url) - @crawler_instance.crawl(url) do + opts = {cookies: :cookies} + expect(@crawler_instance).to receive(:parse).with(anything, url, opts) + @crawler_instance.crawl(url, opts) do end another_instance = @crawler.new - expect(another_instance).to receive(:parse).with(anything, url) + expect(another_instance).to receive(:parse).with(anything, url, opts) - another_instance.crawl(url) + another_instance.crawl(url, opts) end it 'should remove created method missing' do diff --git a/spec/wombat_spec.rb b/spec/wombat_spec.rb index bbe05e6..646c40b 100644 --- a/spec/wombat_spec.rb +++ b/spec/wombat_spec.rb @@ -22,12 +22,10 @@ config.set_proxy "10.0.0.1", 8080 config.set_user_agent "Wombat" config.set_user_agent_alias 'Mac Safari' - config.set_cookies expired: Time.now end Wombat.proxy_args.should == ["10.0.0.1", 8080] Wombat.user_agent.should == 'Wombat' Wombat.user_agent_alias.should == 'Mac Safari' - Wombat.cookies.keys.should == [:expired] end it 'should accept regular properties (non-selectors)' do From 899e3f50f1992f9e461799e17a5ad91cf5104ee8 Mon Sep 17 00:00:00 2001 From: watsy0007 Date: Fri, 21 Oct 2016 15:23:15 +0800 Subject: [PATCH 3/5] fixbug [parser] fix options format bug --- lib/wombat/processing/parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/wombat/processing/parser.rb b/lib/wombat/processing/parser.rb index d4bb423..7e3c92c 100644 --- a/lib/wombat/processing/parser.rb +++ b/lib/wombat/processing/parser.rb @@ -44,7 +44,7 @@ def initialize def parse(metadata, url = nil, options = {}) unless options.empty? 
options = options.reduce({}).each do |memo, (k, v)| - memo[k.to_sym] = v + memo[k.to_sym] = v.to_s memo end end From 5d7b899bb1f5b5147e59b0fcde1cc8d85598ca40 Mon Sep 17 00:00:00 2001 From: watsy0007 Date: Fri, 21 Oct 2016 15:27:18 +0800 Subject: [PATCH 4/5] improvement [parser] options --- lib/wombat/processing/parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/wombat/processing/parser.rb b/lib/wombat/processing/parser.rb index 7e3c92c..392e60e 100644 --- a/lib/wombat/processing/parser.rb +++ b/lib/wombat/processing/parser.rb @@ -44,7 +44,7 @@ def initialize def parse(metadata, url = nil, options = {}) unless options.empty? options = options.reduce({}).each do |memo, (k, v)| - memo[k.to_sym] = v.to_s + memo[k.to_sym] = v.to_s unless v.nil? memo end end From 663c351d3a248b6f8bb0195a4acc6a86e3c76809 Mon Sep 17 00:00:00 2001 From: watsy0007 Date: Fri, 21 Oct 2016 15:42:58 +0800 Subject: [PATCH 5/5] fixbug [parser] options --- lib/wombat/processing/parser.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/wombat/processing/parser.rb b/lib/wombat/processing/parser.rb index 392e60e..03611d8 100644 --- a/lib/wombat/processing/parser.rb +++ b/lib/wombat/processing/parser.rb @@ -43,9 +43,8 @@ def initialize def parse(metadata, url = nil, options = {}) unless options.empty? - options = options.reduce({}).each do |memo, (k, v)| + options = options.each_with_object({}) do |(k, v), memo| memo[k.to_sym] = v.to_s unless v.nil? - memo end end @context = parser_for(metadata, url, options)