Commit
SRCH-156 stub www.agency.gov/robots.txt response (#164)
- log warning if entire site is disallowed
MothOnMars authored Nov 7, 2018
1 parent 65c96f4 commit 6ac0283
Showing 2 changed files with 22 additions and 6 deletions.
6 changes: 6 additions & 0 deletions app/models/searchgov_crawler.rb
@@ -21,6 +21,8 @@ def initialize(domain:, skip_query_strings: true, delay: nil, srsly: false)
   end
 
   def crawl
+    Rails.logger.warn(disallowed_warning) unless robotex.allowed? base_url
+
     begin
       Medusa.crawl(base_url, @medusa_opts) do |medusa|
         medusa.skip_links_like(skip_extensions_regex)
@@ -117,4 +119,8 @@ def initialize_url_file
   def current_url(page)
     page.redirect_to || page.url
   end
+
+  def disallowed_warning
+    "Warning: #{domain} is not crawlable based on the rules in its robots.txt."
+  end
 end
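
The allowed? check above presumably comes from the robotex gem (the library commonly paired with Medusa for honoring robots.txt). A minimal sketch of the behavior being relied on, assuming a Robotex instance built with the crawler's user agent; the 'usasearch' agent string below is illustrative only:

require 'robotex'

# Robotex fetches http://www.agency.gov/robots.txt and applies its rules for the
# given user agent. With "User-agent: *\nDisallow: /" every path is off-limits,
# so the crawl method logs the warning before handing off to Medusa.
robotex = Robotex.new('usasearch')         # illustrative user-agent string
robotex.allowed?('http://www.agency.gov/') # => false when the whole site is disallowed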
22 changes: 16 additions & 6 deletions spec/models/searchgov_crawler_spec.rb
@@ -19,10 +19,15 @@
       </html>
     HTML
   end
+  let(:robots_txt) { '' }
 
   before do
     stub_request(:get, base_url).
       to_return(status: 200, body: html, headers: { content_type: 'text/html' })
+    stub_request(:get, "#{base_url}robots.txt").
+      to_return(status: [200, 'OK'],
+                headers: { content_type: 'text/plain' },
+                body: robots_txt)
   end
 
   describe '.crawl' do
@@ -49,12 +54,7 @@
     describe 'options' do
       describe 'crawl delay' do
         context 'when a crawl delay is specified in robots.txt' do
-          before do
-            stub_request(:get, 'http://www.agency.gov/robots.txt').
-              to_return(status: [200, 'OK'],
-                        headers: { content_type: 'text/plain' },
-                        body: "User-agent: *\nCrawl-delay: 10")
-          end
+          let(:robots_txt) { "User-agent: *\nCrawl-delay: 10" }
 
           it 'sets the specified delay' do
             Medusa.should_receive(:crawl).
@@ -315,6 +315,16 @@
         end
       end
     end
+
+    context 'when robots.txt disallows everything' do
+      let(:robots_txt) { "User-agent: *\nDisallow: /" }
+
+      it 'logs a warning' do
+        message = 'Warning: www.agency.gov is not crawlable based on the rules in its robots.txt.'
+        expect(Rails.logger).to receive(:warn).with message
+        crawl
+      end
+    end
   end
 
   # Testing this private method directly for test speed
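
A hedged usage sketch of the new behavior (not part of this diff), assuming the class name SearchgovCrawler implied by app/models/searchgov_crawler.rb and the keyword arguments from the initializer shown above; the check only logs, so the crawl still proceeds:

# Given that http://www.agency.gov/robots.txt contains:
#   User-agent: *
#   Disallow: /
crawler = SearchgovCrawler.new(domain: 'www.agency.gov')
crawler.crawl
# Logs: "Warning: www.agency.gov is not crawlable based on the rules in its robots.txt."
# ...and then continues into Medusa.crawl as before.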
