Skip to content

Commit

Permalink
SRCH-1511 SRCH-1560 improve error handling in SitemapIndexer (#603)
Browse files Browse the repository at this point in the history
- move observers into relevant directories
  • Loading branch information
MothOnMars authored Jul 24, 2020
1 parent b9d85f1 commit 20e1646
Show file tree
Hide file tree
Showing 15 changed files with 85 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3880,7 +3880,7 @@ Style/RescueModifier:

Style/RescueStandardError:
Description: 'Avoid rescuing without specifying an error class.'
Enabled: true
Enabled: false
VersionAdded: 0.52
EnforcedStyle: explicit
# implicit: Do not include the error class, `rescue`
Expand Down
20 changes: 16 additions & 4 deletions app/models/sitemap_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def initialize(sitemap_url:)
end

def index
sitemaps_stream.any? ? enqueue_sitemaps : process_entries
sitemap_index? ? enqueue_sitemaps : process_entries
end

private
Expand All @@ -24,6 +24,13 @@ def sitemaps_stream
within('sitemapindex').for_tag('sitemap')
end

# True when the fetched XML is a sitemap *index* (a <sitemapindex> containing
# <sitemap> entries pointing at child sitemaps) rather than a plain <urlset>.
# Returns false if the parser chokes on malformed XML, so the caller falls
# through to process_entries and salvages whatever <url> entries it can.
def sitemap_index?
  sitemaps_stream.any?
rescue Saxerator::ParseException
  # Rescue & move on, in case we can process any URLs before the parser barfs
  false
end

def sitemap_entries_stream
@sitemap_entries_stream ||= Saxerator.parser(sitemap).
within('urlset').for_tag('url')
Expand All @@ -40,8 +47,10 @@ def process_entries
sitemap_entries_stream.each do |entry|
process_entry(entry) if entry_matches_domain?(entry)
end
searchgov_domain.index_urls
rescue => e
Rails.logger.error("Error processing sitemap entries for #{uri}: #{e}")
ensure
searchgov_domain.index_urls
set_counter_callbacks
update_counter_caches
end
Expand All @@ -61,7 +70,10 @@ def entry_matches_domain?(entry)
# strictly adhering to the sitemap protocol,
# but matching the domain should suffice for now.
# https://www.pivotaltracker.com/story/show/157485118
URI(entry['loc'].strip).host == domain
url = entry['loc'].strip
URI(url).host == domain
rescue URI::InvalidURIError
Rails.logger.error("Error processing sitemap entry. Invalid URL: #{url}")
end

def log_info
Expand All @@ -75,7 +87,7 @@ def log_info
def sitemap
@sitemap ||= begin
HTTP.headers(user_agent: DEFAULT_USER_AGENT).
timeout(connect: 20, read: 60).follow.get(uri).to_s.freeze
timeout(connect: 20, read: 60).follow.get(uri).to_s
rescue => e
error_info = log_info.merge(error: e.message)
log_line = "[Searchgov SitemapIndexer] #{error_info.to_json}"
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
43 changes: 43 additions & 0 deletions spec/models/sitemap_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -182,5 +182,48 @@
index
end
end

# Coverage for the malformed-XML handling added in this commit: a parse
# failure partway through a sitemap must not abort indexing entirely.
context 'when the XML is poorly formatted' do
  # Fixture: the first <url> entry is well-formed; the second has a
  # mismatched closing tag (</bad_tag>), which should make the streaming
  # parser fail after the good entry has already been yielded.
  # NOTE(review): the stray trailing apostrophes appear deliberate junk in
  # the fixture — confirm they are intended.
  let(:sitemap_entries) do
    <<~SITEMAP_ENTRIES
      <url><loc>http://agency.gov/good</loc></url>'
      <url><loc>http://agency.gov/bad</loc></bad_tag>'
    SITEMAP_ENTRIES
  end

  it 'does not raise an error' do
    expect{ index }.not_to raise_error
  end

  # Entries streamed before the parse failure should still be recorded.
  it 'processes as many entries as possible' do
    index
    expect(SearchgovUrl.find_by(url: 'http://agency.gov/good')).not_to be_nil
  end

  it 'logs the error' do
    expect(Rails.logger).to receive(:error).with(/Missing end tag for 'url'/)
    index
  end

  # Domain indexing should still be kicked off even though entry
  # processing hit an error (the indexer's ensure/rescue path).
  it 'kicks off indexing' do
    allow(SearchgovDomain).to receive(:find_by).
      with(domain: 'agency.gov').and_return(searchgov_domain)
    expect(searchgov_domain).to receive(:index_urls)
    index
  end
end

# A single unparseable <loc> URL (spaces and parens make URI() raise
# URI::InvalidURIError) should be skipped and logged, not crash the run.
context 'when a sitemap contains an invalid URL' do
  let(:sitemap_entries) { '<url><loc>http://agency.gov/doc (1).pdf</loc></url>' }

  it 'does not raise an error' do
    expect{ indexer.index }.not_to raise_error
  end

  it 'logs the error' do
    expect(Rails.logger).to receive(:error).with(/Invalid URL/)
    index
  end
end
end
end
52 changes: 25 additions & 27 deletions spec/models/sitemap_spec.rb
Original file line number Diff line number Diff line change
@@ -1,43 +1,41 @@
require 'spec_helper'

describe Sitemap do
pending('until the sitemapindexer gem file is updated for rails 5 ') do
let(:url) { 'http://agency.gov/sitemap.xml' }
let(:valid_attributes) { { url: url } }
let(:url) { 'http://agency.gov/sitemap.xml' }
let(:valid_attributes) { { url: url } }

it { is_expected.to have_readonly_attribute(:url) }
it { is_expected.to have_readonly_attribute(:url) }

describe 'schema' do
it do
is_expected.to have_db_column(:url).of_type(:string).
with_options(null: false, limit: 2000)
end
describe 'schema' do
it do
is_expected.to have_db_column(:url).of_type(:string).
with_options(null: false, limit: 2000)
end
end

describe 'validations' do
context 'when validating url uniqueness' do
let!(:original) { Sitemap.create!(valid_attributes) }
describe 'validations' do
context 'when validating url uniqueness' do
let!(:original) { Sitemap.create!(valid_attributes) }

it 'rejects duplicate urls' do
expect(Sitemap.new(valid_attributes)).to_not be_valid
end
it 'rejects duplicate urls' do
expect(Sitemap.new(valid_attributes)).to_not be_valid
end

it 'is not case-sensitive' do
expect(Sitemap.create!(url: url.upcase)).to be_valid
end
it 'is not case-sensitive' do
expect(Sitemap.create!(url: url.upcase)).to be_valid
end
end
end

describe 'lifecycle' do
describe 'on create' do
it 'is automatically indexed' do
expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url)
Sitemap.create!(url: url)
end
describe 'lifecycle' do
describe 'on create' do
it 'is automatically indexed' do
expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url)
Sitemap.create!(url: url)
end
end

it_should_behave_like 'a record with a fetchable url'
it_should_behave_like 'a record that belongs to a searchgov_domain'
end

it_should_behave_like 'a record with a fetchable url'
it_should_behave_like 'a record that belongs to a searchgov_domain'
end
File renamed without changes.

0 comments on commit 20e1646

Please sign in to comment.