diff --git a/.rubocop.yml b/.rubocop.yml
index f93c20c873..7f943a5e30 100644
--- a/.rubocop.yml
+++ b/.rubocop.yml
@@ -3880,7 +3880,7 @@ Style/RescueModifier:
Style/RescueStandardError:
Description: 'Avoid rescuing without specifying an error class.'
- Enabled: true
+ Enabled: false
VersionAdded: 0.52
EnforcedStyle: explicit
# implicit: Do not include the error class, `rescue`
diff --git a/app/models/sitemap_indexer.rb b/app/models/sitemap_indexer.rb
index 4a1f620ffb..b56dfd14fa 100644
--- a/app/models/sitemap_indexer.rb
+++ b/app/models/sitemap_indexer.rb
@@ -14,7 +14,7 @@ def initialize(sitemap_url:)
end
def index
- sitemaps_stream.any? ? enqueue_sitemaps : process_entries
+ sitemap_index? ? enqueue_sitemaps : process_entries
end
private
@@ -24,6 +24,13 @@ def sitemaps_stream
within('sitemapindex').for_tag('sitemap')
end
+ def sitemap_index?
+ sitemaps_stream.any?
+ rescue Saxerator::ParseException
+ # Rescue & move on, in case we can process any URLs before the parser fails
+ false
+ end
+
def sitemap_entries_stream
@sitemap_entries_stream ||= Saxerator.parser(sitemap).
within('urlset').for_tag('url')
@@ -40,8 +47,10 @@ def process_entries
sitemap_entries_stream.each do |entry|
process_entry(entry) if entry_matches_domain?(entry)
end
- searchgov_domain.index_urls
+ rescue => e
+ Rails.logger.error("Error processing sitemap entries for #{uri}: #{e}")
ensure
+ searchgov_domain.index_urls
set_counter_callbacks
update_counter_caches
end
@@ -61,7 +70,10 @@ def entry_matches_domain?(entry)
# strictly adhering to the sitemap protocol,
# but matching the domain should suffice for now.
# https://www.pivotaltracker.com/story/show/157485118
- URI(entry['loc'].strip).host == domain
+ url = entry['loc'].strip
+ URI(url).host == domain
+ rescue URI::InvalidURIError
+ Rails.logger.error("Error processing sitemap entry. Invalid URL: #{url}")
end
def log_info
@@ -75,7 +87,7 @@ def log_info
def sitemap
@sitemap ||= begin
HTTP.headers(user_agent: DEFAULT_USER_AGENT).
- timeout(connect: 20, read: 60).follow.get(uri).to_s.freeze
+ timeout(connect: 20, read: 60).follow.get(uri).to_s
rescue => e
error_info = log_info.merge(error: e.message)
log_line = "[Searchgov SitemapIndexer] #{error_info.to_json}"
diff --git a/app/models/affiliate_observer.rb b/app/observers/affiliate_observer.rb
similarity index 100%
rename from app/models/affiliate_observer.rb
rename to app/observers/affiliate_observer.rb
diff --git a/app/models/i14y_drawer_observer.rb b/app/observers/i14y_drawer_observer.rb
similarity index 100%
rename from app/models/i14y_drawer_observer.rb
rename to app/observers/i14y_drawer_observer.rb
diff --git a/app/models/indexed_document_observer.rb b/app/observers/indexed_document_observer.rb
similarity index 100%
rename from app/models/indexed_document_observer.rb
rename to app/observers/indexed_document_observer.rb
diff --git a/app/models/misspelling_observer.rb b/app/observers/misspelling_observer.rb
similarity index 100%
rename from app/models/misspelling_observer.rb
rename to app/observers/misspelling_observer.rb
diff --git a/app/models/navigable_observer.rb b/app/observers/navigable_observer.rb
similarity index 100%
rename from app/models/navigable_observer.rb
rename to app/observers/navigable_observer.rb
diff --git a/app/models/routed_query_keyword_observer.rb b/app/observers/routed_query_keyword_observer.rb
similarity index 100%
rename from app/models/routed_query_keyword_observer.rb
rename to app/observers/routed_query_keyword_observer.rb
diff --git a/app/models/rss_feed_url_observer.rb b/app/observers/rss_feed_url_observer.rb
similarity index 100%
rename from app/models/rss_feed_url_observer.rb
rename to app/observers/rss_feed_url_observer.rb
diff --git a/app/models/sayt_filter_observer.rb b/app/observers/sayt_filter_observer.rb
similarity index 100%
rename from app/models/sayt_filter_observer.rb
rename to app/observers/sayt_filter_observer.rb
diff --git a/app/models/searchable_observer.rb b/app/observers/searchable_observer.rb
similarity index 100%
rename from app/models/searchable_observer.rb
rename to app/observers/searchable_observer.rb
diff --git a/app/models/watcher_observer.rb b/app/observers/watcher_observer.rb
similarity index 100%
rename from app/models/watcher_observer.rb
rename to app/observers/watcher_observer.rb
diff --git a/spec/models/sitemap_indexer_spec.rb b/spec/models/sitemap_indexer_spec.rb
index 1dedfef3d1..dceabc8c5a 100644
--- a/spec/models/sitemap_indexer_spec.rb
+++ b/spec/models/sitemap_indexer_spec.rb
@@ -182,5 +182,48 @@
index
end
end
+
+ context 'when the XML is poorly formatted' do
+ let(:sitemap_entries) do
+ <<~SITEMAP_ENTRIES
+ <url><loc>http://agency.gov/good</loc></url>
+ <url><loc>http://agency.gov/bad</loc>
+ SITEMAP_ENTRIES
+ end
+
+ it 'does not raise an error' do
+ expect{ index }.not_to raise_error
+ end
+
+ it 'processes as many entries as possible' do
+ index
+ expect(SearchgovUrl.find_by(url: 'http://agency.gov/good')).not_to be_nil
+ end
+
+ it 'logs the error' do
+ expect(Rails.logger).to receive(:error).with(/Missing end tag for 'url'/)
+ index
+ end
+
+ it 'kicks off indexing' do
+ allow(SearchgovDomain).to receive(:find_by).
+ with(domain: 'agency.gov').and_return(searchgov_domain)
+ expect(searchgov_domain).to receive(:index_urls)
+ index
+ end
+ end
+
+ context 'when a sitemap contains an invalid URL' do
+ let(:sitemap_entries) { '<url><loc>http://agency.gov/doc (1).pdf</loc></url>' }
+
+ it 'does not raise an error' do
+ expect{ indexer.index }.not_to raise_error
+ end
+
+ it 'logs the error' do
+ expect(Rails.logger).to receive(:error).with(/Invalid URL/)
+ index
+ end
+ end
end
end
diff --git a/spec/models/sitemap_spec.rb b/spec/models/sitemap_spec.rb
index 877366995a..4d90f80b73 100644
--- a/spec/models/sitemap_spec.rb
+++ b/spec/models/sitemap_spec.rb
@@ -1,43 +1,41 @@
require 'spec_helper'
describe Sitemap do
- pending('until the sitemapindexer gem file is updated for rails 5 ') do
- let(:url) { 'http://agency.gov/sitemap.xml' }
- let(:valid_attributes) { { url: url } }
+ let(:url) { 'http://agency.gov/sitemap.xml' }
+ let(:valid_attributes) { { url: url } }
- it { is_expected.to have_readonly_attribute(:url) }
+ it { is_expected.to have_readonly_attribute(:url) }
- describe 'schema' do
- it do
- is_expected.to have_db_column(:url).of_type(:string).
- with_options(null: false, limit: 2000)
- end
+ describe 'schema' do
+ it do
+ is_expected.to have_db_column(:url).of_type(:string).
+ with_options(null: false, limit: 2000)
end
+ end
- describe 'validations' do
- context 'when validating url uniqueness' do
- let!(:original) { Sitemap.create!(valid_attributes) }
+ describe 'validations' do
+ context 'when validating url uniqueness' do
+ let!(:original) { Sitemap.create!(valid_attributes) }
- it 'rejects duplicate urls' do
- expect(Sitemap.new(valid_attributes)).to_not be_valid
- end
+ it 'rejects duplicate urls' do
+ expect(Sitemap.new(valid_attributes)).to_not be_valid
+ end
- it 'is not case-sensitive' do
- expect(Sitemap.create!(url: url.upcase)).to be_valid
- end
+ it 'is not case-sensitive' do
+ expect(Sitemap.create!(url: url.upcase)).to be_valid
end
end
+ end
- describe 'lifecycle' do
- describe 'on create' do
- it 'is automatically indexed' do
- expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url)
- Sitemap.create!(url: url)
- end
+ describe 'lifecycle' do
+ describe 'on create' do
+ it 'is automatically indexed' do
+ expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url)
+ Sitemap.create!(url: url)
end
end
-
- it_should_behave_like 'a record with a fetchable url'
- it_should_behave_like 'a record that belongs to a searchgov_domain'
end
+
+ it_should_behave_like 'a record with a fetchable url'
+ it_should_behave_like 'a record that belongs to a searchgov_domain'
end
diff --git a/spec/models/watcher_observer_spec.rb b/spec/observers/watcher_observer_spec.rb
similarity index 100%
rename from spec/models/watcher_observer_spec.rb
rename to spec/observers/watcher_observer_spec.rb