diff --git a/.rubocop.yml b/.rubocop.yml index f93c20c873..7f943a5e30 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -3880,7 +3880,7 @@ Style/RescueModifier: Style/RescueStandardError: Description: 'Avoid rescuing without specifying an error class.' - Enabled: true + Enabled: false VersionAdded: 0.52 EnforcedStyle: explicit # implicit: Do not include the error class, `rescue` diff --git a/app/models/sitemap_indexer.rb b/app/models/sitemap_indexer.rb index 4a1f620ffb..b56dfd14fa 100644 --- a/app/models/sitemap_indexer.rb +++ b/app/models/sitemap_indexer.rb @@ -14,7 +14,7 @@ def initialize(sitemap_url:) end def index - sitemaps_stream.any? ? enqueue_sitemaps : process_entries + sitemap_index? ? enqueue_sitemaps : process_entries end private @@ -24,6 +24,13 @@ def sitemaps_stream within('sitemapindex').for_tag('sitemap') end + def sitemap_index? + sitemaps_stream.any? + rescue Saxerator::ParseException + # Rescue & move on, in case we can process any URLs before the parser barfs + false + end + def sitemap_entries_stream @sitemap_entries_stream ||= Saxerator.parser(sitemap). within('urlset').for_tag('url') @@ -40,8 +47,10 @@ def process_entries sitemap_entries_stream.each do |entry| process_entry(entry) if entry_matches_domain?(entry) end - searchgov_domain.index_urls + rescue => e + Rails.logger.error("Error processing sitemap entries for #{uri}: #{e}") ensure + searchgov_domain.index_urls set_counter_callbacks update_counter_caches end @@ -61,7 +70,10 @@ def entry_matches_domain?(entry) # strictly adhering to the sitemap protocol, # but matching the domain should suffice for now. # https://www.pivotaltracker.com/story/show/157485118 - URI(entry['loc'].strip).host == domain + url = entry['loc'].strip + URI(url).host == domain + rescue URI::InvalidURIError + Rails.logger.error("Error processing sitemap entry. Invalid URL: #{url}") end def log_info @@ -75,7 +87,7 @@ def log_info def sitemap @sitemap ||= begin HTTP.headers(user_agent: DEFAULT_USER_AGENT). - timeout(connect: 20, read: 60).follow.get(uri).to_s.freeze + timeout(connect: 20, read: 60).follow.get(uri).to_s rescue => e error_info = log_info.merge(error: e.message) log_line = "[Searchgov SitemapIndexer] #{error_info.to_json}" diff --git a/app/models/affiliate_observer.rb b/app/observers/affiliate_observer.rb similarity index 100% rename from app/models/affiliate_observer.rb rename to app/observers/affiliate_observer.rb diff --git a/app/models/i14y_drawer_observer.rb b/app/observers/i14y_drawer_observer.rb similarity index 100% rename from app/models/i14y_drawer_observer.rb rename to app/observers/i14y_drawer_observer.rb diff --git a/app/models/indexed_document_observer.rb b/app/observers/indexed_document_observer.rb similarity index 100% rename from app/models/indexed_document_observer.rb rename to app/observers/indexed_document_observer.rb diff --git a/app/models/misspelling_observer.rb b/app/observers/misspelling_observer.rb similarity index 100% rename from app/models/misspelling_observer.rb rename to app/observers/misspelling_observer.rb diff --git a/app/models/navigable_observer.rb b/app/observers/navigable_observer.rb similarity index 100% rename from app/models/navigable_observer.rb rename to app/observers/navigable_observer.rb diff --git a/app/models/routed_query_keyword_observer.rb b/app/observers/routed_query_keyword_observer.rb similarity index 100% rename from app/models/routed_query_keyword_observer.rb rename to app/observers/routed_query_keyword_observer.rb diff --git a/app/models/rss_feed_url_observer.rb b/app/observers/rss_feed_url_observer.rb similarity index 100% rename from app/models/rss_feed_url_observer.rb rename to app/observers/rss_feed_url_observer.rb diff --git a/app/models/sayt_filter_observer.rb b/app/observers/sayt_filter_observer.rb similarity index 100% rename from app/models/sayt_filter_observer.rb rename to app/observers/sayt_filter_observer.rb diff --git a/app/models/searchable_observer.rb b/app/observers/searchable_observer.rb similarity index 100% rename from app/models/searchable_observer.rb rename to app/observers/searchable_observer.rb diff --git a/app/models/watcher_observer.rb b/app/observers/watcher_observer.rb similarity index 100% rename from app/models/watcher_observer.rb rename to app/observers/watcher_observer.rb diff --git a/spec/models/sitemap_indexer_spec.rb b/spec/models/sitemap_indexer_spec.rb index 1dedfef3d1..dceabc8c5a 100644 --- a/spec/models/sitemap_indexer_spec.rb +++ b/spec/models/sitemap_indexer_spec.rb @@ -182,5 +182,48 @@ index end end + + context 'when the XML is poorly formatted' do + let(:sitemap_entries) do + <<~SITEMAP_ENTRIES + http://agency.gov/good' + http://agency.gov/bad' + SITEMAP_ENTRIES + end + + it 'does not raise an error' do + expect{ index }.not_to raise_error + end + + it 'processes as many entries as possible' do + index + expect(SearchgovUrl.find_by(url: 'http://agency.gov/good')).not_to be_nil + end + + it 'logs the error' do + expect(Rails.logger).to receive(:error).with(/Missing end tag for 'url'/) + index + end + + it 'kicks off indexing' do + allow(SearchgovDomain).to receive(:find_by). + with(domain: 'agency.gov').and_return(searchgov_domain) + expect(searchgov_domain).to receive(:index_urls) + index + end + end + + context 'when a sitemap contains an invalid URL' do + let(:sitemap_entries) { 'http://agency.gov/doc (1).pdf' } + + it 'does not raise an error' do + expect{ indexer.index }.not_to raise_error + end + + it 'logs the error' do + expect(Rails.logger).to receive(:error).with(/Invalid URL/) + index + end + end end end diff --git a/spec/models/sitemap_spec.rb b/spec/models/sitemap_spec.rb index 877366995a..4d90f80b73 100644 --- a/spec/models/sitemap_spec.rb +++ b/spec/models/sitemap_spec.rb @@ -1,43 +1,41 @@ require 'spec_helper' describe Sitemap do - pending('until the sitemapindexer gem file is updated for rails 5 ') do - let(:url) { 'http://agency.gov/sitemap.xml' } - let(:valid_attributes) { { url: url } } + let(:url) { 'http://agency.gov/sitemap.xml' } + let(:valid_attributes) { { url: url } } - it { is_expected.to have_readonly_attribute(:url) } + it { is_expected.to have_readonly_attribute(:url) } - describe 'schema' do - it do - is_expected.to have_db_column(:url).of_type(:string). - with_options(null: false, limit: 2000) - end + describe 'schema' do + it do + is_expected.to have_db_column(:url).of_type(:string). + with_options(null: false, limit: 2000) end + end - describe 'validations' do - context 'when validating url uniqueness' do - let!(:original) { Sitemap.create!(valid_attributes) } + describe 'validations' do + context 'when validating url uniqueness' do + let!(:original) { Sitemap.create!(valid_attributes) } - it 'rejects duplicate urls' do - expect(Sitemap.new(valid_attributes)).to_not be_valid - end + it 'rejects duplicate urls' do + expect(Sitemap.new(valid_attributes)).to_not be_valid + end - it 'is not case-sensitive' do - expect(Sitemap.create!(url: url.upcase)).to be_valid - end + it 'is not case-sensitive' do + expect(Sitemap.create!(url: url.upcase)).to be_valid end end + end - describe 'lifecycle' do - describe 'on create' do - it 'is automatically indexed' do - expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url) - Sitemap.create!(url: url) - end + describe 'lifecycle' do + describe 'on create' do + it 'is automatically indexed' do + expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url) + Sitemap.create!(url: url) end end - - it_should_behave_like 'a record with a fetchable url' - it_should_behave_like 'a record that belongs to a searchgov_domain' end + + it_should_behave_like 'a record with a fetchable url' + it_should_behave_like 'a record that belongs to a searchgov_domain' end diff --git a/spec/models/watcher_observer_spec.rb b/spec/observers/watcher_observer_spec.rb similarity index 100% rename from spec/models/watcher_observer_spec.rb rename to spec/observers/watcher_observer_spec.rb