Skip to content

Commit

Permalink
SRCH-1511 SRCH-1560 improve error handling in SitemapIndexer (#603)
Browse files Browse the repository at this point in the history
- move observers into relevant directories
  • Loading branch information
MothOnMars authored Jul 24, 2020
1 parent b9d85f1 commit 20e1646
Show file tree
Hide file tree
Showing 15 changed files with 85 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3880,7 +3880,7 @@ Style/RescueModifier:

Style/RescueStandardError:
Description: 'Avoid rescuing without specifying an error class.'
Enabled: true
Enabled: false
VersionAdded: 0.52
EnforcedStyle: explicit
# implicit: Do not include the error class, `rescue`
Expand Down
20 changes: 16 additions & 4 deletions app/models/sitemap_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def initialize(sitemap_url:)
end

def index
sitemaps_stream.any? ? enqueue_sitemaps : process_entries
sitemap_index? ? enqueue_sitemaps : process_entries
end

private
Expand All @@ -24,6 +24,13 @@ def sitemaps_stream
within('sitemapindex').for_tag('sitemap')
end

# True when the fetched XML is a sitemap *index* (a <sitemapindex> containing
# <sitemap> entries pointing at child sitemaps) rather than a plain <urlset>.
# Returns false if the parser chokes on malformed XML, so the caller falls
# through to process_entries and salvages whatever <url> entries it can.
def sitemap_index?
  sitemaps_stream.any?
rescue Saxerator::ParseException
  # Rescue & move on, in case we can process any URLs before the parser barfs
  false
end

def sitemap_entries_stream
@sitemap_entries_stream ||= Saxerator.parser(sitemap).
within('urlset').for_tag('url')
Expand All @@ -40,8 +47,10 @@ def process_entries
sitemap_entries_stream.each do |entry|
process_entry(entry) if entry_matches_domain?(entry)
end
searchgov_domain.index_urls
rescue => e
Rails.logger.error("Error processing sitemap entries for #{uri}: #{e}")
ensure
searchgov_domain.index_urls
set_counter_callbacks
update_counter_caches
end
Expand All @@ -61,7 +70,10 @@ def entry_matches_domain?(entry)
# strictly adhering to the sitemap protocol,
# but matching the domain should suffice for now.
# https://www.pivotaltracker.com/story/show/157485118
URI(entry['loc'].strip).host == domain
url = entry['loc'].strip
URI(url).host == domain
rescue URI::InvalidURIError
Rails.logger.error("Error processing sitemap entry. Invalid URL: #{url}")
end

def log_info
Expand All @@ -75,7 +87,7 @@ def log_info
def sitemap
@sitemap ||= begin
HTTP.headers(user_agent: DEFAULT_USER_AGENT).
timeout(connect: 20, read: 60).follow.get(uri).to_s.freeze
timeout(connect: 20, read: 60).follow.get(uri).to_s
rescue => e
error_info = log_info.merge(error: e.message)
log_line = "[Searchgov SitemapIndexer] #{error_info.to_json}"
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
43 changes: 43 additions & 0 deletions spec/models/sitemap_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -182,5 +182,48 @@
index
end
end

# Coverage for the malformed-XML handling added in this commit: a parse
# failure partway through a sitemap must not abort indexing entirely.
context 'when the XML is poorly formatted' do
  # Fixture: the first <url> entry is well-formed; the second has a
  # mismatched closing tag (</bad_tag>), which should make the streaming
  # parser fail after the good entry has already been yielded.
  # NOTE(review): the stray trailing apostrophes appear deliberate junk in
  # the fixture — confirm they are intended.
  let(:sitemap_entries) do
    <<~SITEMAP_ENTRIES
      <url><loc>http://agency.gov/good</loc></url>'
      <url><loc>http://agency.gov/bad</loc></bad_tag>'
    SITEMAP_ENTRIES
  end

  it 'does not raise an error' do
    expect{ index }.not_to raise_error
  end

  # Entries streamed before the parse failure should still be recorded.
  it 'processes as many entries as possible' do
    index
    expect(SearchgovUrl.find_by(url: 'http://agency.gov/good')).not_to be_nil
  end

  it 'logs the error' do
    expect(Rails.logger).to receive(:error).with(/Missing end tag for 'url'/)
    index
  end

  # Domain indexing should still be kicked off even though entry
  # processing hit an error (the indexer's ensure/rescue path).
  it 'kicks off indexing' do
    allow(SearchgovDomain).to receive(:find_by).
      with(domain: 'agency.gov').and_return(searchgov_domain)
    expect(searchgov_domain).to receive(:index_urls)
    index
  end
end

# A single unparseable <loc> URL (spaces and parens make URI() raise
# URI::InvalidURIError) should be skipped and logged, not crash the run.
context 'when a sitemap contains an invalid URL' do
  let(:sitemap_entries) { '<url><loc>http://agency.gov/doc (1).pdf</loc></url>' }

  it 'does not raise an error' do
    expect{ indexer.index }.not_to raise_error
  end

  it 'logs the error' do
    expect(Rails.logger).to receive(:error).with(/Invalid URL/)
    index
  end
end
end
end
52 changes: 25 additions & 27 deletions spec/models/sitemap_spec.rb
Original file line number Diff line number Diff line change
@@ -1,43 +1,41 @@
require 'spec_helper'

describe Sitemap do
pending('until the sitemapindexer gem file is updated for rails 5 ') do
let(:url) { 'http://agency.gov/sitemap.xml' }
let(:valid_attributes) { { url: url } }
let(:url) { 'http://agency.gov/sitemap.xml' }
let(:valid_attributes) { { url: url } }

it { is_expected.to have_readonly_attribute(:url) }
it { is_expected.to have_readonly_attribute(:url) }

describe 'schema' do
it do
is_expected.to have_db_column(:url).of_type(:string).
with_options(null: false, limit: 2000)
end
describe 'schema' do
it do
is_expected.to have_db_column(:url).of_type(:string).
with_options(null: false, limit: 2000)
end
end

describe 'validations' do
context 'when validating url uniqueness' do
let!(:original) { Sitemap.create!(valid_attributes) }
describe 'validations' do
context 'when validating url uniqueness' do
let!(:original) { Sitemap.create!(valid_attributes) }

it 'rejects duplicate urls' do
expect(Sitemap.new(valid_attributes)).to_not be_valid
end
it 'rejects duplicate urls' do
expect(Sitemap.new(valid_attributes)).to_not be_valid
end

it 'is not case-sensitive' do
expect(Sitemap.create!(url: url.upcase)).to be_valid
end
it 'is not case-sensitive' do
expect(Sitemap.create!(url: url.upcase)).to be_valid
end
end
end

describe 'lifecycle' do
describe 'on create' do
it 'is automatically indexed' do
expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url)
Sitemap.create!(url: url)
end
describe 'lifecycle' do
describe 'on create' do
it 'is automatically indexed' do
expect(SitemapIndexerJob).to receive(:perform_later).with(sitemap_url: url)
Sitemap.create!(url: url)
end
end

it_should_behave_like 'a record with a fetchable url'
it_should_behave_like 'a record that belongs to a searchgov_domain'
end

it_should_behave_like 'a record with a fetchable url'
it_should_behave_like 'a record that belongs to a searchgov_domain'
end
File renamed without changes.

0 comments on commit 20e1646

Please sign in to comment.