diff --git a/app/models/searchgov_url.rb b/app/models/searchgov_url.rb index c68692d126..5b1e2e0903 100644 --- a/app/models/searchgov_url.rb +++ b/app/models/searchgov_url.rb @@ -14,7 +14,11 @@ class SearchgovUrl < ActiveRecord::Base application/vnd.openxmlformats-officedocument.spreadsheetml.sheet ] - attr_accessible :last_crawl_status, :last_crawled_at, :url, :lastmod + attr_accessible :last_crawl_status, + :last_crawled_at, + :url, + :lastmod, + :enqueued_for_reindex attr_reader :response, :document, :tempfile attr_readonly :url @@ -35,14 +39,18 @@ class SearchgovUrl < ActiveRecord::Base column_name: proc {|url| !url.fetched? ? 'unfetched_urls_count' : nil }, column_names: { ['searchgov_urls.last_crawled_at IS NULL'] => 'unfetched_urls_count' } - scope :fetch_required, -> { where('last_crawled_at IS NULL OR lastmod > last_crawled_at') } + scope :fetch_required, -> do + where('last_crawled_at IS NULL + OR lastmod > last_crawled_at + OR enqueued_for_reindex') + end class SearchgovUrlError < StandardError; end class DomainError < StandardError; end def fetch raise DomainError.new("#{searchgov_domain.domain}: #{searchgov_domain.status}") if !searchgov_domain.available? 
- self.update_attributes(last_crawled_at: Time.now) + update(last_crawled_at: Time.now, enqueued_for_reindex: false) self.load_time = Benchmark.realtime do DocumentFetchLogger.new(url, 'searchgov_url').log begin diff --git a/spec/fixtures/searchgov_urls.yml b/spec/fixtures/searchgov_urls.yml new file mode 100644 index 0000000000..fa0fef99d0 --- /dev/null +++ b/spec/fixtures/searchgov_urls.yml @@ -0,0 +1,18 @@ +new: + url: http://www.agency.gov/new + +outdated: + url: http://www.agency.gov/outdated + last_crawled_at: <%= 1.week.ago.to_s(:db) %> + lastmod: <%= 1.day.ago.to_s(:db) %> + +current: + url: http://www.agency.gov/current + last_crawled_at: <%= 1.day.ago.to_s(:db) %> + lastmod: <%= 1.week.ago.to_s(:db) %> + +enqueued: + url: http://www.agency.gov/enqueued + last_crawled_at: <%= 1.day.ago.to_s(:db) %> + lastmod: <%= 1.week.ago.to_s(:db) %> + enqueued_for_reindex: true diff --git a/spec/jobs/searchgov_domain_destroyer_job_spec.rb b/spec/jobs/searchgov_domain_destroyer_job_spec.rb index eb18c8d143..d7c8a2a7ba 100644 --- a/spec/jobs/searchgov_domain_destroyer_job_spec.rb +++ b/spec/jobs/searchgov_domain_destroyer_job_spec.rb @@ -28,7 +28,7 @@ let!(:searchgov_url2) { SearchgovUrl.create!(url: url2) } it 'destroys the searchgov_urls' do - expect{ perform }.to change{ SearchgovUrl.count }.from(2).to(0) + expect { perform }.to change{ SearchgovUrl.count }.by(-2) end end end diff --git a/spec/lib/tasks/searchgov_spec.rb b/spec/lib/tasks/searchgov_spec.rb index 663262fb4c..c55a7cf186 100644 --- a/spec/lib/tasks/searchgov_spec.rb +++ b/spec/lib/tasks/searchgov_spec.rb @@ -77,7 +77,7 @@ it 'indexes new urls' do allow(I14yDocument).to receive(:promote). 
with(handle: 'searchgov', document_id: doc_id, bool: 'true').at_least(:once) - expect{ promote_urls }.to change{ SearchgovUrl.count }.from(0).to(1) + expect { promote_urls }.to change{ SearchgovUrl.count }.by(1) end it 'creates new urls' do diff --git a/spec/models/searchgov_url_spec.rb b/spec/models/searchgov_url_spec.rb index 3dbe2c6216..5a3213571d 100644 --- a/spec/models/searchgov_url_spec.rb +++ b/spec/models/searchgov_url_spec.rb @@ -1,6 +1,8 @@ require 'spec_helper' describe SearchgovUrl do + fixtures :searchgov_urls + let(:url) { 'http://www.agency.gov/boring.html' } let(:html) { read_fixture_file("/html/page_with_og_metadata.html") } let(:valid_attributes) { { url: url } } @@ -22,19 +24,20 @@ describe 'scopes' do describe '.fetch_required' do - before do - SearchgovUrl.create!(url: 'http://www.agency.gov/new') - SearchgovUrl.create!( - url: 'http://www.agency.gov/outdated', last_crawled_at: 1.week.ago, lastmod: 1.day.ago - ) - SearchgovUrl.create!( - url: 'http://www.agency.gov/current', last_crawled_at: 1.day.ago, lastmod: 1.week.ago - ) - end it 'includes urls that have never been crawled and outdated urls' do expect(SearchgovUrl.fetch_required.pluck(:url)). - to eq %w[http://www.agency.gov/new http://www.agency.gov/outdated] + to include('http://www.agency.gov/new', 'http://www.agency.gov/outdated') + end + + it 'does not include current, crawled and not enqueued urls' do + expect(SearchgovUrl.fetch_required.pluck(:url)). + not_to include('http://www.agency.gov/current') + end + + it 'includes urls that have been enqueued for reindexing' do + expect(SearchgovUrl.fetch_required.pluck(:url)). + to include 'http://www.agency.gov/enqueued' end end end @@ -43,7 +46,8 @@ it 'requires a valid domain' do searchgov_url = SearchgovUrl.new(url: 'https://foo/bar') expect(searchgov_url).not_to be_valid - expect(searchgov_url.errors.messages[:searchgov_domain]).to include 'is invalid' + expect(searchgov_url.errors.messages[:searchgov_domain]). 
+ to include 'is invalid' end describe 'validating url uniqueness' do @@ -75,7 +79,7 @@ end it 'deletes the Searchgov Url' do - expect{ searchgov_url.destroy }.to change{ SearchgovUrl.count }.from(1).to(0) + expect { searchgov_url.destroy }.to change{ SearchgovUrl.count }.by(-1) end end end @@ -128,6 +132,17 @@ fetch end + context 'when the record is enqueued for reindex' do + let(:searchgov_url) do + SearchgovUrl.create!(valid_attributes.merge(enqueued_for_reindex: true)) + end + + it 'sets enqueued_for_reindex to false' do + expect { fetch }.to change{ searchgov_url.enqueued_for_reindex }. + from(true).to(false) + end + end + context 'when the record includes a lastmod value' do let(:valid_attributes) { { url: url, lastmod: '2018-01-01' } } diff --git a/spec/models/sitemap_indexer_spec.rb b/spec/models/sitemap_indexer_spec.rb index 6f0fb9d1b7..b2c522b996 100644 --- a/spec/models/sitemap_indexer_spec.rb +++ b/spec/models/sitemap_indexer_spec.rb @@ -24,7 +24,7 @@ subject(:index) { indexer.index } it 'creates searchgov urls' do - expect{ index }.to change{SearchgovUrl.count}.from(0).to(1) + expect { index }.to change{ SearchgovUrl.count }.by(1) end it 'updates the counter cache columns' do @@ -115,7 +115,7 @@ let(:sitemap_entries) { "\n \n http://agency.gov/doc1 \n \n " } it 'creates a searchgov_url record' do - expect{index}.to change{SearchgovUrl.count}.from(0).to(1) + expect { index }.to change{ SearchgovUrl.count }.by(1) end end @@ -143,7 +143,7 @@ it 'ignores them' do index - expect(SearchgovUrl.pluck(:url)).to eq ['http://agency.gov/doc1'] + expect(SearchgovUrl.pluck(:url)).not_to include 'http://other.gov/doc1' end end diff --git a/spec/support/fetchable_behavior.rb b/spec/support/fetchable_behavior.rb index b5f064577a..d821684c7b 100644 --- a/spec/support/fetchable_behavior.rb +++ b/spec/support/fetchable_behavior.rb @@ -30,34 +30,52 @@ describe 'scopes' do context 'by last_crawl_status or last_crawled_at' do before do - 
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok', last_crawl_status: 'OK', last_crawled_at: 1.day.ago)) - described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed', last_crawl_status: 'failed', last_crawled_at: 1.day.ago)) - described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched', last_crawl_status: nil, last_crawled_at: nil)) + described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok', + last_crawl_status: 'OK', + last_crawled_at: 1.day.ago)) + described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed', + last_crawl_status: 'failed', + last_crawled_at: 1.day.ago)) + described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched', + last_crawl_status: nil, + last_crawled_at: nil)) end describe '.fetched' do it 'includes successfully and unsuccessfully fetched records' do expect(described_class.fetched.pluck(:url)). - to match_array %w[http://agency.gov/ok http://agency.gov/failed] + to include('http://agency.gov/ok', 'http://agency.gov/failed') + end + + it 'does not include unfetched records' do + expect(described_class.fetched.pluck(:url)). + not_to include 'http://agency.gov/unfetched' end end describe '.unfetched' do it 'includes unfetched records' do - expect(described_class.unfetched.pluck(:url)).to eq ['http://agency.gov/unfetched'] + expect(described_class.unfetched.pluck(:url)). + to include 'http://agency.gov/unfetched' + end + + it 'does not include fetched records' do + expect(described_class.unfetched.pluck(:url)). + not_to include 'http://agency.gov/ok' end end describe '.ok' do it 'includes successfully fetched records' do - expect(described_class.ok.pluck(:url)).to match_array ['http://agency.gov/ok'] + expect(described_class.ok.pluck(:url)). + to match_array ['http://agency.gov/ok'] end end describe '.not_ok' do it 'includes failed or unfetched records' do expect(described_class.not_ok.pluck(:url)). 
- to match_array %w[http://agency.gov/unfetched http://agency.gov/failed] + to include('http://agency.gov/unfetched', 'http://agency.gov/failed') end end end @@ -76,28 +94,32 @@ context "when an URL contains an anchor tag" do let(:url) { "http://www.nps.gov/sdfsdf#anchorme" } it "should remove it" do - expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/sdfsdf") + expect(described_class.create!(valid_attributes.merge(url: url)).url). + to eq("http://www.nps.gov/sdfsdf") end end context "when URL is mixed case" do let(:url) { "HTTP://Www.nps.GOV/UsaGovLovesToCapitalize" } it "should downcase the scheme and host only" do - expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/UsaGovLovesToCapitalize") + expect(described_class.create!(valid_attributes.merge(url: url)).url). + to eq("http://www.nps.gov/UsaGovLovesToCapitalize") end end context "when URL is missing trailing slash for a scheme+host URL" do let(:url) { "http://www.nps.gov" } it "should append a /" do - expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/") + expect(described_class.create!(valid_attributes.merge(url: url)).url). + to eq("http://www.nps.gov/") end end context "when URL contains duplicate leading slashes in request" do let(:url) { "http://www.nps.gov//hey/I/am/usagov/and/love/extra////slashes.shtml" } it "should collapse the slashes" do - expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml") + expect(described_class.create!(valid_attributes.merge(url: url)).url). 
+ to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml") end end diff --git a/spec/vcr_cassettes/jobs/search_options_.yml b/spec/vcr_cassettes/jobs/search_options_.yml index 6fbefe6338..330acc4522 100644 --- a/spec/vcr_cassettes/jobs/search_options_.yml +++ b/spec/vcr_cassettes/jobs/search_options_.yml @@ -1222,9 +1222,9 @@ http_interactions: string: '' headers: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + - "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: @@ -1877,9 +1877,9 @@ http_interactions: string: '' headers: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + - "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: @@ -3061,7 +3061,7 @@ http_interactions: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + - "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: @@ -3111,9 +3111,9 @@ http_interactions: string: '' headers: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + - "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: @@ -3163,9 +3163,9 @@ http_interactions: string: '' headers: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + - "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: @@ -4345,9 +4345,9 @@ http_interactions: string: '' headers: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + - "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: @@ -4975,9 +4975,9 @@ http_interactions: string: '' headers: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + 
- "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: @@ -5229,9 +5229,9 @@ http_interactions: string: '' headers: Authorization-Key: - - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0= + - "" User-Agent: - - parissa.eggleston@gmail.com + - "" Accept-Encoding: - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 Accept: diff --git a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml index d82b48d27a..9c1e7c6425 100644 --- a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml +++ b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml @@ -3466,14 +3466,14 @@ http_interactions: Location: - https://www.consumerfinance.gov/robots.txt Date: - - Wed, 31 Oct 2018 01:09:10 GMT + - Fri, 16 Nov 2018 22:42:39 GMT Connection: - keep-alive body: encoding: UTF-8 string: '' http_version: - recorded_at: Wed, 31 Oct 2018 01:09:10 GMT + recorded_at: Fri, 16 Nov 2018 22:42:39 GMT - request: method: get uri: https://www.consumerfinance.gov/robots.txt @@ -3495,7 +3495,7 @@ http_interactions: Content-Type: - text/plain; charset=UTF-8 Etag: - - '"125-5795e2c925641"' + - '"125-57abc510cd176"' Server: - Apache X-Frame-Options: @@ -3505,13 +3505,13 @@ http_interactions: X-Content-Type-Options: - nosniff Last-Modified: - - Mon, 29 Oct 2018 13:37:45 GMT + - Thu, 15 Nov 2018 23:21:55 GMT Access-Control-Allow-Origin: - https://www.consumerfinance.gov Vary: - Accept-Encoding Date: - - Wed, 31 Oct 2018 01:09:10 GMT + - Fri, 16 Nov 2018 22:42:39 GMT Content-Length: - '293' Connection: @@ -3532,5 +3532,5 @@ http_interactions: sitemap: https://www.consumerfinance.gov/sitemap.xml http_version: - recorded_at: Wed, 31 Oct 2018 01:09:10 GMT + recorded_at: Fri, 16 Nov 2018 22:42:39 GMT recorded_with: VCR 4.0.0 diff --git a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml 
b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml index 8f2b9312d0..828c0aec34 100644 --- a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml +++ b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml @@ -3466,14 +3466,14 @@ http_interactions: Location: - https://www.consumerfinance.gov/robots.txt Date: - - Wed, 31 Oct 2018 01:09:10 GMT + - Fri, 16 Nov 2018 22:42:40 GMT Connection: - keep-alive body: encoding: UTF-8 string: '' http_version: - recorded_at: Wed, 31 Oct 2018 01:09:10 GMT + recorded_at: Fri, 16 Nov 2018 22:42:40 GMT - request: method: get uri: https://www.consumerfinance.gov/robots.txt @@ -3495,7 +3495,7 @@ http_interactions: Content-Type: - text/plain; charset=UTF-8 Etag: - - '"125-5795e2c925641"' + - '"125-57abc510cd176"' Server: - Apache X-Frame-Options: @@ -3505,13 +3505,13 @@ http_interactions: X-Content-Type-Options: - nosniff Last-Modified: - - Mon, 29 Oct 2018 13:37:45 GMT + - Thu, 15 Nov 2018 23:21:55 GMT Access-Control-Allow-Origin: - https://www.consumerfinance.gov Vary: - Accept-Encoding Date: - - Wed, 31 Oct 2018 01:09:10 GMT + - Fri, 16 Nov 2018 22:42:40 GMT Content-Length: - '293' Connection: @@ -3532,5 +3532,5 @@ http_interactions: sitemap: https://www.consumerfinance.gov/sitemap.xml http_version: - recorded_at: Wed, 31 Oct 2018 01:09:10 GMT + recorded_at: Fri, 16 Nov 2018 22:42:40 GMT recorded_with: VCR 4.0.0 diff --git a/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml b/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml index d884b1735d..a74f0eb406 100644 --- a/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml +++ b/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml @@ -4995,14 +4995,14 @@ http_interactions: Location: - https://www.consumerfinance.gov/robots.txt Date: - - Wed, 31 Oct 2018 01:09:12 GMT + - Fri, 16 Nov 2018 22:42:40 GMT Connection: - keep-alive body: 
encoding: UTF-8 string: '' http_version: - recorded_at: Wed, 31 Oct 2018 01:09:11 GMT + recorded_at: Fri, 16 Nov 2018 22:42:40 GMT - request: method: get uri: https://www.consumerfinance.gov/robots.txt @@ -5024,7 +5024,7 @@ http_interactions: Content-Type: - text/plain; charset=UTF-8 Etag: - - '"125-5795e2c925641"' + - '"125-57abc510cd176"' Server: - Apache X-Frame-Options: @@ -5034,13 +5034,13 @@ http_interactions: X-Content-Type-Options: - nosniff Last-Modified: - - Mon, 29 Oct 2018 13:37:45 GMT + - Thu, 15 Nov 2018 23:21:55 GMT Access-Control-Allow-Origin: - https://www.consumerfinance.gov Vary: - Accept-Encoding Date: - - Wed, 31 Oct 2018 01:09:12 GMT + - Fri, 16 Nov 2018 22:42:40 GMT Content-Length: - '293' Connection: @@ -5061,5 +5061,5 @@ http_interactions: sitemap: https://www.consumerfinance.gov/sitemap.xml http_version: - recorded_at: Wed, 31 Oct 2018 01:09:12 GMT + recorded_at: Fri, 16 Nov 2018 22:42:40 GMT recorded_with: VCR 4.0.0