diff --git a/app/models/searchgov_url.rb b/app/models/searchgov_url.rb
index c68692d126..5b1e2e0903 100644
--- a/app/models/searchgov_url.rb
+++ b/app/models/searchgov_url.rb
@@ -14,7 +14,11 @@ class SearchgovUrl < ActiveRecord::Base
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
]
- attr_accessible :last_crawl_status, :last_crawled_at, :url, :lastmod
+ attr_accessible :last_crawl_status,
+ :last_crawled_at,
+ :url,
+ :lastmod,
+ :enqueued_for_reindex
attr_reader :response, :document, :tempfile
attr_readonly :url
@@ -35,14 +39,18 @@ class SearchgovUrl < ActiveRecord::Base
column_name: proc {|url| !url.fetched? ? 'unfetched_urls_count' : nil },
column_names: { ['searchgov_urls.last_crawled_at IS NULL'] => 'unfetched_urls_count' }
- scope :fetch_required, -> { where('last_crawled_at IS NULL OR lastmod > last_crawled_at') }
+ scope :fetch_required, -> do
+ where('last_crawled_at IS NULL
+ OR lastmod > last_crawled_at
+ OR enqueued_for_reindex')
+ end
class SearchgovUrlError < StandardError; end
class DomainError < StandardError; end
def fetch
raise DomainError.new("#{searchgov_domain.domain}: #{searchgov_domain.status}") if !searchgov_domain.available?
- self.update_attributes(last_crawled_at: Time.now)
+ update(last_crawled_at: Time.now, enqueued_for_reindex: false)
self.load_time = Benchmark.realtime do
DocumentFetchLogger.new(url, 'searchgov_url').log
begin
diff --git a/spec/fixtures/searchgov_urls.yml b/spec/fixtures/searchgov_urls.yml
new file mode 100644
index 0000000000..fa0fef99d0
--- /dev/null
+++ b/spec/fixtures/searchgov_urls.yml
@@ -0,0 +1,18 @@
+new:
+ url: http://www.agency.gov/new
+
+outdated:
+ url: http://www.agency.gov/outdated
+ last_crawled_at: <%= 1.week.ago.to_s(:db) %>
+ lastmod: <%= 1.day.ago.to_s(:db) %>
+
+current:
+ url: http://www.agency.gov/current
+ last_crawled_at: <%= 1.day.ago.to_s(:db) %>
+ lastmod: <%= 1.week.ago.to_s(:db) %>
+
+enqueued:
+ url: http://www.agency.gov/enqueued
+ last_crawled_at: <%= 1.day.ago.to_s(:db) %>
+ lastmod: <%= 1.week.ago.to_s(:db) %>
+ enqueued_for_reindex: true
diff --git a/spec/jobs/searchgov_domain_destroyer_job_spec.rb b/spec/jobs/searchgov_domain_destroyer_job_spec.rb
index eb18c8d143..d7c8a2a7ba 100644
--- a/spec/jobs/searchgov_domain_destroyer_job_spec.rb
+++ b/spec/jobs/searchgov_domain_destroyer_job_spec.rb
@@ -28,7 +28,7 @@
let!(:searchgov_url2) { SearchgovUrl.create!(url: url2) }
it 'destroys the searchgov_urls' do
- expect{ perform }.to change{ SearchgovUrl.count }.from(2).to(0)
+ expect { perform }.to change { SearchgovUrl.count }.by(-2)
end
end
end
diff --git a/spec/lib/tasks/searchgov_spec.rb b/spec/lib/tasks/searchgov_spec.rb
index 663262fb4c..c55a7cf186 100644
--- a/spec/lib/tasks/searchgov_spec.rb
+++ b/spec/lib/tasks/searchgov_spec.rb
@@ -77,7 +77,7 @@
it 'indexes new urls' do
allow(I14yDocument).to receive(:promote).
with(handle: 'searchgov', document_id: doc_id, bool: 'true').at_least(:once)
- expect{ promote_urls }.to change{ SearchgovUrl.count }.from(0).to(1)
+ expect { promote_urls }.to change { SearchgovUrl.count }.by(1)
end
it 'creates new urls' do
diff --git a/spec/models/searchgov_url_spec.rb b/spec/models/searchgov_url_spec.rb
index 3dbe2c6216..5a3213571d 100644
--- a/spec/models/searchgov_url_spec.rb
+++ b/spec/models/searchgov_url_spec.rb
@@ -1,6 +1,8 @@
require 'spec_helper'
describe SearchgovUrl do
+ fixtures :searchgov_urls
+
let(:url) { 'http://www.agency.gov/boring.html' }
let(:html) { read_fixture_file("/html/page_with_og_metadata.html") }
let(:valid_attributes) { { url: url } }
@@ -22,19 +24,20 @@
describe 'scopes' do
describe '.fetch_required' do
- before do
- SearchgovUrl.create!(url: 'http://www.agency.gov/new')
- SearchgovUrl.create!(
- url: 'http://www.agency.gov/outdated', last_crawled_at: 1.week.ago, lastmod: 1.day.ago
- )
- SearchgovUrl.create!(
- url: 'http://www.agency.gov/current', last_crawled_at: 1.day.ago, lastmod: 1.week.ago
- )
- end
it 'includes urls that have never been crawled and outdated urls' do
expect(SearchgovUrl.fetch_required.pluck(:url)).
- to eq %w[http://www.agency.gov/new http://www.agency.gov/outdated]
+ to include('http://www.agency.gov/new', 'http://www.agency.gov/outdated')
+ end
+
+ it 'does not include current, crawled and not enqueued urls' do
+ expect(SearchgovUrl.fetch_required.pluck(:url)).
+ not_to include('http://www.agency.gov/current')
+ end
+
+ it 'includes urls that have been enqueued for reindexing' do
+ expect(SearchgovUrl.fetch_required.pluck(:url)).
+ to include 'http://www.agency.gov/enqueued'
end
end
end
@@ -43,7 +46,8 @@
it 'requires a valid domain' do
searchgov_url = SearchgovUrl.new(url: 'https://foo/bar')
expect(searchgov_url).not_to be_valid
- expect(searchgov_url.errors.messages[:searchgov_domain]).to include 'is invalid'
+ expect(searchgov_url.errors.messages[:searchgov_domain]).
+ to include 'is invalid'
end
describe 'validating url uniqueness' do
@@ -75,7 +79,7 @@
end
it 'deletes the Searchgov Url' do
- expect{ searchgov_url.destroy }.to change{ SearchgovUrl.count }.from(1).to(0)
+ expect { searchgov_url.destroy }.to change { SearchgovUrl.count }.by(-1)
end
end
end
@@ -128,6 +132,17 @@
fetch
end
+ context 'when the record is enqueued for reindex' do
+ let(:searchgov_url) do
+ SearchgovUrl.create!(valid_attributes.merge(enqueued_for_reindex: true))
+ end
+
+ it 'sets enqueued_for_reindex to false' do
+ expect { fetch }.to change { searchgov_url.enqueued_for_reindex }.
+ from(true).to(false)
+ end
+ end
+
context 'when the record includes a lastmod value' do
let(:valid_attributes) { { url: url, lastmod: '2018-01-01' } }
diff --git a/spec/models/sitemap_indexer_spec.rb b/spec/models/sitemap_indexer_spec.rb
index 6f0fb9d1b7..b2c522b996 100644
--- a/spec/models/sitemap_indexer_spec.rb
+++ b/spec/models/sitemap_indexer_spec.rb
@@ -24,7 +24,7 @@
subject(:index) { indexer.index }
it 'creates searchgov urls' do
- expect{ index }.to change{SearchgovUrl.count}.from(0).to(1)
+ expect { index }.to change { SearchgovUrl.count }.by(1)
end
it 'updates the counter cache columns' do
@@ -115,7 +115,7 @@
let(:sitemap_entries) { "\n \n http://agency.gov/doc1 \n \n " }
it 'creates a searchgov_url record' do
- expect{index}.to change{SearchgovUrl.count}.from(0).to(1)
+ expect { index }.to change { SearchgovUrl.count }.by(1)
end
end
@@ -143,7 +143,8 @@
it 'ignores them' do
index
- expect(SearchgovUrl.pluck(:url)).to eq ['http://agency.gov/doc1']
+ expect(SearchgovUrl.pluck(:url)).to include 'http://agency.gov/doc1'
+ expect(SearchgovUrl.pluck(:url)).not_to include 'http://other.gov/doc1'
end
end
diff --git a/spec/support/fetchable_behavior.rb b/spec/support/fetchable_behavior.rb
index b5f064577a..d821684c7b 100644
--- a/spec/support/fetchable_behavior.rb
+++ b/spec/support/fetchable_behavior.rb
@@ -30,34 +30,52 @@
describe 'scopes' do
context 'by last_crawl_status or last_crawled_at' do
before do
- described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok', last_crawl_status: 'OK', last_crawled_at: 1.day.ago))
- described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed', last_crawl_status: 'failed', last_crawled_at: 1.day.ago))
- described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched', last_crawl_status: nil, last_crawled_at: nil))
+ described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok',
+ last_crawl_status: 'OK',
+ last_crawled_at: 1.day.ago))
+ described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed',
+ last_crawl_status: 'failed',
+ last_crawled_at: 1.day.ago))
+ described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched',
+ last_crawl_status: nil,
+ last_crawled_at: nil))
end
describe '.fetched' do
it 'includes successfully and unsuccessfully fetched records' do
expect(described_class.fetched.pluck(:url)).
- to match_array %w[http://agency.gov/ok http://agency.gov/failed]
+ to include('http://agency.gov/ok', 'http://agency.gov/failed')
+ end
+
+ it 'does not include unfetched records' do
+ expect(described_class.fetched.pluck(:url)).
+ not_to include 'http://agency.gov/unfetched'
end
end
describe '.unfetched' do
it 'includes unfetched records' do
- expect(described_class.unfetched.pluck(:url)).to eq ['http://agency.gov/unfetched']
+ expect(described_class.unfetched.pluck(:url)).
+ to include 'http://agency.gov/unfetched'
+ end
+
+ it 'does not include fetched records' do
+ expect(described_class.unfetched.pluck(:url)).
+ not_to include('http://agency.gov/ok', 'http://agency.gov/failed')
+ end
end
describe '.ok' do
it 'includes successfully fetched records' do
- expect(described_class.ok.pluck(:url)).to match_array ['http://agency.gov/ok']
+ expect(described_class.ok.pluck(:url)).
+ to match_array ['http://agency.gov/ok']
end
end
describe '.not_ok' do
it 'includes failed or unfetched records' do
expect(described_class.not_ok.pluck(:url)).
- to match_array %w[http://agency.gov/unfetched http://agency.gov/failed]
+ to include('http://agency.gov/unfetched', 'http://agency.gov/failed')
end
end
end
@@ -76,28 +94,32 @@
context "when an URL contains an anchor tag" do
let(:url) { "http://www.nps.gov/sdfsdf#anchorme" }
it "should remove it" do
- expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/sdfsdf")
+ expect(described_class.create!(valid_attributes.merge(url: url)).url).
+ to eq("http://www.nps.gov/sdfsdf")
end
end
context "when URL is mixed case" do
let(:url) { "HTTP://Www.nps.GOV/UsaGovLovesToCapitalize" }
it "should downcase the scheme and host only" do
- expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
+ expect(described_class.create!(valid_attributes.merge(url: url)).url).
+ to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
end
end
context "when URL is missing trailing slash for a scheme+host URL" do
let(:url) { "http://www.nps.gov" }
it "should append a /" do
- expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/")
+ expect(described_class.create!(valid_attributes.merge(url: url)).url).
+ to eq("http://www.nps.gov/")
end
end
context "when URL contains duplicate leading slashes in request" do
let(:url) { "http://www.nps.gov//hey/I/am/usagov/and/love/extra////slashes.shtml" }
it "should collapse the slashes" do
- expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
+ expect(described_class.create!(valid_attributes.merge(url: url)).url).
+ to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
end
end
diff --git a/spec/vcr_cassettes/jobs/search_options_.yml b/spec/vcr_cassettes/jobs/search_options_.yml
index 6fbefe6338..330acc4522 100644
--- a/spec/vcr_cassettes/jobs/search_options_.yml
+++ b/spec/vcr_cassettes/jobs/search_options_.yml
@@ -1222,9 +1222,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -1877,9 +1877,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -3061,7 +3061,7 @@ http_interactions:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -3111,9 +3111,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -3163,9 +3163,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -4345,9 +4345,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -4975,9 +4975,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
@@ -5229,9 +5229,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- - Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
+ - ""
User-Agent:
- - parissa.eggleston@gmail.com
+ - ""
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
diff --git a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml
index d82b48d27a..9c1e7c6425 100644
--- a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml
+++ b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index.yml
@@ -3466,14 +3466,14 @@ http_interactions:
Location:
- https://www.consumerfinance.gov/robots.txt
Date:
- - Wed, 31 Oct 2018 01:09:10 GMT
+ - Fri, 16 Nov 2018 22:42:39 GMT
Connection:
- keep-alive
body:
encoding: UTF-8
string: ''
http_version:
- recorded_at: Wed, 31 Oct 2018 01:09:10 GMT
+ recorded_at: Fri, 16 Nov 2018 22:42:39 GMT
- request:
method: get
uri: https://www.consumerfinance.gov/robots.txt
@@ -3495,7 +3495,7 @@ http_interactions:
Content-Type:
- text/plain; charset=UTF-8
Etag:
- - '"125-5795e2c925641"'
+ - '"125-57abc510cd176"'
Server:
- Apache
X-Frame-Options:
@@ -3505,13 +3505,13 @@ http_interactions:
X-Content-Type-Options:
- nosniff
Last-Modified:
- - Mon, 29 Oct 2018 13:37:45 GMT
+ - Thu, 15 Nov 2018 23:21:55 GMT
Access-Control-Allow-Origin:
- https://www.consumerfinance.gov
Vary:
- Accept-Encoding
Date:
- - Wed, 31 Oct 2018 01:09:10 GMT
+ - Fri, 16 Nov 2018 22:42:39 GMT
Content-Length:
- '293'
Connection:
@@ -3532,5 +3532,5 @@ http_interactions:
sitemap: https://www.consumerfinance.gov/sitemap.xml
http_version:
- recorded_at: Wed, 31 Oct 2018 01:09:10 GMT
+ recorded_at: Fri, 16 Nov 2018 22:42:39 GMT
recorded_with: VCR 4.0.0
diff --git a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml
index 8f2b9312d0..828c0aec34 100644
--- a/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml
+++ b/spec/vcr_cassettes/search/gov/tasks_searchgov_bulk_index_when_a_url_has_already_been_indexed.yml
@@ -3466,14 +3466,14 @@ http_interactions:
Location:
- https://www.consumerfinance.gov/robots.txt
Date:
- - Wed, 31 Oct 2018 01:09:10 GMT
+ - Fri, 16 Nov 2018 22:42:40 GMT
Connection:
- keep-alive
body:
encoding: UTF-8
string: ''
http_version:
- recorded_at: Wed, 31 Oct 2018 01:09:10 GMT
+ recorded_at: Fri, 16 Nov 2018 22:42:40 GMT
- request:
method: get
uri: https://www.consumerfinance.gov/robots.txt
@@ -3495,7 +3495,7 @@ http_interactions:
Content-Type:
- text/plain; charset=UTF-8
Etag:
- - '"125-5795e2c925641"'
+ - '"125-57abc510cd176"'
Server:
- Apache
X-Frame-Options:
@@ -3505,13 +3505,13 @@ http_interactions:
X-Content-Type-Options:
- nosniff
Last-Modified:
- - Mon, 29 Oct 2018 13:37:45 GMT
+ - Thu, 15 Nov 2018 23:21:55 GMT
Access-Control-Allow-Origin:
- https://www.consumerfinance.gov
Vary:
- Accept-Encoding
Date:
- - Wed, 31 Oct 2018 01:09:10 GMT
+ - Fri, 16 Nov 2018 22:42:40 GMT
Content-Length:
- '293'
Connection:
@@ -3532,5 +3532,5 @@ http_interactions:
sitemap: https://www.consumerfinance.gov/sitemap.xml
http_version:
- recorded_at: Wed, 31 Oct 2018 01:09:10 GMT
+ recorded_at: Fri, 16 Nov 2018 22:42:40 GMT
recorded_with: VCR 4.0.0
diff --git a/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml b/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml
index d884b1735d..a74f0eb406 100644
--- a/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml
+++ b/spec/vcr_cassettes/search/gov/tasks_searchgov_promote.yml
@@ -4995,14 +4995,14 @@ http_interactions:
Location:
- https://www.consumerfinance.gov/robots.txt
Date:
- - Wed, 31 Oct 2018 01:09:12 GMT
+ - Fri, 16 Nov 2018 22:42:40 GMT
Connection:
- keep-alive
body:
encoding: UTF-8
string: ''
http_version:
- recorded_at: Wed, 31 Oct 2018 01:09:11 GMT
+ recorded_at: Fri, 16 Nov 2018 22:42:40 GMT
- request:
method: get
uri: https://www.consumerfinance.gov/robots.txt
@@ -5024,7 +5024,7 @@ http_interactions:
Content-Type:
- text/plain; charset=UTF-8
Etag:
- - '"125-5795e2c925641"'
+ - '"125-57abc510cd176"'
Server:
- Apache
X-Frame-Options:
@@ -5034,13 +5034,13 @@ http_interactions:
X-Content-Type-Options:
- nosniff
Last-Modified:
- - Mon, 29 Oct 2018 13:37:45 GMT
+ - Thu, 15 Nov 2018 23:21:55 GMT
Access-Control-Allow-Origin:
- https://www.consumerfinance.gov
Vary:
- Accept-Encoding
Date:
- - Wed, 31 Oct 2018 01:09:12 GMT
+ - Fri, 16 Nov 2018 22:42:40 GMT
Content-Length:
- '293'
Connection:
@@ -5061,5 +5061,5 @@ http_interactions:
sitemap: https://www.consumerfinance.gov/sitemap.xml
http_version:
- recorded_at: Wed, 31 Oct 2018 01:09:12 GMT
+ recorded_at: Fri, 16 Nov 2018 22:42:40 GMT
recorded_with: VCR 4.0.0