Skip to content

Commit

Permalink
Feature/bulk reindex urls (#175)
Browse files Browse the repository at this point in the history
* SRCH-202 - scrub job keywords from query

* update jobs.rb to have options in the jobs module

* change job_scrub to scrub_keyword, squish instead of strip, delete instead of gsub

* add a comment to explain why we keeping the keyword

* fix bracket and update scrub_keyword spec

* update jobs.rb to scrub jobs if employment, or jobs or posting or opening

* spec/vcr_cassettes/govbox_set/new_query_/affiliate_geoip_info_when_the_affiliate_has_the_jobs_govbox_enabled.yml

* update vcr cassettes

* SRCH-183, SRCH - 181 - bulk reindex urls

* test to check urls can be enqueued for reindexing

* wrap line for attr_accessible
remove parenthesis in rspec test

* Feature/bool db column urls (#172)

* SRCH-179 - Add a boolean column to the searchgov_urls

* update structure.sql

* migration file for enqueued for reindex to searachgov_url
  • Loading branch information
peggles2 authored Dec 12, 2018
1 parent 50329a8 commit 30fca50
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 64 deletions.
14 changes: 11 additions & 3 deletions app/models/searchgov_url.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,11 @@ class SearchgovUrl < ActiveRecord::Base
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
]

attr_accessible :last_crawl_status, :last_crawled_at, :url, :lastmod
attr_accessible :last_crawl_status,
:last_crawled_at,
:url,
:lastmod,
:enqueued_for_reindex
attr_reader :response, :document, :tempfile
attr_readonly :url

Expand All @@ -35,14 +39,18 @@ class SearchgovUrl < ActiveRecord::Base
column_name: proc {|url| !url.fetched? ? 'unfetched_urls_count' : nil },
column_names: { ['searchgov_urls.last_crawled_at IS NULL'] => 'unfetched_urls_count' }

scope :fetch_required, -> { where('last_crawled_at IS NULL OR lastmod > last_crawled_at') }
scope :fetch_required, -> do
where('last_crawled_at IS NULL
OR lastmod > last_crawled_at
OR enqueued_for_reindex')
end

class SearchgovUrlError < StandardError; end
class DomainError < StandardError; end

def fetch
raise DomainError.new("#{searchgov_domain.domain}: #{searchgov_domain.status}") if !searchgov_domain.available?
self.update_attributes(last_crawled_at: Time.now)
update(last_crawled_at: Time.now, enqueued_for_reindex: false)
self.load_time = Benchmark.realtime do
DocumentFetchLogger.new(url, 'searchgov_url').log
begin
Expand Down
18 changes: 18 additions & 0 deletions spec/fixtures/searchgov_urls.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
new:
url: http://www.agency.gov/new

outdated:
url: http://www.agency.gov/outdated
last_crawled_at: <%= 1.week.ago.to_s(:db) %>
lastmod: <%= 1.day.ago.to_s(:db) %>

current:
url: http://www.agency.gov/current
last_crawled_at: <%= 1.day.ago.to_s(:db) %>
lastmod: <%= 1.week.ago.to_s(:db) %>

enqueued:
url: http://www.agency.gov/enqueued
last_crawled_at: <%= 1.day.ago.to_s(:db) %>
lastmod: <%= 1.week.ago.to_s(:db) %>
enqueued_for_reindex: true
2 changes: 1 addition & 1 deletion spec/jobs/searchgov_domain_destroyer_job_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
let!(:searchgov_url2) { SearchgovUrl.create!(url: url2) }

it 'destroys the searchgov_urls' do
expect{ perform }.to change{ SearchgovUrl.count }.from(2).to(0)
expect { perform }.to change{ SearchgovUrl.count }.by(-2)
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion spec/lib/tasks/searchgov_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
it 'indexes new urls' do
allow(I14yDocument).to receive(:promote).
with(handle: 'searchgov', document_id: doc_id, bool: 'true').at_least(:once)
expect{ promote_urls }.to change{ SearchgovUrl.count }.from(0).to(1)
expect { promote_urls }.to change{ SearchgovUrl.count }.by(1)
end

it 'creates new urls' do
Expand Down
39 changes: 27 additions & 12 deletions spec/models/searchgov_url_spec.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
require 'spec_helper'

describe SearchgovUrl do
fixtures :searchgov_urls

let(:url) { 'http://www.agency.gov/boring.html' }
let(:html) { read_fixture_file("/html/page_with_og_metadata.html") }
let(:valid_attributes) { { url: url } }
Expand All @@ -22,19 +24,20 @@

describe 'scopes' do
describe '.fetch_required' do
before do
SearchgovUrl.create!(url: 'http://www.agency.gov/new')
SearchgovUrl.create!(
url: 'http://www.agency.gov/outdated', last_crawled_at: 1.week.ago, lastmod: 1.day.ago
)
SearchgovUrl.create!(
url: 'http://www.agency.gov/current', last_crawled_at: 1.day.ago, lastmod: 1.week.ago
)
end

it 'includes urls that have never been crawled and outdated urls' do
expect(SearchgovUrl.fetch_required.pluck(:url)).
to eq %w[http://www.agency.gov/new http://www.agency.gov/outdated]
to include('http://www.agency.gov/new', 'http://www.agency.gov/outdated')
end

it 'does not include current, crawled and not enqueued urls' do
expect(SearchgovUrl.fetch_required.pluck(:url)).
not_to include('http://www.agency.gov/current')
end

it 'includes urls that have been enqueued for reindexing' do
expect(SearchgovUrl.fetch_required.pluck(:url)).
to include 'http://www.agency.gov/enqueued'
end
end
end
Expand All @@ -43,7 +46,8 @@
it 'requires a valid domain' do
searchgov_url = SearchgovUrl.new(url: 'https://foo/bar')
expect(searchgov_url).not_to be_valid
expect(searchgov_url.errors.messages[:searchgov_domain]).to include 'is invalid'
expect(searchgov_url.errors.messages[:searchgov_domain]).
to include 'is invalid'
end

describe 'validating url uniqueness' do
Expand Down Expand Up @@ -75,7 +79,7 @@
end

it 'deletes the Searchgov Url' do
expect{ searchgov_url.destroy }.to change{ SearchgovUrl.count }.from(1).to(0)
expect { searchgov_url.destroy }.to change{ SearchgovUrl.count }.by(-1)
end
end
end
Expand Down Expand Up @@ -128,6 +132,17 @@
fetch
end

context 'when the record is enqueued for reindex' do
let(:searchgov_url) do
SearchgovUrl.create!(valid_attributes.merge(enqueued_for_reindex: true))
end

it 'sets enqueued_for_reindex to false' do
expect { fetch }.to change{ searchgov_url.enqueued_for_reindex }.
from(true).to(false)
end
end

context 'when the record includes a lastmod value' do
let(:valid_attributes) { { url: url, lastmod: '2018-01-01' } }

Expand Down
6 changes: 3 additions & 3 deletions spec/models/sitemap_indexer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
subject(:index) { indexer.index }

it 'creates searchgov urls' do
expect{ index }.to change{SearchgovUrl.count}.from(0).to(1)
expect { index }.to change{ SearchgovUrl.count }.by(1)
end

it 'updates the counter cache columns' do
Expand Down Expand Up @@ -115,7 +115,7 @@
let(:sitemap_entries) { "<url>\n <loc>\n http://agency.gov/doc1 \n </loc>\n </url>" }

it 'creates a searchgov_url record' do
expect{index}.to change{SearchgovUrl.count}.from(0).to(1)
expect { index }.to change{ SearchgovUrl.count }.by(1)
end
end

Expand Down Expand Up @@ -143,7 +143,7 @@

it 'ignores them' do
index
expect(SearchgovUrl.pluck(:url)).to eq ['http://agency.gov/doc1']
expect(SearchgovUrl.pluck(:url)).not_to include 'http://other.gov/doc1'
end
end

Expand Down
44 changes: 33 additions & 11 deletions spec/support/fetchable_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,34 +30,52 @@
describe 'scopes' do
context 'by last_crawl_status or last_crawled_at' do
before do
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok', last_crawl_status: 'OK', last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed', last_crawl_status: 'failed', last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched', last_crawl_status: nil, last_crawled_at: nil))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/ok',
last_crawl_status: 'OK',
last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/failed',
last_crawl_status: 'failed',
last_crawled_at: 1.day.ago))
described_class.create!(valid_attributes.merge(url: 'http://agency.gov/unfetched',
last_crawl_status: nil,
last_crawled_at: nil))
end

describe '.fetched' do
it 'includes successfully and unsuccessfully fetched records' do
expect(described_class.fetched.pluck(:url)).
to match_array %w[http://agency.gov/ok http://agency.gov/failed]
to include('http://agency.gov/ok', 'http://agency.gov/failed')
end

it 'does not include unfetched records' do
expect(described_class.fetched.pluck(:url)).
not_to include 'http://agency.gov/unfetched'
end
end

describe '.unfetched' do
it 'includes unfetched records' do
expect(described_class.unfetched.pluck(:url)).to eq ['http://agency.gov/unfetched']
expect(described_class.unfetched.pluck(:url)).
to include 'http://agency.gov/unfetched'
end

it 'does not include fetched records' do
expect(described_class.unfetched.pluck(:url)).
not_to include 'http://agency.gov/ok'
end
end

describe '.ok' do
it 'includes successfully fetched records' do
expect(described_class.ok.pluck(:url)).to match_array ['http://agency.gov/ok']
expect(described_class.ok.pluck(:url)).
to match_array ['http://agency.gov/ok']
end
end

describe '.not_ok' do
it 'includes failed or unfetched records' do
expect(described_class.not_ok.pluck(:url)).
to match_array %w[http://agency.gov/unfetched http://agency.gov/failed]
to include('http://agency.gov/unfetched', 'http://agency.gov/failed')
end
end
end
Expand All @@ -76,28 +94,32 @@
context "when an URL contains an anchor tag" do
let(:url) { "http://www.nps.gov/sdfsdf#anchorme" }
it "should remove it" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/sdfsdf")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/sdfsdf")
end
end

context "when URL is mixed case" do
let(:url) { "HTTP://Www.nps.GOV/UsaGovLovesToCapitalize" }
it "should downcase the scheme and host only" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/UsaGovLovesToCapitalize")
end
end

context "when URL is missing trailing slash for a scheme+host URL" do
let(:url) { "http://www.nps.gov" }
it "should append a /" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/")
end
end

context "when URL contains duplicate leading slashes in request" do
let(:url) { "http://www.nps.gov//hey/I/am/usagov/and/love/extra////slashes.shtml" }
it "should collapse the slashes" do
expect(described_class.create!(valid_attributes.merge(url: url)).url).to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
expect(described_class.create!(valid_attributes.merge(url: url)).url).
to eq("http://www.nps.gov/hey/I/am/usagov/and/love/extra/slashes.shtml")
end
end

Expand Down
30 changes: 15 additions & 15 deletions spec/vcr_cassettes/jobs/search_options_.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1222,9 +1222,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down Expand Up @@ -1877,9 +1877,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down Expand Up @@ -3061,7 +3061,7 @@ http_interactions:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down Expand Up @@ -3111,9 +3111,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down Expand Up @@ -3163,9 +3163,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down Expand Up @@ -4345,9 +4345,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down Expand Up @@ -4975,9 +4975,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down Expand Up @@ -5229,9 +5229,9 @@ http_interactions:
string: ''
headers:
Authorization-Key:
- Qbk5RB/WRc1ctYqwojqlSKeoLVrwokT8OnSLq+G1qu0=
- "<JOBS_AUTHORIZATION_KEY>"
User-Agent:
- [email protected]
- "<JOBS_USER_AGENT>"
Accept-Encoding:
- gzip;q=1.0,deflate;q=0.6,identity;q=0.3
Accept:
Expand Down
Loading

0 comments on commit 30fca50

Please sign in to comment.