From b369917529f91363bb7999dbfe6812a96da03528 Mon Sep 17 00:00:00 2001 From: jmax-fearless <66315527+jmax-fearless@users.noreply.github.com> Date: Fri, 6 Aug 2021 15:05:38 -0400 Subject: [PATCH] SRCH-2300 Handle non-ASCII characters in URLs (#752) SRCH-2300 Handle non-ASCII characters in URLs * Set encoding on tempfile. * Overhauled specs and features a little. --- app/services/bulk_url_uploader.rb | 2 +- spec/fixtures/searchgov_domain.yml | 11 -- spec/fixtures/searchgov_domains.yml | 13 ++ spec/fixtures/txt/non_ascii_urls.txt | 2 + spec/system/admin/bulk_url_upload_spec.rb | 161 ++++++++++------------ 5 files changed, 85 insertions(+), 104 deletions(-) delete mode 100644 spec/fixtures/searchgov_domain.yml create mode 100644 spec/fixtures/txt/non_ascii_urls.txt diff --git a/app/services/bulk_url_uploader.rb b/app/services/bulk_url_uploader.rb index f20f84540d..ce9d764b2f 100644 --- a/app/services/bulk_url_uploader.rb +++ b/app/services/bulk_url_uploader.rb @@ -93,7 +93,7 @@ def self.create_job(uploaded_file, user) SearchgovUrlBulkUploaderJob.perform_later( user, uploaded_file.original_filename, - uploaded_file.tempfile.readlines + uploaded_file.tempfile.set_encoding('UTF-8').readlines ) end diff --git a/spec/fixtures/searchgov_domain.yml b/spec/fixtures/searchgov_domain.yml deleted file mode 100644 index 1c9b191424..0000000000 --- a/spec/fixtures/searchgov_domain.yml +++ /dev/null @@ -1,11 +0,0 @@ -agency_gov: - domain: agency.gov - -www_agency_gov: - domain: www.agency.gov - -www_consumerfinance_gov: - domain: www.consumerfinance.gov - -www_medicare_gov: - domain: www.medicare.gov diff --git a/spec/fixtures/searchgov_domains.yml b/spec/fixtures/searchgov_domains.yml index 3dec018759..da0ba54d39 100644 --- a/spec/fixtures/searchgov_domains.yml +++ b/spec/fixtures/searchgov_domains.yml @@ -4,3 +4,16 @@ DEFAULTS: &DEFAULTS basic_domain: <<: *DEFAULTS domain: foo.gov + +agency_gov: + <<: *DEFAULTS + domain: agency.gov + +www_agency_gov: + domain: www.agency.gov + +www_consumerfinance_gov: + domain: www.consumerfinance.gov + +www_medicare_gov: + domain: www.medicare.gov diff --git a/spec/fixtures/txt/non_ascii_urls.txt b/spec/fixtures/txt/non_ascii_urls.txt new file mode 100644 index 0000000000..fb117aea02 --- /dev/null +++ b/spec/fixtures/txt/non_ascii_urls.txt @@ -0,0 +1,2 @@ +https://agency.gov/foo § 208.pdf?open +https://agency.gov/Asesor√≠a.pdf diff --git a/spec/system/admin/bulk_url_upload_spec.rb b/spec/system/admin/bulk_url_upload_spec.rb index 3a7a7d687d..c24340f91e 100644 --- a/spec/system/admin/bulk_url_upload_spec.rb +++ b/spec/system/admin/bulk_url_upload_spec.rb @@ -1,130 +1,107 @@ # frozen_string_literal: true +shared_examples 'a failed bulk upload with error' do |error_message| + it 'sends us back to the bulk upload page' do + do_bulk_upload + expect(page).to have_text('Bulk Search.gov URL Upload') + end + + it 'shows an error message' do + do_bulk_upload + expect(page).to have_text(error_message) + end +end + describe 'Bulk URL upload' do include ActiveJob::TestHelper - subject(:bulk_upload) do + subject(:do_bulk_upload) do perform_enqueued_jobs do visit url - attach_file('bulk_upload_urls', url_file) + attach_file('bulk_upload_urls', upload_file) click_button('Upload') end end let(:url) { '/admin/bulk_url_upload' } - let(:url_filedir) { 'txt' } - let(:url_filename) { 'good_url_file.txt' } - let(:url_file) { file_fixture("#{url_filedir}/#{url_filename}") } - let(:urls) { File.open(url_file).readlines.map(&:strip) } - let(:searchgov_domains) do - urls.reduce(Set.new) do |searchgov_domains, raw_url| - parsed_url = URI(raw_url) - domain = parsed_url.host - searchgov_domain = SearchgovDomain.find_by(domain: domain) - searchgov_domains << searchgov_domain if searchgov_domain - end - end + let(:upload_file) { file_fixture("txt/#{upload_filename}") } + let(:urls) { File.open(upload_file).readlines.map(&:strip) } + let(:searchgov_domain) { searchgov_domains(:agency_gov) } - before do - @reindexed_domains = Set.new - allow_any_instance_of(SearchgovDomain).to receive(:index_urls) do |searchgov_domain| - @reindexed_domains << searchgov_domain - end - end + before { allow(searchgov_domain).to receive(:index_urls) } it_behaves_like 'a page restricted to super admins' - describe 'bulk uploading a file of URLs' do + context 'when a super admin is logged in' do include_context 'log in super admin' - it 'sends us back to the bulk upload page' do - bulk_upload - expect(page).to have_text('Bulk Search.gov URL Upload') - end - - it 'shows a confirmation message' do - bulk_upload - expect(page).to have_text( - <<~CONFIRMATION_MESSAGE - Successfully uploaded #{url_filename} for processing. - The results will be emailed to you. - CONFIRMATION_MESSAGE - ) - end + describe 'bulk uploading a file of URLs' do + let(:upload_filename) { 'good_url_file.txt' } - it 'creates the URLs' do - bulk_upload - urls.each do |url| - expect(SearchgovUrl.find_by(url: url)).not_to be_blank + it 'sends us back to the bulk upload page' do + do_bulk_upload + expect(page).to have_text('Bulk Search.gov URL Upload') end - end - - it 're-indexes the domains for the URLs' do - bulk_upload - expect(@reindexed_domains).to eq(searchgov_domains) - end - end - describe 'trying to bulk upload a file of URLs when there is no file attached' do - include_context 'log in super admin' + it 'shows a confirmation message' do + do_bulk_upload + expect(page).to have_text( + <<~CONFIRMATION_MESSAGE + Successfully uploaded #{upload_filename} for processing. + The results will be emailed to you. + CONFIRMATION_MESSAGE + ) + end - subject(:bulk_upload) do - visit url - click_button('Upload') - end + it 'creates the URLs' do + do_bulk_upload - it 'sends us back to the bulk upload page' do - bulk_upload - expect(page).to have_text('Bulk Search.gov URL Upload') - end + urls.each do |url| + expect(SearchgovUrl.find_by(url: url)).not_to be_blank + end + end - it 'shows an error message' do - bulk_upload - expect(page).to have_text( - <<~ERROR_MESSAGE - Please choose a file to upload. - ERROR_MESSAGE - ) + it 're-indexes the domains for the URLs' do + reindexed_domains = Set.new + allow_any_instance_of(SearchgovDomain).to receive(:index_urls) do |searchgov_domain| + reindexed_domains << searchgov_domain + end + do_bulk_upload + expect(reindexed_domains).to eq(Set[searchgov_domain]) + end end - end - describe 'trying to bulk upload a file of URLs that is not a text file' do - include_context 'log in super admin' + context 'when the URLs contain non-ASCII characters' do + let(:upload_filename) { 'non_ascii_urls.txt' } - let(:url_file) { file_fixture('word/bogus_url_file.docx') } + it 'saves the encoded URLs' do + do_bulk_upload - it 'sends us back to the bulk upload page' do - bulk_upload - expect(page).to have_text('Bulk Search.gov URL Upload') + expect(SearchgovUrl.pluck(:url)).to include( + 'https://agency.gov/foo%20%C2%A7%20208.pdf?open', + 'https://agency.gov/Asesor%E2%88%9A%E2%89%A0a.pdf' + ) + end end - it 'shows an error message' do - bulk_upload - expect(page).to have_text( - <<~ERROR_MESSAGE - Files of type application/vnd.openxmlformats-officedocument.wordprocessingml.document are not supported - ERROR_MESSAGE - ) - end - end + describe 'trying to bulk upload a file of URLs when there is no file attached' do + let(:upload_file) { nil } - describe 'trying to bulk upload a file of URLs that is too big' do - include_context 'log in super admin' + it_behaves_like 'a failed bulk upload with error', 'Please choose a file to upload' + end - let(:url_filename) { 'too_big_url_file.txt' } + describe 'trying to bulk upload a file of URLs that is not a text file' do + let(:upload_filename) { 'bogus_url_file.docx' } + let(:upload_file) { file_fixture('word/bogus_url_file.docx') } - it 'sends us back to the bulk upload page' do - bulk_upload - expect(page).to have_text('Bulk Search.gov URL Upload') + it_behaves_like 'a failed bulk upload with error', + 'Files of type application/vnd.openxmlformats-officedocument.wordprocessingml.document are not supported' end - it 'shows an error message' do - bulk_upload - expect(page).to have_text( - <<~ERROR_MESSAGE - #{url_filename} is too big; please split it. - ERROR_MESSAGE - ) + describe 'trying to bulk upload a file of URLs that is too big' do + let(:upload_filename) { 'too_big_url_file.txt' } + + it_behaves_like 'a failed bulk upload with error', 'too_big_url_file.txt is too big; please split it' end end end