diff --git a/app/concerns/fetchable.rb b/app/concerns/fetchable.rb index e8daa10955..e78d6436a8 100644 --- a/app/concerns/fetchable.rb +++ b/app/concerns/fetchable.rb @@ -30,7 +30,6 @@ module Fetchable swf tar tgz - txt wav wmv wsdl diff --git a/app/models/searchgov_url.rb b/app/models/searchgov_url.rb index b65b6449e5..c68692d126 100644 --- a/app/models/searchgov_url.rb +++ b/app/models/searchgov_url.rb @@ -4,14 +4,15 @@ class SearchgovUrl < ActiveRecord::Base include ActionView::Helpers::NumberHelper MAX_DOC_SIZE = 15.megabytes - SUPPORTED_CONTENT_TYPES = %w( + SUPPORTED_CONTENT_TYPES = %w[ text/html + text/plain application/msword application/pdf application/vnd.ms-excel application/vnd.openxmlformats-officedocument.wordprocessingml.document application/vnd.openxmlformats-officedocument.spreadsheetml.sheet - ) + ] attr_accessible :last_crawl_status, :last_crawled_at, :url, :lastmod attr_reader :response, :document, :tempfile @@ -167,7 +168,7 @@ def url_without_protocol def parse_document Rails.logger.info "[SearchgovUrl] Parsing document for #{url}" - if /^application/ === response.content_type.mime_type + if /^application|text\/plain/ === response.content_type.mime_type ApplicationDocument.new(document: download.open, url: url) else HtmlDocument.new(document: response.to_s, url: url) diff --git a/spec/models/searchgov_url_spec.rb b/spec/models/searchgov_url_spec.rb index 0ec76c2f66..3dbe2c6216 100644 --- a/spec/models/searchgov_url_spec.rb +++ b/spec/models/searchgov_url_spec.rb @@ -97,6 +97,7 @@ before do allow(searchgov_url).to receive(:searchgov_domain).and_return(searchgov_domain) + allow(I14yDocument).to receive(:create) end context 'when the fetch is successful' do @@ -404,6 +405,30 @@ end end + context 'when the url points to a TXT doc (.txt)' do + let(:url) { 'https://www.irs.gov/test.txt' } + + before do + stub_request(:get, url). + to_return(status: 200, + body: 'This is my text content.', + headers: { content_type: 'text/plain' }) + end + + it 'fetches and indexes the document' do + expect(I14yDocument).to receive(:create). + with(hash_including( + handle: 'searchgov', + path: 'https://www.irs.gov/test.txt', + title: 'test.txt', + description: nil, + content: 'This is my text content.', + language: 'en' + )) + fetch + end + end + context 'when the request fails' do before do stub_request(:get, url).to_raise(StandardError.new('faaaaail')) diff --git a/spec/vcr_cassettes/searchgov_url_fetch/when_the_url_points_to_a_txt_doc_/txt_.yml b/spec/vcr_cassettes/searchgov_url_fetch/when_the_url_points_to_a_txt_doc_/txt_.yml new file mode 100644 index 0000000000..73e83623ef --- /dev/null +++ b/spec/vcr_cassettes/searchgov_url_fetch/when_the_url_points_to_a_txt_doc_/txt_.yml @@ -0,0 +1,41 @@ +--- +http_interactions: +- request: + method: post + uri: http://localhost:9998/rmeta/form/text + body: + encoding: UTF-8 + string: "-----------------------4c3920e9b4798e45290007b7ee45b0c93db035b4ed\r\nContent-Disposition: + form-data; name=\"upload\"; filename=\"SearchgovUrl:154275270720181120-34326-1v398b6\"\r\nContent-Type: + application/octet-stream\r\n\r\nThis is my text content.\n\r\n-----------------------4c3920e9b4798e45290007b7ee45b0c93db035b4ed--" + headers: + Connection: + - Keep-Alive + Content-Type: + - multipart/form-data; boundary=---------------------4c3920e9b4798e45290007b7ee45b0c93db035b4ed + Content-Length: + - '308' + Host: + - localhost:9998 + User-Agent: + - http.rb/1.0.4 + response: + status: + code: 200 + message: OK + headers: + Date: + - Tue, 20 Nov 2018 22:25:07 GMT + Content-Type: + - application/json + Transfer-Encoding: + - chunked + Server: + - Jetty(9.4.z-SNAPSHOT) + body: + encoding: UTF-8 + string: '[{"Content-Encoding":"ISO-8859-1","Content-Type":"text/plain; charset\u003dISO-8859-1","X-Parsed-By":["org.apache.tika.parser.DefaultParser","org.apache.tika.parser.txt.TXTParser"],"X-TIKA:content":"\n\n\n\n\n\n\n\nThis + is my text content.\n\n","X-TIKA:parse_time_millis":"29"}]' + http_version: + recorded_at: Tue, 20 Nov 2018 22:25:07 GMT +recorded_with: VCR 4.0.0