Skip to content
This repository has been archived by the owner on Jul 14, 2024. It is now read-only.

Commit

Permalink
Merge pull request #5 from boost/rm/lenz-ingest-perf
Browse files Browse the repository at this point in the history
S3 performance
  • Loading branch information
richardmatthewsdev authored Apr 15, 2024
2 parents 0b87afc + f8b8adf commit 540f89c
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions lib/tasks/extract_from_s3.rake
Original file line number Diff line number Diff line change
Expand Up @@ -71,26 +71,32 @@ class S3ExtractionExecution
end

# rubocop:disable Metrics/MethodLength
# rubocop:disable Metrics/AbcSize
# Converts the XML files found under "#{@directory_path}/#{@job_id}" into
# extraction-job documents, 100 source files per document.
#
# For every batch, each file is read and parsed with Nokogiri::HTML, the
# nodes matching '/html/body/metadata' are selected, and their XML is
# concatenated inside a single <root> element. The result is saved as one
# Extraction::Document in @extraction_job.extraction_folder, named
# "<parameterized definition name>__-__<page>.json", where <page> starts at 1
# and is zero-padded to nine digits. Once every batch has been written the
# extraction job is marked with `completed!`.
#
# NOTE(review): the files are *.xml but are parsed with Nokogiri::HTML, and
# the XPath relies on an html/body wrapper being present in the parsed
# document - confirm this is intentional before switching parsers.
def convert_files_to_extraction_job
  p 'Converting files from S3 into extraction job format...'

  # The file-name prefix is the same for every page, so build it once.
  name_str = @extraction_definition.name.parameterize(separator: '_')

  Dir["#{@directory_path}/#{@job_id}/**/*.xml"].each_slice(100).with_index(1) do |batch, page|
    # Keep only the last nine characters of the zero-padded page number.
    page_str = format('%09d', page)[-9..]

    # Read and parse one file at a time so the raw contents of the whole
    # batch do not need to be held alongside the parsed nodes.
    metadata = batch.map { |file| Nokogiri::HTML(File.read(file)).xpath('/html/body/metadata') }
    body = "<?xml version=\"1.0\"?><root>#{metadata.map(&:to_xml).join}</root>"

    Extraction::Document.new(
      url: 's3', method: 's3 cli',
      params: '', request_headers: [],
      status: '', response_headers: [],
      body:
    ).save("#{@extraction_job.extraction_folder}/#{name_str}__-__#{page_str}.json")
  end

  @extraction_job.completed!
end
# rubocop:enable Metrics/AbcSize
# rubocop:enable Metrics/MethodLength
end

0 comments on commit 540f89c

Please sign in to comment.