From bff49ef907f23985f24c89edff14d6a51745f358 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Thu, 18 Mar 2021 11:03:03 -0500 Subject: [PATCH 01/20] Updated S3 path for json response --- lib/extractor.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/extractor.rb b/lib/extractor.rb index 51bd1a7..f767c32 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -10,7 +10,7 @@ class Extractor def self.extract(bucket_name, object_key, binary_name, web_id) begin local_path = "./#{binary_name}" - s3_path = "#{web_id}_#{binary_name}" + s3_path = "messages/#{web_id}.json" region = 'us-east-2' s3_client = Aws::S3::Client.new(region: region) @@ -52,7 +52,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id) puts "Error putting json response for object #{object_key} in S3 bucket #{bucket_name}: #{e.message}" end - retVal = {"bucket name" => bucket_name, "object key" => s3_path} + retVal = {"bucketName" => bucket_name, "objectKey" => s3_path} sqs = Aws::SQS::Client.new(region: region) From 6077abaefd4671d7acae3ee2fd6ca15a17990861 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Thu, 25 Mar 2021 15:39:29 -0500 Subject: [PATCH 02/20] Removed unused code --- lib/extractor.rb | 5 ++--- lib/extractor/extraction.rb | 37 ++++-------------------------------- lib/extractor/nested_item.rb | 3 --- lib/extractor/peek_type.rb | 5 ----- 4 files changed, 6 insertions(+), 44 deletions(-) delete mode 100644 lib/extractor/nested_item.rb diff --git a/lib/extractor.rb b/lib/extractor.rb index f767c32..429dd3a 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -9,8 +9,6 @@ class Extractor def self.extract(bucket_name, object_key, binary_name, web_id) begin - local_path = "./#{binary_name}" - s3_path = "messages/#{web_id}.json" region = 'us-east-2' s3_client = Aws::S3::Client.new(region: region) @@ -41,6 +39,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id) retVal = {"web_id" => web_id, "status" => extraction.status, "error" => 
extraction.error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} + s3_path = "messages/#{web_id}.json" begin s3_client.put_object({ body: retVal.to_json, @@ -52,7 +51,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id) puts "Error putting json response for object #{object_key} in S3 bucket #{bucket_name}: #{e.message}" end - retVal = {"bucketName" => bucket_name, "objectKey" => s3_path} + retVal = {"bucket_name" => bucket_name, "object_key" => s3_path} sqs = Aws::SQS::Client.new(region: region) diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index 75d762d..a6fc436 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -8,7 +8,6 @@ require 'libarchive' require 'rubygems/package' -require_relative 'nested_item.rb' require_relative 'extraction_status.rb' require_relative 'peek_type.rb' @@ -38,7 +37,6 @@ def process self.status = ExtractionStatus::ERROR self.peek_type = PeekType::NONE report_problem(error.message) - #raise error ensure if self.peek_text && self.peek_text.encoding.name != 'UTF-8' begin @@ -55,15 +53,12 @@ def process end def report_problem(report) - #Problem.create(task_id: self.id, report: report) self.error = {"task_id" => self.id, "report" => report} end def extract_features mime_guess = top_level_mime || Extraction.mime_from_filename(self.binary_name) || 'application/octet-stream' - #Rails.logger.warn("#{self.binary_name} - #{mime_guess}") - mime_parts = mime_guess.split("/") nonzip_archive_subtypes = ['x-7z-compressed', 'x-tar'] @@ -87,9 +82,8 @@ def top_level_mime end def self.mime_from_path(path) -# puts "path provided #{path}" file_mime_response = MimeMagic.by_path(File.open("#{path}")).to_s -# puts "file mime response #{file_mime_response}" + if file_mime_response.length > 0 file_mime_response else @@ -167,10 +161,6 @@ def extract_zip File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) end - else - 
#Rails.logger.warn("skipped entry is ds_store: #{is_ds_store(entry_path)}") - #Rails.logger.warn("skipped entry is mac thing: #{is_mac_thing(entry_path)}") - end end end @@ -204,7 +194,7 @@ def extract_archive while entry = ar.next_header entry_path = valid_entry_path(entry.pathname) -# puts "archive name #{entry.pathname}" + if entry_path if !is_ds_store(entry_path) && !is_mac_thing(entry_path) @@ -287,7 +277,7 @@ def extract_gzip if entry_path if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) -# puts entry.full_name + entry_paths << entry_path @@ -371,7 +361,7 @@ def extract_default end def valid_entry_path(entry_path) - if entry_path[-1] == '/' + if ends_in_slash(entry_path) return entry_path[0...-1] elsif entry_path.length > 0 return entry_path @@ -412,25 +402,6 @@ def name_part(path) end end - def self.charset_from_path(path) - - file_info = "" - - if OS.mac? - file_info = `file -I #{path}` - elsif OS.linux? - file_info = `file -i #{path}` - else - return nil - end - - if file_info.length > 0 - file_info.strip.split('charset=').last - else - nil - end - end - def entry_paths_arr_to_html(entry_paths) return_string = ' ' diff --git a/lib/extractor/nested_item.rb b/lib/extractor/nested_item.rb deleted file mode 100644 index d235fc9..0000000 --- a/lib/extractor/nested_item.rb +++ /dev/null @@ -1,3 +0,0 @@ -class NestedItem - attr_accessor :item_name, :item_path, :item_size, :media_type, :is_directory -end \ No newline at end of file diff --git a/lib/extractor/peek_type.rb b/lib/extractor/peek_type.rb index 0ac1db3..af99a01 100644 --- a/lib/extractor/peek_type.rb +++ b/lib/extractor/peek_type.rb @@ -1,9 +1,4 @@ class PeekType - ALL_TEXT = 'all_text' - PART_TEXT = 'part_text' - IMAGE = 'image' - MICROSOFT = 'microsoft' - PDF = 'pdf' LISTING = 'listing' NONE = 'none' end \ No newline at end of file From 676c96454553651e1db58d5709bafba45565f4d2 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Mon, 29 Mar 2021 13:36:38 -0500 
Subject: [PATCH 03/20] Updated mimemagic gem and peek text format --- Gemfile | 3 +-- Gemfile.lock | 31 ++++++++++++++++++------------- lib/extractor/extraction.rb | 4 ++-- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/Gemfile b/Gemfile index 8af86cb..73c0c87 100644 --- a/Gemfile +++ b/Gemfile @@ -10,14 +10,13 @@ gem 'mime-types', require: 'mime/types/full' gem 'rubyzip' # Use archive for non-zip archive files -#gem 'archive' gem 'libarchive' # Use os to interact with operating system gem 'os' # Use mimemagic to find the mime type of a file from the extension or content -gem 'mimemagic' +gem "mimemagic", "~> 0.3.6" gem "rake", "~> 13.0" diff --git a/Gemfile.lock b/Gemfile.lock index b9f00c6..e49612c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,31 +1,36 @@ GEM remote: https://rubygems.org/ specs: - aws-eventstream (1.1.0) - aws-partitions (1.416.0) - aws-sdk-core (3.111.0) + aws-eventstream (1.1.1) + aws-partitions (1.436.0) + aws-sdk-core (3.113.0) aws-eventstream (~> 1, >= 1.0.2) aws-partitions (~> 1, >= 1.239.0) aws-sigv4 (~> 1.1) jmespath (~> 1.0) - aws-sdk-kms (1.41.0) - aws-sdk-core (~> 3, >= 3.109.0) + aws-sdk-kms (1.43.0) + aws-sdk-core (~> 3, >= 3.112.0) aws-sigv4 (~> 1.1) - aws-sdk-s3 (1.87.0) - aws-sdk-core (~> 3, >= 3.109.0) + aws-sdk-s3 (1.93.0) + aws-sdk-core (~> 3, >= 3.112.0) aws-sdk-kms (~> 1) aws-sigv4 (~> 1.1) - aws-sdk-sqs (1.35.0) - aws-sdk-core (~> 3, >= 3.109.0) + aws-sdk-sqs (1.38.0) + aws-sdk-core (~> 3, >= 3.112.0) aws-sigv4 (~> 1.1) - aws-sigv4 (1.2.2) + aws-sigv4 (1.2.3) aws-eventstream (~> 1, >= 1.0.2) jmespath (1.4.0) mime-types (3.3.1) mime-types-data (~> 3.2015) - mime-types-data (3.2020.1104) - mimemagic (0.3.5) + mime-types-data (3.2021.0225) + mimemagic (0.3.10) + nokogiri (~> 1) + rake + nokogiri (1.11.2-x86_64-darwin) + racc (~> 1.4) os (1.1.1) + racc (1.5.2) rake (13.0.3) rubyzip (2.3.0) @@ -36,7 +41,7 @@ DEPENDENCIES aws-sdk-s3 aws-sdk-sqs mime-types - mimemagic + mimemagic (~> 0.3.6) os rake (~> 13.0) 
rubyzip diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index a6fc436..ee10327 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -414,7 +414,7 @@ def entry_paths_arr_to_html(entry_paths) name_arr = entry_path.split("/") name_arr.length.times do - return_string << '
' + return_string << '
' end if entry_path[-1] == "/" # means directory @@ -432,7 +432,7 @@ def entry_paths_arr_to_html(entry_paths) end - return return_string + return return_string.gsub("\"", "'") end From 006432d2ff914d03fb306900e2606842c313485e Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Tue, 20 Apr 2021 15:47:34 -0500 Subject: [PATCH 04/20] updated error handling, and added script to simplify updating ECR image --- ecr-push.sh | 9 +++++++++ lib/extractor.rb | 35 +++++++++++++++++++++++------------ lib/extractor/extraction.rb | 5 +++-- 3 files changed, 35 insertions(+), 14 deletions(-) create mode 100755 ecr-push.sh diff --git a/ecr-push.sh b/ecr-push.sh new file mode 100755 index 0000000..752efc3 --- /dev/null +++ b/ecr-push.sh @@ -0,0 +1,9 @@ +#!/bin/sh +# +# Builds a Docker image and pushes it to AWS ECR. +# + +aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 721945215539.dkr.ecr.us-east-2.amazonaws.com +docker build -t databank-archive-extractor-demo . +docker tag databank-archive-extractor-demo:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest +docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest \ No newline at end of file diff --git a/lib/extractor.rb b/lib/extractor.rb index 429dd3a..427ebc2 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -9,7 +9,10 @@ class Extractor def self.extract(bucket_name, object_key, binary_name, web_id) begin - + status = ExtractionStatus::ERROR + error = Hash.new + s3_put_status = ExtractionStatus::SUCCESS + s3_put_error = "" region = 'us-east-2' s3_client = Aws::S3::Client.new(region: region) del_path = "./mnt/efs/#{bucket_name}_#{web_id}" @@ -26,18 +29,20 @@ def self.extract(bucket_name, object_key, binary_name, web_id) bucket: bucket_name, key: object_key, ) - puts "Getting object #{object_key} from #{bucket_name}" + puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}" rescue 
StandardError => e - puts "Error getting object #{object_key} from S3 bucket #{bucket_name}: #{e.message}" + error = {"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"} + puts error end extraction = Extraction.new(binary_name, local_path, web_id) extraction.process - puts "status: #{extraction.status}" - puts "error: #{extraction.error}" if extraction.error == ExtractionStatus::ERROR + status = extraction.status + puts "status: #{status}" + puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR + error = error.merge(extraction.error) items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } - retVal = {"web_id" => web_id, "status" => extraction.status, "error" => extraction.error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} - + retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} s3_path = "messages/#{web_id}.json" begin @@ -46,12 +51,18 @@ def self.extract(bucket_name, object_key, binary_name, web_id) bucket: "databank-demo-main", key: s3_path, }) - puts "Putting json response for object #{object_key} in S3 bucket #{bucket_name} with key #{s3_path}" + puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key #{s3_path}" rescue StandardError => e - puts "Error putting json response for object #{object_key} in S3 bucket #{bucket_name}: #{e.message}" + s3_put_status = ExtractionStatus::ERROR + s3_put_error = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}" + puts s3_put_error end - retVal = {"bucket_name" => bucket_name, "object_key" => s3_path} + if s3_put_status == ExtractionStatus::SUCCESS + retVal = {"bucket_name" => bucket_name, "object_key" => s3_path} + else + retVal = 
{"s3_status" => s3_put_status, "s3_put_report" =>s3_put_error} + end sqs = Aws::SQS::Client.new(region: region) @@ -66,9 +77,9 @@ def self.extract(bucket_name, object_key, binary_name, web_id) message_body: retVal.to_json, message_attributes: {} }) - puts "Sending message in queue #{queue_name} for object #{object_key}" + puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}" rescue StandardError => e - puts "Error sending message in queue #{queue_name} for object #{object_key}: #{e.message}" + puts "Error sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}: #{e.message}" end diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index ee10327..aeed1b2 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -20,6 +20,7 @@ def initialize(binary_name, storage_path, id) @binary_name = binary_name @storage_path = storage_path @id = id + @error = Hash.new end ALLOWED_CHAR_NUM = 1024 * 8 @@ -46,14 +47,14 @@ def process self.peek_type = PeekType::NONE report_problem('invalid encoding for peek text') rescue Exception => ex - report_problem("invalid encoding and problem characer: #{ex.class}, #{ex.message}") + report_problem("invalid encoding and problem character: #{ex.class}, #{ex.message}") end end end end def report_problem(report) - self.error = {"task_id" => self.id, "report" => report} + self.error = {"task_id" => self.id, "extraction_report" => report} end def extract_features From 021a672e71307b471f9dd2a26bcec0715b0dab0c Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Wed, 21 Apr 2021 10:45:17 -0500 Subject: [PATCH 05/20] Added error messaging for processing the extraction --- ecr-push.sh | 2 +- lib/extractor.rb | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/ecr-push.sh b/ecr-push.sh index 752efc3..9ac4826 100755 --- a/ecr-push.sh +++ b/ecr-push.sh @@ -6,4 +6,4 @@ aws ecr get-login-password --region us-east-2 | 
docker login --username AWS --password-stdin 721945215539.dkr.ecr.us-east-2.amazonaws.com docker build -t databank-archive-extractor-demo . docker tag databank-archive-extractor-demo:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest -docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest \ No newline at end of file +docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest diff --git a/lib/extractor.rb b/lib/extractor.rb index 427ebc2..6fb4360 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -14,6 +14,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id) s3_put_status = ExtractionStatus::SUCCESS s3_put_error = "" region = 'us-east-2' + s3_client = Aws::S3::Client.new(region: region) del_path = "./mnt/efs/#{bucket_name}_#{web_id}" local_path = "#{del_path}/#{object_key}" @@ -31,18 +32,25 @@ def self.extract(bucket_name, object_key, binary_name, web_id) ) puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}" rescue StandardError => e - error = {"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"} + error.merge!({"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"}) puts error end - extraction = Extraction.new(binary_name, local_path, web_id) - extraction.process - status = extraction.status - puts "status: #{status}" - puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR - error = error.merge(extraction.error) - items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } - retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} + + begin + extraction = Extraction.new(binary_name, local_path, web_id) + 
extraction.process + status = extraction.status + puts "status: #{status}" + puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR + error.merge!(extraction.error) + items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } + retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} + rescue StandardError => e + error.merge!({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"}) + retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => error, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []} + end + s3_path = "messages/#{web_id}.json" begin From 5e8940e36dfd331b61b23c784c65594e0b152e7a Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Fri, 23 Apr 2021 11:01:34 -0500 Subject: [PATCH 06/20] Changed error reporting from a hash to an array and added error type --- lib/extractor.rb | 34 +++++++++++++++++++--------------- lib/extractor/error_type.rb | 6 ++++++ lib/extractor/extraction.rb | 5 +++-- 3 files changed, 28 insertions(+), 17 deletions(-) create mode 100644 lib/extractor/error_type.rb diff --git a/lib/extractor.rb b/lib/extractor.rb index 6fb4360..66d4749 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -5,14 +5,15 @@ require_relative 'extractor/extraction.rb' require_relative 'extractor/extraction_status.rb' +require_relative 'extractor/error_type.rb' class Extractor def self.extract(bucket_name, object_key, binary_name, web_id) begin status = ExtractionStatus::ERROR - error = Hash.new + error = Array.new s3_put_status = ExtractionStatus::SUCCESS - s3_put_error = "" + s3_put_error = Array.new region = 'us-east-2' s3_client = Aws::S3::Client.new(region: region) @@ -32,8 +33,9 @@ def self.extract(bucket_name, object_key, binary_name, web_id) ) puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}" 
rescue StandardError => e - error.merge!({"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"}) - puts error + s3_error= "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}" + error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error}) + puts s3_error end @@ -43,12 +45,14 @@ def self.extract(bucket_name, object_key, binary_name, web_id) status = extraction.status puts "status: #{status}" puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR - error.merge!(extraction.error) + error.concat(extraction.error) items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } - retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} + errors = error.map {|o| Hash[o.each_pair.to_a]} + retVal = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} rescue StandardError => e - error.merge!({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"}) - retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => error, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []} + error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"}) + errors = error.map {|o| Hash[o.each_pair.to_a]} + retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []} end @@ -62,15 +66,15 @@ def self.extract(bucket_name, object_key, binary_name, web_id) puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key 
#{s3_path}" rescue StandardError => e s3_put_status = ExtractionStatus::ERROR - s3_put_error = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}" - puts s3_put_error + s3_put_error_message = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}" + s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message}) + puts s3_put_error_message end - if s3_put_status == ExtractionStatus::SUCCESS - retVal = {"bucket_name" => bucket_name, "object_key" => s3_path} - else - retVal = {"s3_status" => s3_put_status, "s3_put_report" =>s3_put_error} - end + s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]} + + retVal = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors} + sqs = Aws::SQS::Client.new(region: region) diff --git a/lib/extractor/error_type.rb b/lib/extractor/error_type.rb new file mode 100644 index 0000000..bafc541 --- /dev/null +++ b/lib/extractor/error_type.rb @@ -0,0 +1,6 @@ +class ErrorType + EXTRACTION = 'extraction_error' + PROCESSING = 'processing_error' + S3_PUT = 's3_put_error' + S3_GET = 's3_get_error' +end \ No newline at end of file diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index aeed1b2..bb67297 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -10,6 +10,7 @@ require_relative 'extraction_status.rb' require_relative 'peek_type.rb' +require_relative 'error_type.rb' class Extraction @@ -20,7 +21,7 @@ def initialize(binary_name, storage_path, id) @binary_name = binary_name @storage_path = storage_path @id = id - @error = Hash.new + @error = Array.new end ALLOWED_CHAR_NUM = 1024 * 8 @@ -54,7 +55,7 @@ def process end def report_problem(report) - self.error = {"task_id" => self.id, "extraction_report" => report} + self.error.push({"error_type" => ErrorType::EXTRACTION, "report" => report}) 
end def extract_features From 0908fd349270905170e3a0e6ea942e47b85b1f59 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Fri, 23 Apr 2021 14:38:06 -0500 Subject: [PATCH 07/20] Updated variable name to ruby standards --- lib/extractor.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/extractor.rb b/lib/extractor.rb index 66d4749..cef38a2 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -48,18 +48,18 @@ def self.extract(bucket_name, object_key, binary_name, web_id) error.concat(extraction.error) items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } errors = error.map {|o| Hash[o.each_pair.to_a]} - retVal = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} + return_value = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} rescue StandardError => e error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"}) errors = error.map {|o| Hash[o.each_pair.to_a]} - retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []} + return_value = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []} end s3_path = "messages/#{web_id}.json" begin s3_client.put_object({ - body: retVal.to_json, + body: return_value.to_json, bucket: "databank-demo-main", key: s3_path, }) @@ -73,7 +73,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id) s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]} - retVal = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors} + return_value = 
{"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors} sqs = Aws::SQS::Client.new(region: region) @@ -86,7 +86,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id) # Create and send a message. sqs.send_message({ queue_url: queue_url, - message_body: retVal.to_json, + message_body: return_value.to_json, message_attributes: {} }) puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}" From d942f2af4f2bda283ef98a19af2ff9cd9522cee9 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Mon, 26 Apr 2021 15:39:18 -0500 Subject: [PATCH 08/20] Reconfigured to get mime type from Databank --- lib/extractor.rb | 4 ++-- lib/extractor/extraction.rb | 22 +++++++++------------- lib/extractor/mime_type.rb | 5 +++++ 3 files changed, 16 insertions(+), 15 deletions(-) create mode 100644 lib/extractor/mime_type.rb diff --git a/lib/extractor.rb b/lib/extractor.rb index cef38a2..1570025 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -8,7 +8,7 @@ require_relative 'extractor/error_type.rb' class Extractor - def self.extract(bucket_name, object_key, binary_name, web_id) + def self.extract(bucket_name, object_key, binary_name, web_id, mime_type) begin status = ExtractionStatus::ERROR error = Array.new @@ -40,7 +40,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id) begin - extraction = Extraction.new(binary_name, local_path, web_id) + extraction = Extraction.new(binary_name, local_path, web_id, mime_type) extraction.process status = extraction.status puts "status: #{status}" diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index bb67297..9d5c94e 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -11,17 +11,19 @@ require_relative 'extraction_status.rb' require_relative 'peek_type.rb' require_relative 'error_type.rb' +require_relative 'mime_type.rb' class Extraction - attr_accessor :binary_name, :storage_path, :status, 
:peek_type, :peek_text, :id, :nested_items, :error + attr_accessor :binary_name, :storage_path, :status, :peek_type, :peek_text, :id, :nested_items, :error, :mime_type - def initialize(binary_name, storage_path, id) + def initialize(binary_name, storage_path, id, mime_type) @nested_items = Array.new @binary_name = binary_name @storage_path = storage_path @id = id @error = Array.new + @mime_type = mime_type end ALLOWED_CHAR_NUM = 1024 * 8 @@ -59,19 +61,16 @@ def report_problem(report) end def extract_features - mime_guess = top_level_mime || Extraction.mime_from_filename(self.binary_name) || 'application/octet-stream' - - mime_parts = mime_guess.split("/") + mime_parts = @mime_type.split("/") + subtype = mime_parts[1].downcase - nonzip_archive_subtypes = ['x-7z-compressed', 'x-tar'] - subtype = mime_parts[1].downcase - if subtype == 'zip' + if MimeType::ZIP.include?(subtype) return extract_zip - elsif nonzip_archive_subtypes.include?(subtype) + elsif MimeType::NON_ZIP_ARCHIVE.include?(subtype) return extract_archive - elsif self.binary_name.chars.last(6).join == 'tar.gz' + elsif MimeType::GZIP.include?(subtype) return extract_gzip else return extract_default @@ -79,9 +78,6 @@ def extract_features end - def top_level_mime - Extraction.mime_from_path(self.storage_path) - end def self.mime_from_path(path) file_mime_response = MimeMagic.by_path(File.open("#{path}")).to_s diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb new file mode 100644 index 0000000..b915a34 --- /dev/null +++ b/lib/extractor/mime_type.rb @@ -0,0 +1,5 @@ +class MimeType + ZIP = ["x-zip-compressed", "zip"] + NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar"] + GZIP = ["x-gzip","gzip"] +end \ No newline at end of file From d0125fee9a455892b66ce46a9f8f22c12387f077 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Wed, 28 Apr 2021 14:58:19 -0500 Subject: [PATCH 09/20] Corrected EFS path --- lib/extractor.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/extractor.rb 
b/lib/extractor.rb index 1570025..fba17dc 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -17,7 +17,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id, mime_type) region = 'us-east-2' s3_client = Aws::S3::Client.new(region: region) - del_path = "./mnt/efs/#{bucket_name}_#{web_id}" + del_path = "/mnt/efs/#{bucket_name}_#{web_id}" local_path = "#{del_path}/#{object_key}" dirname = File.dirname(local_path) From 87b886ce91c1e1de43e40f9a4eab0625452ca2f0 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Mon, 3 May 2021 11:29:14 -0500 Subject: [PATCH 10/20] Added support for xz files --- lib/extractor/mime_type.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb index b915a34..03df00b 100644 --- a/lib/extractor/mime_type.rb +++ b/lib/extractor/mime_type.rb @@ -1,5 +1,5 @@ class MimeType ZIP = ["x-zip-compressed", "zip"] - NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar"] + NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz"] GZIP = ["x-gzip","gzip"] end \ No newline at end of file From 96e8816764e9030ec159a13d352d239bae0d6cc7 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Wed, 5 May 2021 08:09:17 -0500 Subject: [PATCH 11/20] Added support for rar mime type --- lib/extractor/extraction.rb | 22 ++++------------------ lib/extractor/mime_type.rb | 2 +- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index 9d5c94e..433c3de 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -198,7 +198,7 @@ def extract_archive if !is_ds_store(entry_path) && !is_mac_thing(entry_path) entry_paths << entry_path - if is_directory(entry.pathname) + if entry.directory? 
|| is_directory(entry.pathname) create_item(entry_path, name_part(entry_path), @@ -212,14 +212,7 @@ def extract_archive extracted_entry_dir = File.dirname(extracted_entry_path) FileUtils.mkdir_p extracted_entry_dir - entry_size = 0 - - File.open(extracted_entry_path, 'wb') do |entry_file| - ar.read_data(1024) do |x| - entry_file.write(x) - entry_size = entry_size + x.length - end - end + File.open(extracted_entry_path, 'wb') raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path) @@ -293,14 +286,7 @@ def extract_gzip extracted_entry_dir = File.dirname(extracted_entry_path) FileUtils.mkdir_p extracted_entry_dir - entry_size = 0 - - File.open(extracted_entry_path, 'wb') do |entry_file| - entry.read(1024) do |x| - entry_file.write(x) - entry_size = entry_size + x.length - end - end + File.open(extracted_entry_path, 'wb') raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path) @@ -367,7 +353,7 @@ def valid_entry_path(entry_path) end def is_directory(path) - ends_in_slash(path) && !is_ds_store(path) && !is_mac_thing(path) + File.directory?(path) || (ends_in_slash(path) && !is_ds_store(path) && !is_mac_thing(path)) end def is_mac_thing(path) diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb index 03df00b..4051a7e 100644 --- a/lib/extractor/mime_type.rb +++ b/lib/extractor/mime_type.rb @@ -1,5 +1,5 @@ class MimeType ZIP = ["x-zip-compressed", "zip"] - NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz"] + NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz", "x-rar", "x-rar-compressed"] GZIP = ["x-gzip","gzip"] end \ No newline at end of file From b96319b4441b3d56efa5af5d5aeab5d1bb30a311 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Wed, 5 May 2021 14:31:43 -0500 Subject: [PATCH 12/20] Added support for gtar files --- lib/extractor/mime_type.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb index 4051a7e..3feaa3c 
100644 --- a/lib/extractor/mime_type.rb +++ b/lib/extractor/mime_type.rb @@ -1,5 +1,5 @@ class MimeType ZIP = ["x-zip-compressed", "zip"] - NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz", "x-rar", "x-rar-compressed"] + NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz", "x-rar", "x-rar-compressed", "x-gtar"] GZIP = ["x-gzip","gzip"] end \ No newline at end of file From 5d185214c9a242e576e64c7de60bed3f419f77c1 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Fri, 7 May 2021 14:52:20 -0500 Subject: [PATCH 13/20] Corrected too many files open bug --- lib/extractor/extraction.rb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index 433c3de..004ad73 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -80,7 +80,9 @@ def extract_features def self.mime_from_path(path) - file_mime_response = MimeMagic.by_path(File.open("#{path}")).to_s + file = File.open("#{path}") + file_mime_response = MimeMagic.by_path(file).to_s + file.close if file_mime_response.length > 0 file_mime_response @@ -212,7 +214,7 @@ def extract_archive extracted_entry_dir = File.dirname(extracted_entry_path) FileUtils.mkdir_p extracted_entry_dir - File.open(extracted_entry_path, 'wb') + file = File.open(extracted_entry_path, 'wb') raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path) @@ -226,7 +228,7 @@ def extract_archive entry.size, mime_guess, false) - + file.close File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) end @@ -286,7 +288,7 @@ def extract_gzip extracted_entry_dir = File.dirname(extracted_entry_path) FileUtils.mkdir_p extracted_entry_dir - File.open(extracted_entry_path, 'wb') + file = File.open(extracted_entry_path, 'wb') raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path) @@ -300,7 +302,7 @@ def extract_gzip entry.size, mime_guess, false) - + file.close 
File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) end From 7f4a63a2d111c8f8c0ff98e94517b50b5a1aadc2 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Wed, 23 Jun 2021 13:46:51 -0400 Subject: [PATCH 14/20] Fixed bug in processing tar archives --- lib/extractor/extraction.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index 004ad73..258735d 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -124,7 +124,7 @@ def extract_zip entry_path = valid_entry_path(entry.name) - if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) + if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) entry_paths << entry_path @@ -197,7 +197,7 @@ def extract_archive if entry_path - if !is_ds_store(entry_path) && !is_mac_thing(entry_path) + if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) entry_paths << entry_path if entry.directory? 
|| is_directory(entry.pathname) From b294f1d2a0932a564f8439f1793bfc81dc394457 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Thu, 30 Nov 2023 14:53:40 -0700 Subject: [PATCH 15/20] updated to add test coverage --- .idea/.gitignore | 8 + .idea/databank-archive-extractor.iml | 47 +++ .idea/inspectionProfiles/Project_Default.xml | 6 + .idea/misc.xml | 6 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + .ruby-version | 1 + Dockerfile | 8 +- Gemfile | 12 +- Gemfile.lock | 100 ++++-- Rakefile | 11 +- bin/console | 15 - bin/set-test-vars.rb | 5 + bin/setup | 8 - config/settings.yml | 3 + config/settings/demo.yml | 8 + config/settings/prod.yml | 0 config/settings/test.yml | 8 + docker/extractor/Dockerfile-test | 27 ++ lib/archive_extractor.rb | 131 +++++++ lib/extractor.rb | 107 +----- lib/extractor/extraction.rb | 315 ++++++---------- lib/extractor/extraction_type.rb | 6 + test/archive_extractor_test.rb | 158 ++++++++ test/extraction_test.rb | 356 +++++++++++++++++++ test/test.tar | Bin 0 -> 2048 bytes test/test.txt.gz | Bin 0 -> 37 bytes test/test.zip | Bin 0 -> 178 bytes test/test_helper.rb | 17 + 29 files changed, 1011 insertions(+), 366 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/databank-archive-extractor.iml create mode 100644 .idea/inspectionProfiles/Project_Default.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .ruby-version delete mode 100755 bin/console create mode 100644 bin/set-test-vars.rb delete mode 100755 bin/setup create mode 100644 config/settings.yml create mode 100644 config/settings/demo.yml create mode 100644 config/settings/prod.yml create mode 100644 config/settings/test.yml create mode 100644 docker/extractor/Dockerfile-test create mode 100644 lib/archive_extractor.rb create mode 100644 lib/extractor/extraction_type.rb create mode 100644 test/archive_extractor_test.rb create mode 100644 test/extraction_test.rb create mode 100644 test/test.tar 
create mode 100644 test/test.txt.gz create mode 100644 test/test.zip create mode 100644 test/test_helper.rb diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/databank-archive-extractor.iml b/.idea/databank-archive-extractor.iml new file mode 100644 index 0000000..7a849e6 --- /dev/null +++ b/.idea/databank-archive-extractor.iml @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..b0db9b0 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..1f18249 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..1127073 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..ef538c2 --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +3.1.2 diff --git a/Dockerfile b/Dockerfile index 2de759a..4f10663 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version. 
-FROM ruby:2.7.2 +FROM ruby:3.1.2 ENV RAILS_ENV=production ENV RAILS_LOG_TO_STDOUT=true @@ -18,15 +18,11 @@ WORKDIR app # Copy the Gemfile as well as the Gemfile.lock and install gems. # This is a separate step so the dependencies will be cached. COPY Gemfile Gemfile.lock ./ -RUN gem install bundler && bundle install --without development test --jobs 20 --retry 5 +RUN gem install bundler && bundle install # Copy the main application, except whatever is listed in .dockerignore. COPY . ./ -#RUN bin/rails assets:precompile - -EXPOSE 3000 - # This is the web server entry point. It will need to be overridden when # running the workers. CMD ["echo", "Error running task, please check the container override command!"] diff --git a/Gemfile b/Gemfile index 73c0c87..2013c04 100644 --- a/Gemfile +++ b/Gemfile @@ -1,16 +1,18 @@ source "https://rubygems.org" git_source(:github) { |repo| "https://github.com/#{repo}.git" } -ruby '2.7.2' - # Use mime-types to determine mimetypes based on extension gem 'mime-types', require: 'mime/types/full' # Use rubyzip to read zip files gem 'rubyzip' +gem 'config' + # Use archive for non-zip archive files -gem 'libarchive' +# gem 'libarchive' +# gem 'libarchive-ruby' +gem 'ffi-libarchive' # Use os to interact with operating system gem 'os' @@ -24,3 +26,7 @@ gem "aws-sdk-s3" gem "aws-sdk-sqs" +gem 'minitest' + +gem 'simplecov' + diff --git a/Gemfile.lock b/Gemfile.lock index e49612c..d6d7b8e 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,53 +1,99 @@ GEM remote: https://rubygems.org/ specs: - aws-eventstream (1.1.1) - aws-partitions (1.436.0) - aws-sdk-core (3.113.0) + aws-eventstream (1.2.0) + aws-partitions (1.854.0) + aws-sdk-core (3.187.1) aws-eventstream (~> 1, >= 1.0.2) - aws-partitions (~> 1, >= 1.239.0) + aws-partitions (~> 1, >= 1.651.0) + aws-sigv4 (~> 1.5) + jmespath (~> 1, >= 1.6.1) + aws-sdk-kms (1.72.0) + aws-sdk-core (~> 3, >= 3.184.0) aws-sigv4 (~> 1.1) - jmespath (~> 1.0) - aws-sdk-kms (1.43.0) - aws-sdk-core (~> 3, >= 
3.112.0) - aws-sigv4 (~> 1.1) - aws-sdk-s3 (1.93.0) - aws-sdk-core (~> 3, >= 3.112.0) + aws-sdk-s3 (1.137.0) + aws-sdk-core (~> 3, >= 3.181.0) aws-sdk-kms (~> 1) + aws-sigv4 (~> 1.6) + aws-sdk-sqs (1.67.0) + aws-sdk-core (~> 3, >= 3.184.0) aws-sigv4 (~> 1.1) - aws-sdk-sqs (1.38.0) - aws-sdk-core (~> 3, >= 3.112.0) - aws-sigv4 (~> 1.1) - aws-sigv4 (1.2.3) + aws-sigv4 (1.6.1) aws-eventstream (~> 1, >= 1.0.2) - jmespath (1.4.0) - mime-types (3.3.1) + concurrent-ruby (1.2.2) + config (5.0.0) + deep_merge (~> 1.2, >= 1.2.1) + dry-validation (~> 1.0, >= 1.0.0) + deep_merge (1.2.2) + docile (1.4.0) + dry-configurable (1.1.0) + dry-core (~> 1.0, < 2) + zeitwerk (~> 2.6) + dry-core (1.0.1) + concurrent-ruby (~> 1.0) + zeitwerk (~> 2.6) + dry-inflector (1.0.0) + dry-initializer (3.1.1) + dry-logic (1.5.0) + concurrent-ruby (~> 1.0) + dry-core (~> 1.0, < 2) + zeitwerk (~> 2.6) + dry-schema (1.13.3) + concurrent-ruby (~> 1.0) + dry-configurable (~> 1.0, >= 1.0.1) + dry-core (~> 1.0, < 2) + dry-initializer (~> 3.0) + dry-logic (>= 1.4, < 2) + dry-types (>= 1.7, < 2) + zeitwerk (~> 2.6) + dry-types (1.7.1) + concurrent-ruby (~> 1.0) + dry-core (~> 1.0) + dry-inflector (~> 1.0) + dry-logic (~> 1.4) + zeitwerk (~> 2.6) + dry-validation (1.10.0) + concurrent-ruby (~> 1.0) + dry-core (~> 1.0, < 2) + dry-initializer (~> 3.0) + dry-schema (>= 1.12, < 2) + zeitwerk (~> 2.6) + jmespath (1.6.2) + mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2021.0225) + mime-types-data (3.2023.1003) mimemagic (0.3.10) nokogiri (~> 1) rake - nokogiri (1.11.2-x86_64-darwin) + minitest (5.20.0) + nokogiri (1.15.5-x86_64-darwin) racc (~> 1.4) - os (1.1.1) - racc (1.5.2) - rake (13.0.3) - rubyzip (2.3.0) + os (1.1.4) + racc (1.7.3) + rake (13.1.0) + rubyzip (2.3.2) + simplecov (0.22.0) + docile (~> 1.1) + simplecov-html (~> 0.11) + simplecov_json_formatter (~> 0.1) + simplecov-html (0.12.3) + simplecov_json_formatter (0.1.4) + zeitwerk (2.6.12) PLATFORMS - x86_64-darwin-19 + 
x86_64-darwin-21 DEPENDENCIES aws-sdk-s3 aws-sdk-sqs + config mime-types mimemagic (~> 0.3.6) + minitest os rake (~> 13.0) rubyzip - -RUBY VERSION - ruby 2.7.2p137 + simplecov BUNDLED WITH - 2.2.4 + 2.3.22 diff --git a/Rakefile b/Rakefile index cd510a0..0abb012 100644 --- a/Rakefile +++ b/Rakefile @@ -1,4 +1,11 @@ # frozen_string_literal: true -require "bundler/gem_tasks" -task default: %i[] +require 'rake/testtask' +require 'simplecov' +require_relative 'bin/set-test-vars' + +Rake::TestTask.new(:test) do |t| + t.libs << 'lib' << 'test' + # t.libs << 'lib' + t.test_files = FileList['test/*_test.rb'] +end diff --git a/bin/console b/bin/console deleted file mode 100755 index 8096e50..0000000 --- a/bin/console +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env ruby -# frozen_string_literal: true - -require "bundler/setup" -require "databank/archive/extractor" - -# You can add fixtures and/or initialization code here to make experimenting -# with your gem easier. You can also use a different console, if you like. - -# (If you use this, don't forget to add pry to your Gemfile!) -# require "pry" -# Pry.start - -require "irb" -IRB.start(__FILE__) diff --git a/bin/set-test-vars.rb b/bin/set-test-vars.rb new file mode 100644 index 0000000..7959c7c --- /dev/null +++ b/bin/set-test-vars.rb @@ -0,0 +1,5 @@ +#!/usr/bin/env ruby + +ENV['RUBY_ENV'] = 'test' +ENV['RUBY_HOME'] = ENV['IS_DOCKER'] == 'true' ? 
'/extractor' : '/Users/gschmitt/workspace/databank-archive-extractor' +ENV['RUBY_TEST_HOME'] = "#{ENV['RUBY_HOME']}/test" diff --git a/bin/setup b/bin/setup deleted file mode 100755 index dce67d8..0000000 --- a/bin/setup +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -IFS=$'\n\t' -set -vx - -bundle install - -# Do any other automated setup that you need to do here diff --git a/config/settings.yml b/config/settings.yml new file mode 100644 index 0000000..ed17934 --- /dev/null +++ b/config/settings.yml @@ -0,0 +1,3 @@ +aws: + region: "us-east-2" + diff --git a/config/settings/demo.yml b/config/settings/demo.yml new file mode 100644 index 0000000..18367ed --- /dev/null +++ b/config/settings/demo.yml @@ -0,0 +1,8 @@ +aws: + efs: + mount_point: "/mnt/efs/" + sqs: + queue_name: "extractor-to-databank-demo" + queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-demo" + s3: + json_bucket: "databank-demo-main" diff --git a/config/settings/prod.yml b/config/settings/prod.yml new file mode 100644 index 0000000..e69de29 diff --git a/config/settings/test.yml b/config/settings/test.yml new file mode 100644 index 0000000..db955db --- /dev/null +++ b/config/settings/test.yml @@ -0,0 +1,8 @@ +aws: + efs: + mount_point: "test/efs/" + sqs: + queue_name: "extractor-to-databank-test" + queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-test" + s3: + json_bucket: "databank-test-main" \ No newline at end of file diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test new file mode 100644 index 0000000..8baa77c --- /dev/null +++ b/docker/extractor/Dockerfile-test @@ -0,0 +1,27 @@ +FROM ruby:3.1.2 +#FROM --platform=linux/arm64 ruby:3.1.2 + +ENV RAILS_ENV=test +ENV RAILS_LOG_TO_STDOUT=true +ENV RUBY_HOME=/extractor +ENV IS_DOCKER=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + libpq-dev \ + libarchive-dev + +# Copy the Gemfile as well as the 
Gemfile.lock and install gems. +# This is a separate step so the dependencies will be cached. +RUN mkdir extractor +WORKDIR extractor + +#COPY Gemfile Gemfile.lock ./ +COPY Gemfile ./ +RUN gem install bundler && bundle install + +# Copy the main application, except whatever is listed in .dockerignore. +COPY . ./ + +CMD ["rake", "test"] \ No newline at end of file diff --git a/lib/archive_extractor.rb b/lib/archive_extractor.rb new file mode 100644 index 0000000..35c411d --- /dev/null +++ b/lib/archive_extractor.rb @@ -0,0 +1,131 @@ +# frozen_string_literal: true +require 'aws-sdk-sqs' +require 'aws-sdk-s3' +require 'fileutils' +require 'json' +require 'config' +require 'logger' + + +require_relative 'extractor/extraction' +require_relative 'extractor/extraction_status' +require_relative 'extractor/error_type' + +class ArchiveExtractor + attr_accessor :s3, :sqs, :bucket_name, :object_key, :binary_name, :web_id, :mime_type, :extraction + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV'])) + LOGGER = Logger.new(STDOUT) + + def initialize(bucket_name, object_key, binary_name, web_id, mime_type, sqs, s3) + @bucket_name = bucket_name + @object_key = object_key + @binary_name = binary_name + @web_id = web_id + @mime_type = mime_type + @sqs = sqs + @s3 = s3 + end + + def extract + begin + error = [] + + del_path = "#{Settings.aws.efs.mount_point}#{@bucket_name}_#{@web_id}" + local_path = "#{del_path}/#{@object_key}" + + dirname = File.dirname(local_path) + unless File.directory?(dirname) + FileUtils.mkdir_p(dirname) + end + + get_object(local_path, error) + + extraction = Extraction.new(@binary_name, local_path, @web_id, @mime_type) + return_value = perform_extraction(extraction, error) + s3_path = "messages/#{@web_id}.json" + s3_put_status, s3_put_error = put_json_response(return_value, s3_path) + + s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]} + + return_value = {"bucket_name" => @bucket_name, "object_key" => 
s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors} + send_sqs_message(return_value) + + ensure + FileUtils.rm_rf(dirname, :secure => true) + FileUtils.rm_rf(del_path, :secure => true) + end + end + + def get_object(local_path, error) + begin + @s3.get_object({ + response_target: local_path, + bucket: @bucket_name, + key: @object_key, + }) + LOGGER.info("Getting object #{@object_key} with ID #{@web_id} from #{@bucket_name}") + rescue StandardError => e + s3_error = "Error getting object #{@object_key} with ID #{@web_id} from S3 bucket #{@bucket_name}: #{e.message}" + LOGGER.error(s3_error) + error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error}) + end + return error + end + + def perform_extraction(extraction, error) + begin + extraction.process + status = extraction.status + LOGGER.info("status: #{status}") + LOGGER.error("error: #{extraction.error}") if status == ExtractionStatus::ERROR + error.concat(extraction.error) + items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } + errors = error.map {|o| Hash[o.each_pair.to_a]} + return_value = {"web_id" => @web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} + rescue StandardError => e + error.push({"task_id" => @web_id, "extraction_process_report" => "Error extracting #{@object_key} with ID #{@web_id}: #{e.message}"}) + errors = error.map {|o| Hash[o.each_pair.to_a]} + return_value = {"web_id" => @web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => nil, "nested_items" => []} + end + return return_value + end + + def send_sqs_message(return_value) + # Send a message to a queue. + queue_name = Settings.aws.sqs.queue_name + queue_url = Settings.aws.sqs.queue_url + + begin + # Create and send a message. 
+ @sqs.send_message({ + queue_url: queue_url, + message_body: return_value.to_json, + message_attributes: {} + }) + LOGGER.info("Sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}") + rescue StandardError => e + LOGGER.error("Error sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}: #{e.message}") + end + end + + def put_json_response(return_value, s3_path) + s3_put_error = [] + begin + @s3.put_object({ + body: return_value.to_json, + bucket: Settings.aws.s3.json_bucket, + key: s3_path, + }) + LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name} with key #{s3_path}") + s3_put_status = ExtractionStatus::SUCCESS + rescue StandardError => e + s3_put_status = ExtractionStatus::ERROR + s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name}: #{e.message}" + s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message}) + LOGGER.error(s3_put_error_message) + end + return s3_put_status, s3_put_error + end + + +end diff --git a/lib/extractor.rb b/lib/extractor.rb index fba17dc..94f7176 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -1,105 +1,18 @@ require 'aws-sdk-sqs' require 'aws-sdk-s3' -require 'fileutils' -require 'json' +require 'config' -require_relative 'extractor/extraction.rb' -require_relative 'extractor/extraction_status.rb' -require_relative 'extractor/error_type.rb' +require_relative 'archive_extractor' class Extractor - def self.extract(bucket_name, object_key, binary_name, web_id, mime_type) - begin - status = ExtractionStatus::ERROR - error = Array.new - s3_put_status = ExtractionStatus::SUCCESS - s3_put_error = Array.new - region = 'us-east-2' - - s3_client = Aws::S3::Client.new(region: region) - del_path = "/mnt/efs/#{bucket_name}_#{web_id}" - local_path = "#{del_path}/#{object_key}" - - dirname = File.dirname(local_path) - unless 
File.directory?(dirname) - FileUtils.mkdir_p(dirname) - end - - begin - s3_client.get_object( - response_target: local_path, - bucket: bucket_name, - key: object_key, - ) - puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}" - rescue StandardError => e - s3_error= "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}" - error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error}) - puts s3_error - end - - - begin - extraction = Extraction.new(binary_name, local_path, web_id, mime_type) - extraction.process - status = extraction.status - puts "status: #{status}" - puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR - error.concat(extraction.error) - items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } - errors = error.map {|o| Hash[o.each_pair.to_a]} - return_value = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} - rescue StandardError => e - error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"}) - errors = error.map {|o| Hash[o.each_pair.to_a]} - return_value = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []} - end - - - s3_path = "messages/#{web_id}.json" - begin - s3_client.put_object({ - body: return_value.to_json, - bucket: "databank-demo-main", - key: s3_path, - }) - puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key #{s3_path}" - rescue StandardError => e - s3_put_status = ExtractionStatus::ERROR - s3_put_error_message = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}" - s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => 
s3_put_error_message}) - puts s3_put_error_message - end - - s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]} + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV'])) - return_value = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors} - - - sqs = Aws::SQS::Client.new(region: region) - - # Send a message to a queue. - queue_name = "extractor-to-databank-demo" - queue_url = sqs.get_queue_url(queue_name: queue_name).queue_url - - begin - # Create and send a message. - sqs.send_message({ - queue_url: queue_url, - message_body: return_value.to_json, - message_attributes: {} - }) - puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}" - rescue StandardError => e - puts "Error sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}: #{e.message}" - end - - - ensure - FileUtils.rm_rf(dirname, :secure => true) - FileUtils.rm_rf(del_path, :secure => true) - - end - end + def self.extract(bucket_name, object_key, binary_name, web_id, mime_type) + region = Settings.aws.region + s3_client = Aws::S3::Client.new(region: region) + sqs = Aws::SQS::Client.new(region: region) + archive_extractor = ArchiveExtractor.new(bucket_name, object_key, binary_name, web_id, mime_type, sqs, s3_client) + archive_extractor.extract + end end \ No newline at end of file diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index 258735d..f5eaaae 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -5,49 +5,51 @@ require 'mimemagic/overlay' require 'zip' require 'zlib' -require 'libarchive' +require 'ffi-libarchive' require 'rubygems/package' +require 'config' +require 'logger' -require_relative 'extraction_status.rb' -require_relative 'peek_type.rb' -require_relative 'error_type.rb' -require_relative 'mime_type.rb' +require_relative 'extraction_status' +require_relative 
'extraction_type' +require_relative 'peek_type' +require_relative 'error_type' +require_relative 'mime_type' class Extraction attr_accessor :binary_name, :storage_path, :status, :peek_type, :peek_text, :id, :nested_items, :error, :mime_type - + ALLOWED_CHAR_NUM = 1024 * 8 + ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8 + LOGGER = Logger.new(STDOUT) def initialize(binary_name, storage_path, id, mime_type) - @nested_items = Array.new @binary_name = binary_name @storage_path = storage_path @id = id - @error = Array.new @mime_type = mime_type + @nested_items = [] + @error = [] end - ALLOWED_CHAR_NUM = 1024 * 8 - ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8 - def process begin features_extracted = extract_features if features_extracted - self.status = ExtractionStatus::SUCCESS + @status = ExtractionStatus::SUCCESS else - self.status = ExtractionStatus::ERROR + @status = ExtractionStatus::ERROR end rescue StandardError => error - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE report_problem(error.message) ensure - if self.peek_text && self.peek_text.encoding.name != 'UTF-8' + if @peek_text && @peek_text.encoding.name != 'UTF-8' begin - self.peek_text.encode('UTF-8') + @peek_text.encode('UTF-8') rescue Encoding::UndefinedConversionError - self.peek_text = nil - self.peek_type = PeekType::NONE + @peek_text = nil + @peek_type = PeekType::NONE report_problem('invalid encoding for peek text') rescue Exception => ex report_problem("invalid encoding and problem character: #{ex.class}, #{ex.message}") @@ -57,15 +59,13 @@ def process end def report_problem(report) - self.error.push({"error_type" => ErrorType::EXTRACTION, "report" => report}) + @error.push({"error_type" => ErrorType::EXTRACTION, "report" => report}) end def extract_features mime_parts = @mime_type.split("/") subtype = mime_parts[1].downcase - - if MimeType::ZIP.include?(subtype) return extract_zip elsif 
MimeType::NON_ZIP_ARCHIVE.include?(subtype) @@ -75,11 +75,10 @@ def extract_features else return extract_default end - end - def self.mime_from_path(path) + def mime_from_path(path) file = File.open("#{path}") file_mime_response = MimeMagic.by_path(file).to_s file.close @@ -97,7 +96,7 @@ def self.mime_from_path(path) end end - def self.mime_from_filename(filename) + def mime_from_filename(filename) mime_guesses = MIME::Types.type_for(filename).first.content_type if mime_guesses.length > 0 mime_guesses @@ -107,78 +106,27 @@ def self.mime_from_filename(filename) end def create_item(item_path, item_name, item_size, media_type, is_directory) - item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type, "is_directory" => is_directory} + item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type, + "is_directory" => is_directory} @nested_items.push(item) - end def extract_zip begin - puts "Extracting zip file #{binary_name}" + LOGGER.info("Extracting zip file #{@binary_name}") entry_paths = [] - Zip::File.open(self.storage_path) do |zip_file| + Zip::File.open(@storage_path) do |zip_file| zip_file.each do |entry| - if entry.name_safe? 
- - entry_path = valid_entry_path(entry.name) - - - if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) - - entry_paths << entry_path - - if is_directory(entry.name) - - create_item(entry_path, - name_part(entry_path), - entry.size, - 'directory', - true) - - else - - storage_dir = File.dirname(storage_path) - extracted_entry_path = File.join(storage_dir, entry_path) - extracted_entry_dir = File.dirname(extracted_entry_path) - FileUtils.mkdir_p extracted_entry_dir - - raise Exception.new("extracted entry somehow already there?!!?!") if File.exist?(extracted_entry_path) - - entry.extract(extracted_entry_path) - - raise Exception.new("extracting entry not working!") unless File.exist?(extracted_entry_path) - - mime_guess = Extraction.mime_from_path(extracted_entry_path) || - Extraction.mime_from_filename(entry.name) || - 'application/octet-stream' - - create_item(entry_path, - name_part(entry_path), - entry.size, - mime_guess, - false) - File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) - end - - end + entry_paths = extract_entry(entry, entry.name, entry_paths, ExtractionType::ZIP) end end end - - - if entry_paths.length > 0 - self.peek_type = PeekType::LISTING - self.peek_text = entry_paths_arr_to_html(entry_paths) - else - self.peek_type = PeekType::NONE - report_problem("no items found for zip listing for task #{self.id}") - end - + handle_entry_paths(entry_paths) return true rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE report_problem("problem extracting zip listing for task: #{ex.message}") #return false raise ex @@ -187,161 +135,112 @@ def extract_zip def extract_archive begin - puts "Extracting archive file #{binary_name}" + LOGGER.info("Extracting archive file #{@binary_name}") entry_paths = [] - Archive.read_open_filename(self.storage_path) do |ar| while entry = 
ar.next_header - - entry_path = valid_entry_path(entry.pathname) - - if entry_path - - if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) - entry_paths << entry_path - - if entry.directory? || is_directory(entry.pathname) - - create_item(entry_path, - name_part(entry_path), - entry.size, - 'directory', - true) - else - - storage_dir = File.dirname(storage_path) - extracted_entry_path = File.join(storage_dir, entry_path) - extracted_entry_dir = File.dirname(extracted_entry_path) - FileUtils.mkdir_p extracted_entry_dir - - file = File.open(extracted_entry_path, 'wb') - - raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path) - - mime_guess = Extraction.mime_from_path(extracted_entry_path) || - mime_from_filename(entry.name) || - 'application/octet-stream' - - - create_item(entry_path, - name_part(entry_path), - entry.size, - mime_guess, - false) - file.close - File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) - end - - end - - end + entry_paths = extract_entry(entry, entry.pathname, entry_paths, ExtractionType::ARCHIVE) end end - - if entry_paths.length > 0 - self.peek_type = PeekType::LISTING - self.peek_text = entry_paths_arr_to_html(entry_paths) - return true - else - self.peek_type = PeekType::NONE - report_problem("no items found for archive listing for task #{self.id}") - return false - end + handle_entry_paths(entry_paths) rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE - - report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}") + LOGGER.error(ex) + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE + report_problem("problem extracting extract listing for task #{@id}: #{ex.message}") return false end end def extract_gzip begin - puts "Extracting gzip file #{binary_name}" + LOGGER.info("Extracting gzip file #{@binary_name}") entry_paths = [] - tar_extract = 
Gem::Package::TarReader.new(Zlib::GzipReader.open(self.storage_path)) - tar_extract.rewind # The extract has to be rewinded after every iteration + tar_extract = Gem::Package::TarReader.new(Zlib::GzipReader.open(@storage_path)) + tar_extract.rewind # The extract has to be rewound after every iteration tar_extract.each do |entry| + entry_paths = extract_entry(entry, entry.full_name, entry_paths, ExtractionType::GZIP) + end + handle_entry_paths(entry_paths) - entry_path = valid_entry_path(entry.full_name) - if entry_path - - if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) - - - entry_paths << entry_path - - if entry.directory? - - create_item(entry_path, - name_part(entry_path), - entry.size, - 'directory', - true) - else - - storage_dir = File.dirname(storage_path) - extracted_entry_path = File.join(storage_dir, entry_path) - extracted_entry_dir = File.dirname(extracted_entry_path) - FileUtils.mkdir_p extracted_entry_dir - - file = File.open(extracted_entry_path, 'wb') - - raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path) - - mime_guess = Extraction.mime_from_path(extracted_entry_path) || - mime_from_filename(entry.name) || - 'application/octet-stream' + rescue StandardError => ex + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE + report_problem("problem extracting extract listing for task #{@id}: #{ex.message}") + return false + end + ensure + tar_extract.close + end - create_item(entry_path, - name_part(entry_path), - entry.size, - mime_guess, - false) - file.close - File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) - end + def extract_entry(entry, entry_name, entry_paths, type) + entry_path = valid_entry_path(entry_name) + if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) + entry_paths << entry_path + if entry.directory? 
|| is_directory(entry_name) + create_item(entry_path, + name_part(entry_path), + entry.size, + 'directory', + true) + else + storage_dir = File.dirname(storage_path) + extracted_entry_path = File.join(storage_dir, entry_path) + extracted_entry_dir = File.dirname(extracted_entry_path) + FileUtils.mkdir_p extracted_entry_dir - end + raise Exception.new("extracted entry somehow already there?!!?!") if File.exist?(extracted_entry_path) + file = nil + case type + when ExtractionType::ZIP + entry.extract(extracted_entry_path) + else + file = File.open(extracted_entry_path, 'wb') end + raise("extracting #{type} entry not working!") unless File.exist?(extracted_entry_path) + + mime_guess = mime_from_path(extracted_entry_path) || + mime_from_filename(entry_name) || + 'application/octet-stream' + + create_item(entry_path, + name_part(entry_path), + entry.size, + mime_guess, + false) + file.close if file + File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) end + end + entry_paths + end - if entry_paths.length > 0 - self.peek_type = PeekType::LISTING - self.peek_text = entry_paths_arr_to_html(entry_paths) - return true - else - self.peek_type = PeekType::NONE - report_problem("no items found for archive listing for task #{self.id}") - return false - end - - rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE - - report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}") + def handle_entry_paths(entry_paths) + if entry_paths.length > 0 + @peek_type = PeekType::LISTING + @peek_text = entry_paths_arr_to_html(entry_paths) + puts @peek_text + return true + else + @peek_type = PeekType::NONE + report_problem("no items found for archive listing for task #{@id}") return false - - tar_extract.close end - end def extract_default - puts "Default extraction for #{binary_name}" + LOGGER.info("Default extraction for #{@binary_name}") begin - self.peek_type = PeekType::NONE + @peek_type = 
PeekType::NONE return true rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE - report_problem("problem creating default peek for task #{self.id}") + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE + report_problem("problem creating default peek for task #{@id}: #{ex}") return false end end @@ -391,7 +290,7 @@ def name_part(path) def entry_paths_arr_to_html(entry_paths) return_string = ' ' - return_string << self.binary_name + return_string << @binary_name entry_paths.each do |entry_path| diff --git a/lib/extractor/extraction_type.rb b/lib/extractor/extraction_type.rb new file mode 100644 index 0000000..cbe1283 --- /dev/null +++ b/lib/extractor/extraction_type.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true +class ExtractionType + ZIP = 'zip' + GZIP = 'gzip' + ARCHIVE = 'archive' +end diff --git a/test/archive_extractor_test.rb b/test/archive_extractor_test.rb new file mode 100644 index 0000000..1ed4313 --- /dev/null +++ b/test/archive_extractor_test.rb @@ -0,0 +1,158 @@ +# frozen_string_literal: true +require_relative 'test_helper' + +class TestArchiveExtractor < Minitest::Test + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test')) + def setup + bucket_name = 'test-bucket' + object_key = 'test-key' + binary_name = 'test' + web_id = 'test-id' + mime_type = 'application/zip' + @sqs = Minitest::Mock.new + @s3 = Minitest::Mock.new + @archive_extractor = ArchiveExtractor.new(bucket_name, object_key, binary_name, web_id, mime_type, @sqs, @s3) + end + + def test_extract + # setup + @archive_extractor.binary_name = 'test.zip' + @archive_extractor.web_id = 'test-zip' + @archive_extractor.mime_type = 'application/zip' + @archive_extractor.object_key = 'test.zip' + del_path = "#{Settings.aws.efs.mount_point}#{@archive_extractor.bucket_name}_#{@archive_extractor.web_id}" + local_path = "#{del_path}/#{@archive_extractor.object_key}" + file_path = 
"#{ENV['RUBY_HOME']}/test/test.zip" + dirname = File.dirname(local_path) + unless File.directory?(dirname) + FileUtils.mkdir_p(dirname) + end + FileUtils.cp(file_path, local_path) + @s3.expect(:get_object, nil, [{response_target: local_path, bucket: @archive_extractor.bucket_name, + key: @archive_extractor.object_key}]) + peek_text = " test.zip
test.txt
" + items = [{'item_name' => 'test.txt', 'item_path' => 'test.txt', 'item_size' => 12, 'media_type' => 'text/plain', 'is_directory' => false}] + return_value = {'web_id' => 'test-zip', 'status' => ExtractionStatus::SUCCESS, 'error' => [], 'peek_type' => PeekType::LISTING, 'peek_text' => peek_text, 'nested_items' => items} + s3_path = 'messages/test-zip.json' + @s3.expect(:put_object, [], [{body: return_value.to_json, bucket: Settings.aws.s3.json_bucket, key: s3_path}]) + return_value = {'bucket_name' => 'test-bucket', 'object_key' => s3_path, 's3_status' => ExtractionStatus::SUCCESS, 'error' => []} + @sqs.expect(:send_message, nil, [{queue_url: Settings.aws.sqs.queue_url, + message_body: return_value.to_json, + message_attributes:{}}]) + + # test + @archive_extractor.extract + + # verify + assert_mock(@s3) + assert_mock(@sqs) + end + + def test_get_object + # setup + local_path = 'test/path' + @s3.expect(:get_object, nil, [{response_target: local_path, bucket: @archive_extractor.bucket_name, + key: @archive_extractor.object_key}]) + # test + error = @archive_extractor.get_object(local_path, []) + + # verify + assert_mock(@s3) + assert_empty(error) + end + + def test_get_object_error + # setup + stub_s3 = Aws::S3::Client.new(region: Settings.aws.region) + @archive_extractor.s3 = stub_s3 + local_path = "test/path" + raises_exception = -> { raise StandardError.new } + + # test and verify + stub_s3.stub :get_object, raises_exception do + error = @archive_extractor.get_object(local_path, []) + assert(error.first.value?(ErrorType::S3_GET)) + end + end + + def test_perform_extraction + # setup + binary_name = 'test.zip' + web_id = 'test-zip' + mime_type = 'application/zip' + local_path = "#{ENV['RUBY_HOME']}/test/test.zip" + extraction = Extraction.new(binary_name, local_path, web_id, mime_type) + + #test + return_value = @archive_extractor.perform_extraction(extraction, []) + + # verify + assert(return_value.value?(PeekType::LISTING)) + exp_peek_text = " test.zip
test.txt
" + assert(return_value.value?(exp_peek_text)) + + end + + def test_perform_extraction_error + # setup + binary_name = 'test.zip' + web_id = 'test-zip' + mime_type = 'application/zip' + local_path = "#{ENV['RUBY_HOME']}/test/test.zip" + stub_extraction = Extraction.new(binary_name, local_path, web_id, mime_type) + raises_exception = -> { raise StandardError.new } + + # test and verify + stub_extraction.stub :process, raises_exception do + return_value = @archive_extractor.perform_extraction(stub_extraction, []) + assert(return_value.value?(PeekType::NONE)) + assert(return_value.value?(ExtractionStatus::ERROR)) + end + end + + def test_send_sqs_message + # setup + return_value = {'test' => 'retVal'} + @sqs.expect(:send_message, nil, [{queue_url: Settings.aws.sqs.queue_url, + message_body: return_value.to_json, + message_attributes:{}}]) + + # test + @archive_extractor.send_sqs_message(return_value) + + # verify + assert_mock(@sqs) + end + + def test_put_json_response + # setup + return_value = {'test' => 'retVal'} + s3_path = 'test/s3/key' + @s3.expect(:put_object, nil, [{body: return_value.to_json, bucket: Settings.aws.s3.json_bucket, key: s3_path}]) + + # test + s3_put_status, s3_put_error = @archive_extractor.put_json_response(return_value, s3_path) + + # verify + assert_mock(@s3) + assert_equal(ExtractionStatus::SUCCESS, s3_put_status) + assert_empty(s3_put_error) + end + + def test_put_json_response_error + # setup + return_value = {'test' => 'error'} + s3_path = 'test/s3/error' + stub_s3 = Aws::S3::Client.new(region: Settings.aws.region) + @archive_extractor.s3 = stub_s3 + raises_exception = -> { raise StandardError.new } + + # test and verify + stub_s3.stub :put_object, raises_exception do + s3_put_status, s3_put_error = @archive_extractor.put_json_response(return_value, s3_path) + assert_equal(ExtractionStatus::ERROR, s3_put_status) + assert(!s3_put_error.empty?) 
+ end + end +end + diff --git a/test/extraction_test.rb b/test/extraction_test.rb new file mode 100644 index 0000000..7af66de --- /dev/null +++ b/test/extraction_test.rb @@ -0,0 +1,356 @@ +# frozen_string_literal: true +require_relative 'test_helper' + +class TestExtraction < Minitest::Test + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test')) + def setup + binary_name = 'test-binary' + web_id = 'test-id' + storage_path = "#{Settings.aws.efs.mount_point}test-bucket_#{web_id}/test-key" + mime_type = 'application/zip' + @extraction = Extraction.new(binary_name, storage_path, web_id, mime_type) + end + + def test_process + # setup + @extraction.binary_name = 'test.txt.gz' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz" + @extraction.id = 'test-gzip' + @extraction.mime_type = 'application/gzip' + + # test + @extraction.process + + # verify + assert_equal(ExtractionStatus::SUCCESS, @extraction.status) + assert_equal(PeekType::LISTING, @extraction.peek_type) + end + + def test_report_problem + # setup + report = 'Test report' + + # test + @extraction.report_problem(report) + + # verify + error = @extraction.error + assert_equal(true, error.include?({'error_type' => ErrorType::EXTRACTION, 'report' => report})) + end + + def test_extract_features_gzip + # setup + @extraction.binary_name = 'test.txt.gz' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz" + @extraction.id = 'test-gzip' + @extraction.mime_type = 'application/gzip' + + # test + @extraction.extract_features + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.txt.gz
testing\n
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_features_zip + # setup + @extraction.binary_name = 'test.zip' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip" + @extraction.id = 'test-zip' + @extraction.mime_type = 'application/zip' + + # test + @extraction.extract_features + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.zip
test.txt
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_features_default + # setup + @extraction.binary_name = 'test' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test" + @extraction.id = 'test-default' + @extraction.mime_type = 'application/directory' + + # test + @extraction.extract_features + + # verify + assert_equal(PeekType::NONE, @extraction.peek_type) + end + + def test_mime_from_path + # setup + ruby_path = "#{ENV['RUBY_HOME']}/bin/set-test-vars.rb" + + # test + ruby_mime = @extraction.mime_from_path(ruby_path) + + # verify + assert_equal('application/x-ruby', ruby_mime) + end + + def test_mime_from_filename + # setup + zip_filename = 'test.zip' + + # test + zip_mime = @extraction.mime_from_filename(zip_filename) + + # verify + assert_equal('application/zip', zip_mime) + end + + def test_create_item + # setup + item_path = 'test/item/path/thing' + item_name = 'thing' + item_size = 123 + media_type = 'directory' + is_directory = true + + # test + @extraction.create_item(item_path, item_name, item_size, media_type, is_directory) + + # verify + nested_items = @extraction.nested_items + assert(nested_items.include?({'item_name' => item_name, 'item_path' => item_path, 'item_size' => item_size, + 'media_type' => media_type, 'is_directory' => is_directory})) + end + + def test_extract_zip + # setup + @extraction.binary_name = 'test.zip' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip" + @extraction.id = 'test-zip' + @extraction.mime_type = 'application/zip' + + # test + @extraction.extract_zip + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.zip
test.txt
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_archive + # setup + @extraction.binary_name = 'test.tar' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.tar" + @extraction.id = 'test-tar' + @extraction.mime_type = 'application/x-tar' + @extraction.peek_type = nil + + # test + @extraction.extract_archive + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.tar
test.txt
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_gzip + # setup + @extraction.binary_name = 'test.txt.gz' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz" + @extraction.id = 'test-gzip' + @extraction.mime_type = 'application/gzip' + + # test + @extraction.extract_gzip + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.txt.gz
testing\n
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_entry + # setup + mock_entry = Minitest::Mock.new + entry_name = "#{ENV['RUBY_HOME']}/bin/set-test-vars.rb" + type = ExtractionType::GZIP + mock_entry.expect(:directory?, false) + mock_entry.expect(:size, 123) + + # test + entry_paths = @extraction.extract_entry(mock_entry, entry_name, [], type) + + # verify + assert_mock(mock_entry) + assert(entry_paths.include?(entry_name)) + expect_item = {'item_name' => 'set-test-vars.rb', 'item_path' => entry_name, 'item_size' => 123, + 'media_type' => 'application/x-ruby', 'is_directory' => false} + assert(@extraction.nested_items.include?(expect_item)) + + end + + def test_handle_entry_paths + # setup + entry_paths = ['test/path'] + + # test + resp = @extraction.handle_entry_paths(entry_paths) + + # verify + assert(resp) + exp_peek_text = " test-binary
path
" + assert_equal(exp_peek_text, @extraction.peek_text) + assert_equal(PeekType::LISTING, @extraction.peek_type) + end + + def test_handle_entry_paths_empty + # setup + entry_paths = [] + + # test + resp = @extraction.handle_entry_paths(entry_paths) + + # verify + assert_equal(false, resp) + assert_equal(PeekType::NONE, @extraction.peek_type) + assert(@extraction.error.include?({'error_type' => ErrorType::EXTRACTION, + 'report' => "no items found for archive listing for task #{@extraction.id}"})) + end + + def test_extract_default + # test + @extraction.extract_default + # verify + peek_type = @extraction.peek_type + assert_equal(PeekType::NONE, peek_type) + end + + def test_valid_entry_path + # setup + valid_path = 'test/path' + invalid_path = "" + + # test + path = @extraction.valid_entry_path(valid_path) + path_slash = @extraction.valid_entry_path("#{valid_path}/") + path_nil = @extraction.valid_entry_path(invalid_path) + + # verify + assert_equal(valid_path, path) + assert_equal(valid_path, path_slash) + assert_nil(path_nil) + end + + def test_is_directory + # setup + ruby_home = ENV['RUBY_HOME'] + object_path = 'test/path' + slash_path = 'test/path/' + mac_path = 'this/is/a/mac/._path' + ds_store_path = 'test/path/.DS_Store' + + # test + ruby_home_dir = @extraction.is_directory(ruby_home) + object_path_dir = @extraction.is_directory(object_path) + slash_path_dir = @extraction.is_directory(slash_path) + mac_path_dir = @extraction.is_directory(mac_path) + ds_store_path_dir = @extraction.is_directory(ds_store_path) + + # verify + assert_equal(true, ruby_home_dir) + assert_equal(true, slash_path_dir) + assert_equal(false, object_path_dir) + assert_equal(false, mac_path_dir) + assert_equal(false, ds_store_path_dir) + end + + def test_is_mac_thing + # setup + mac_path = 'this/is/a/mac/path/__MACOSX' + path = 'this/is/not/a/mac/path' + # test + mac = @extraction.is_mac_thing(mac_path) + not_mac = @extraction.is_mac_thing(path) + # verify + assert_equal(true, mac) + 
assert_equal(false, not_mac) + end + + def test_is_mac_tar_thing + # setup + mac_path = 'this/is/a/mac/._path' + paxheader_mac_path = 'PaxHeader/this/is/a/mac/path' + longlink_mac_path = 'this/is/a/mac/path/@LongLink' + path = 'this/is/not/a/mac/path' + # test + mac_underscore = @extraction.is_mac_tar_thing(mac_path) + mac_paxheader = @extraction.is_mac_tar_thing(paxheader_mac_path) + mac_longlink = @extraction.is_mac_tar_thing(longlink_mac_path) + not_mac = @extraction.is_mac_tar_thing(path) + # verify + assert_equal(true, mac_underscore) + assert_equal(true, mac_paxheader) + assert_equal(true, mac_longlink) + assert_equal(false, not_mac) + end + + def test_ends_in_slash + # setup + path_ends_in_slash = 'test/path/' + path_does_not_end_in_slash = 'test/path' + + # test + ends_in_slash = @extraction.ends_in_slash(path_ends_in_slash) + does_not_end_in_slash = @extraction.ends_in_slash(path_does_not_end_in_slash) + + # verify + assert_equal(true, ends_in_slash) + assert_equal(false, does_not_end_in_slash) + end + + def test_is_ds_store + # setup + ds_store_path = 'test/path/.DS_Store' + path = 'test/path' + + # test + ds_store = @extraction.is_ds_store(ds_store_path) + not_ds_store = @extraction.is_ds_store(path) + + # verify + assert_equal(true, ds_store) + assert_equal(false, not_ds_store) + end + + def test_name_part + # setup + path = 'test/path' + name = 'test' + invalid_path = "" + + # test + path_name = @extraction.name_part(path) + test_name = @extraction.name_part(name) + invalid_name = @extraction.name_part(invalid_path) + + # verify + assert_equal('path', path_name) + assert_equal('test', test_name) + assert_nil(invalid_name) + end + + def test_entry_paths_arr_to_html + # setup + entry_paths = ['test/path'] + + # test + return_string = @extraction.entry_paths_arr_to_html(entry_paths) + + # verify + exp_peek_text = " test-binary
path
" + assert_equal(exp_peek_text, return_string) + end +end diff --git a/test/test.tar b/test/test.tar new file mode 100644 index 0000000000000000000000000000000000000000..95dc7f68c459dda5ffd483192a2ebf9466d22ba4 GIT binary patch literal 2048 zcmeH^OAdrE2!^}%6r8|7OWX4pqmBz*SWM>lLu}&4%(96K*zgFDpAVUe^zs$g&=4^( zgHxYVzC8MxYe|h5h-08kJ~9DPh$%S0hMMlxXOZDhX-gSD+bVxSXxlvBQ_rAYQjbG~ dXTv)yqW@p_1;~f)oeE8r5ikNqzzDP;@B_ZL8-xG= literal 0 HcmV?d00001 diff --git a/test/test.txt.gz b/test/test.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..4f821bca15aaf64a3b012c94d5250ab42f815c0d GIT binary patch literal 37 tcmb2|=HPg9JSCNZxg@o?M6aZxghAU=SMTH*@8?Vme*$7|b1*P4008)4466VD literal 0 HcmV?d00001 diff --git a/test/test.zip b/test/test.zip new file mode 100644 index 0000000000000000000000000000000000000000..d048dc0c6b971c347314fb3484e3520cb098f0c6 GIT binary patch literal 178 zcmWIWW@h1H0D%X29l>lNAIf-uY!K#PkYOlEEiTb3sVE5z;bdT5BE2#3v8s7;X$3a} zBg=P21_l-ppgNG6%)E33LnC9Z0B=SnIc8kuNdQ$dFaY&2ENKL>5T>(2Oh+?0z?+o~ Pq>d2?{eUzGGcW)E3K=0p literal 0 HcmV?d00001 diff --git a/test/test_helper.rb b/test/test_helper.rb new file mode 100644 index 0000000..ec88121 --- /dev/null +++ b/test/test_helper.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +require 'simplecov' +SimpleCov.start + +require 'minitest/autorun' +require 'config' +require 'csv' +require 'json' +require_relative '../lib/archive_extractor' +require_relative '../lib/extractor' +require_relative '../lib/extractor/error_type' +require_relative '../lib/extractor/extraction' +require_relative '../lib/extractor/extraction_status' +require_relative '../lib/extractor/extraction_type' +require_relative '../lib/extractor/mime_type' +require_relative '../lib/extractor/peek_type' \ No newline at end of file From b007740f74b9249035df11954274e240257583c8 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Thu, 30 Nov 2023 14:57:47 -0700 Subject: [PATCH 16/20] added github action --- .github/workflows/test.yml | 20 
++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..e7555c0 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,20 @@ +on: + push: + branches: + - test + - demo + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout the repository + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: . + file: ./docker/extractor/Dockerfile-test + tags: extractor-test \ No newline at end of file From 33e9f2cffc8ae6b8233dcc174973130fcc55cb40 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Thu, 30 Nov 2023 15:01:15 -0700 Subject: [PATCH 17/20] updated test dockerfile --- docker/extractor/Dockerfile-test | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test index 8baa77c..5b209d0 100644 --- a/docker/extractor/Dockerfile-test +++ b/docker/extractor/Dockerfile-test @@ -24,4 +24,5 @@ RUN gem install bundler && bundle install # Copy the main application, except whatever is listed in .dockerignore. COPY . 
./ -CMD ["rake", "test"] \ No newline at end of file +RUN rake test +#CMD ["rake", "test"] \ No newline at end of file From d5c61b0b14818c427bf39bf5d6198374c5e5cd6e Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Fri, 1 Dec 2023 11:00:32 -0700 Subject: [PATCH 18/20] updated to add prod values and fix log statements --- .github/workflows/test.yml | 3 ++- .idea/databank-archive-extractor.iml | 2 ++ Gemfile.lock | 4 ++++ config/settings/prod.yml | 8 ++++++++ docker/extractor/Dockerfile-test | 4 ++-- lib/archive_extractor.rb | 8 +++++--- 6 files changed, 23 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e7555c0..e3b181f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,4 +17,5 @@ jobs: with: context: . file: ./docker/extractor/Dockerfile-test - tags: extractor-test \ No newline at end of file + tags: extractor-test + platforms: linux/arm64 \ No newline at end of file diff --git a/.idea/databank-archive-extractor.iml b/.idea/databank-archive-extractor.iml index 7a849e6..cc5e1e8 100644 --- a/.idea/databank-archive-extractor.iml +++ b/.idea/databank-archive-extractor.iml @@ -29,6 +29,8 @@ + + diff --git a/Gemfile.lock b/Gemfile.lock index d6d7b8e..033c25b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -58,6 +58,9 @@ GEM dry-initializer (~> 3.0) dry-schema (>= 1.12, < 2) zeitwerk (~> 2.6) + ffi (1.16.3) + ffi-libarchive (1.1.13) + ffi (~> 1.0) jmespath (1.6.2) mime-types (3.5.1) mime-types-data (~> 3.2015) @@ -87,6 +90,7 @@ DEPENDENCIES aws-sdk-s3 aws-sdk-sqs config + ffi-libarchive mime-types mimemagic (~> 0.3.6) minitest diff --git a/config/settings/prod.yml b/config/settings/prod.yml index e69de29..40c24e7 100644 --- a/config/settings/prod.yml +++ b/config/settings/prod.yml @@ -0,0 +1,8 @@ +aws: + efs: + mount_point: "/mnt/efs/" + sqs: + queue_name: "extractor-to-databank-prod" + queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-prod" + s3: + 
json_bucket: "databank-main" \ No newline at end of file diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test index 5b209d0..f5ea02c 100644 --- a/docker/extractor/Dockerfile-test +++ b/docker/extractor/Dockerfile-test @@ -1,5 +1,5 @@ -FROM ruby:3.1.2 -#FROM --platform=linux/arm64 ruby:3.1.2 +FROM --platform=linux/arm64 ruby:3.1.2 +#FROM ruby:3.1.2 ENV RAILS_ENV=test ENV RAILS_LOG_TO_STDOUT=true diff --git a/lib/archive_extractor.rb b/lib/archive_extractor.rb index 35c411d..094aa02 100644 --- a/lib/archive_extractor.rb +++ b/lib/archive_extractor.rb @@ -110,17 +110,19 @@ def send_sqs_message(return_value) def put_json_response(return_value, s3_path) s3_put_error = [] + json_bucket = Settings.aws.s3.json_bucket begin @s3.put_object({ body: return_value.to_json, - bucket: Settings.aws.s3.json_bucket, + bucket: json_bucket, key: s3_path, }) - LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name} with key #{s3_path}") + LOGGER.info(return_value.to_json) + LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket} with key #{s3_path}") s3_put_status = ExtractionStatus::SUCCESS rescue StandardError => e s3_put_status = ExtractionStatus::ERROR - s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name}: #{e.message}" + s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket}: #{e.message}" s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message}) LOGGER.error(s3_put_error_message) end From c664c3f3d27a1e28c9803133e620daa317b3decc Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Fri, 1 Dec 2023 13:32:27 -0700 Subject: [PATCH 19/20] updated dockerfiles --- Dockerfile | 7 ++++--- Gemfile | 2 -- docker/extractor/Dockerfile-test | 6 +++--- 3 files changed, 7 insertions(+), 8 
deletions(-) diff --git a/Dockerfile b/Dockerfile index 4f10663..774004e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,10 @@ # N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version. FROM ruby:3.1.2 -ENV RAILS_ENV=production +ENV RUBY_ENV=demo ENV RAILS_LOG_TO_STDOUT=true ENV RAILS_SERVE_STATIC_FILES=true +ENV RUBY_HOME=/extractor RUN apt-get update && apt-get install -y \ build-essential \ @@ -11,9 +12,9 @@ RUN apt-get update && apt-get install -y \ libpq-dev \ libarchive-dev -RUN mkdir app -WORKDIR app +RUN mkdir extractor +WORKDIR extractor # Copy the Gemfile as well as the Gemfile.lock and install gems. # This is a separate step so the dependencies will be cached. diff --git a/Gemfile b/Gemfile index 2013c04..0b08bd5 100644 --- a/Gemfile +++ b/Gemfile @@ -10,8 +10,6 @@ gem 'rubyzip' gem 'config' # Use archive for non-zip archive files -# gem 'libarchive' -# gem 'libarchive-ruby' gem 'ffi-libarchive' # Use os to interact with operating system diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test index f5ea02c..0ca4179 100644 --- a/docker/extractor/Dockerfile-test +++ b/docker/extractor/Dockerfile-test @@ -1,7 +1,7 @@ FROM --platform=linux/arm64 ruby:3.1.2 #FROM ruby:3.1.2 -ENV RAILS_ENV=test +ENV RUBY_ENV=test ENV RAILS_LOG_TO_STDOUT=true ENV RUBY_HOME=/extractor ENV IS_DOCKER=true @@ -24,5 +24,5 @@ RUN gem install bundler && bundle install # Copy the main application, except whatever is listed in .dockerignore. COPY . 
./ -RUN rake test -#CMD ["rake", "test"] \ No newline at end of file +#RUN rake test +CMD ["rake", "test"] \ No newline at end of file From 8ff3e480df1595ae37300481ece191c0344639c2 Mon Sep 17 00:00:00 2001 From: Gen Schmitt Date: Mon, 4 Dec 2023 10:01:27 -0700 Subject: [PATCH 20/20] updated to remove demo configuration --- Dockerfile | 2 +- ecr-push.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 774004e..6fd469e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version. FROM ruby:3.1.2 -ENV RUBY_ENV=demo +ENV RUBY_ENV=prod ENV RAILS_LOG_TO_STDOUT=true ENV RAILS_SERVE_STATIC_FILES=true ENV RUBY_HOME=/extractor diff --git a/ecr-push.sh b/ecr-push.sh index 9ac4826..1488dd0 100755 --- a/ecr-push.sh +++ b/ecr-push.sh @@ -4,6 +4,6 @@ # aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 721945215539.dkr.ecr.us-east-2.amazonaws.com -docker build -t databank-archive-extractor-demo . -docker tag databank-archive-extractor-demo:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest -docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest +docker build -t databank-archive-extractor-prod . +docker tag databank-archive-extractor-prod:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-prod:latest +docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-prod:latest