From bff49ef907f23985f24c89edff14d6a51745f358 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Thu, 18 Mar 2021 11:03:03 -0500
Subject: [PATCH 01/20] Updated S3 path for json response

---
 lib/extractor.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/extractor.rb b/lib/extractor.rb
index 51bd1a7..f767c32 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -10,7 +10,7 @@ class Extractor
   def self.extract(bucket_name, object_key, binary_name, web_id)
     begin
       local_path = "./#{binary_name}"
-      s3_path = "#{web_id}_#{binary_name}"
+      s3_path = "messages/#{web_id}.json"
 
       region = 'us-east-2'
       s3_client = Aws::S3::Client.new(region: region)
@@ -52,7 +52,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         puts "Error putting json response for object #{object_key} in S3 bucket #{bucket_name}: #{e.message}"
       end
 
-      retVal = {"bucket name" => bucket_name, "object key" => s3_path}
+      retVal = {"bucketName" => bucket_name, "objectKey" => s3_path}
 
       sqs = Aws::SQS::Client.new(region: region)
 

From 6077abaefd4671d7acae3ee2fd6ca15a17990861 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Thu, 25 Mar 2021 15:39:29 -0500
Subject: [PATCH 02/20] Removed unused code

---
 lib/extractor.rb             |  5 ++---
 lib/extractor/extraction.rb  | 37 ++++--------------------------------
 lib/extractor/nested_item.rb |  3 ---
 lib/extractor/peek_type.rb   |  5 -----
 4 files changed, 6 insertions(+), 44 deletions(-)
 delete mode 100644 lib/extractor/nested_item.rb

diff --git a/lib/extractor.rb b/lib/extractor.rb
index f767c32..429dd3a 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -9,8 +9,6 @@
 class Extractor
   def self.extract(bucket_name, object_key, binary_name, web_id)
     begin
-      local_path = "./#{binary_name}"
-      s3_path = "messages/#{web_id}.json"
 
       region = 'us-east-2'
       s3_client = Aws::S3::Client.new(region: region)
@@ -41,6 +39,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
       retVal = {"web_id" => web_id, "status" => extraction.status, "error" => extraction.error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
 
 
+      s3_path = "messages/#{web_id}.json"
       begin
         s3_client.put_object({
              body: retVal.to_json,
@@ -52,7 +51,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         puts "Error putting json response for object #{object_key} in S3 bucket #{bucket_name}: #{e.message}"
       end
 
-      retVal = {"bucketName" => bucket_name, "objectKey" => s3_path}
+      retVal = {"bucket_name" => bucket_name, "object_key" => s3_path}
 
       sqs = Aws::SQS::Client.new(region: region)
 
diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index 75d762d..a6fc436 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -8,7 +8,6 @@
 require 'libarchive'
 require 'rubygems/package'
 
-require_relative 'nested_item.rb'
 require_relative 'extraction_status.rb'
 require_relative 'peek_type.rb'
 
@@ -38,7 +37,6 @@ def process
       self.status = ExtractionStatus::ERROR
       self.peek_type = PeekType::NONE
       report_problem(error.message)
-        #raise error
     ensure
       if self.peek_text && self.peek_text.encoding.name != 'UTF-8'
         begin
@@ -55,15 +53,12 @@ def process
   end
 
   def report_problem(report)
-    #Problem.create(task_id: self.id, report: report)
     self.error = {"task_id" => self.id, "report" => report}
   end
 
   def extract_features
     mime_guess = top_level_mime || Extraction.mime_from_filename(self.binary_name) || 'application/octet-stream'
 
-    #Rails.logger.warn("#{self.binary_name} - #{mime_guess}")
-
     mime_parts = mime_guess.split("/")
 
     nonzip_archive_subtypes = ['x-7z-compressed', 'x-tar']
@@ -87,9 +82,8 @@ def top_level_mime
   end
 
   def self.mime_from_path(path)
-#    puts "path provided #{path}"
     file_mime_response = MimeMagic.by_path(File.open("#{path}")).to_s
-#    puts "file mime response #{file_mime_response}"
+
     if file_mime_response.length > 0
       file_mime_response
     else
@@ -167,10 +161,6 @@ def extract_zip
                 File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
               end
 
-            else
-              #Rails.logger.warn("skipped entry is ds_store: #{is_ds_store(entry_path)}")
-              #Rails.logger.warn("skipped entry is mac thing: #{is_mac_thing(entry_path)}")
-
             end
           end
         end
@@ -204,7 +194,7 @@ def extract_archive
         while entry = ar.next_header
 
           entry_path = valid_entry_path(entry.pathname)
-#          puts "archive name #{entry.pathname}"
+
           if entry_path
 
             if !is_ds_store(entry_path) && !is_mac_thing(entry_path)
@@ -287,7 +277,7 @@ def extract_gzip
         if entry_path
 
           if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
-#            puts entry.full_name
+
 
             entry_paths << entry_path
 
@@ -371,7 +361,7 @@ def extract_default
   end
 
   def valid_entry_path(entry_path)
-    if entry_path[-1] == '/'
+    if ends_in_slash(entry_path)
       return entry_path[0...-1]
     elsif entry_path.length > 0
       return entry_path
@@ -412,25 +402,6 @@ def name_part(path)
     end
   end
 
-  def self.charset_from_path(path)
-
-    file_info = ""
-
-    if OS.mac?
-      file_info = `file -I #{path}`
-    elsif OS.linux?
-      file_info = `file -i #{path}`
-    else
-      return nil
-    end
-
-    if file_info.length > 0
-      file_info.strip.split('charset=').last
-    else
-      nil
-    end
-  end
-
   def entry_paths_arr_to_html(entry_paths)
     return_string = '<span class="glyphicon glyphicon-folder-open"></span> '
 
diff --git a/lib/extractor/nested_item.rb b/lib/extractor/nested_item.rb
deleted file mode 100644
index d235fc9..0000000
--- a/lib/extractor/nested_item.rb
+++ /dev/null
@@ -1,3 +0,0 @@
-class NestedItem
-  attr_accessor :item_name, :item_path, :item_size, :media_type, :is_directory
-end
\ No newline at end of file
diff --git a/lib/extractor/peek_type.rb b/lib/extractor/peek_type.rb
index 0ac1db3..af99a01 100644
--- a/lib/extractor/peek_type.rb
+++ b/lib/extractor/peek_type.rb
@@ -1,9 +1,4 @@
 class PeekType
-  ALL_TEXT = 'all_text'
-  PART_TEXT = 'part_text'
-  IMAGE = 'image'
-  MICROSOFT = 'microsoft'
-  PDF = 'pdf'
   LISTING = 'listing'
   NONE = 'none'
 end
\ No newline at end of file

From 676c96454553651e1db58d5709bafba45565f4d2 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Mon, 29 Mar 2021 13:36:38 -0500
Subject: [PATCH 03/20] Updated mimemagic gem and peek text format

---
 Gemfile                     |  3 +--
 Gemfile.lock                | 31 ++++++++++++++++++-------------
 lib/extractor/extraction.rb |  4 ++--
 3 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/Gemfile b/Gemfile
index 8af86cb..73c0c87 100644
--- a/Gemfile
+++ b/Gemfile
@@ -10,14 +10,13 @@ gem 'mime-types', require: 'mime/types/full'
 gem 'rubyzip'
 
 # Use archive for non-zip archive files
-#gem 'archive'
 gem 'libarchive'
 
 # Use os to interact with operating system
 gem 'os'
 
 # Use mimemagic to find the mime type of a file from the extension or content
-gem 'mimemagic'
+gem "mimemagic", "~> 0.3.6"
 
 gem "rake", "~> 13.0"
 
diff --git a/Gemfile.lock b/Gemfile.lock
index b9f00c6..e49612c 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,31 +1,36 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    aws-eventstream (1.1.0)
-    aws-partitions (1.416.0)
-    aws-sdk-core (3.111.0)
+    aws-eventstream (1.1.1)
+    aws-partitions (1.436.0)
+    aws-sdk-core (3.113.0)
       aws-eventstream (~> 1, >= 1.0.2)
       aws-partitions (~> 1, >= 1.239.0)
       aws-sigv4 (~> 1.1)
       jmespath (~> 1.0)
-    aws-sdk-kms (1.41.0)
-      aws-sdk-core (~> 3, >= 3.109.0)
+    aws-sdk-kms (1.43.0)
+      aws-sdk-core (~> 3, >= 3.112.0)
       aws-sigv4 (~> 1.1)
-    aws-sdk-s3 (1.87.0)
-      aws-sdk-core (~> 3, >= 3.109.0)
+    aws-sdk-s3 (1.93.0)
+      aws-sdk-core (~> 3, >= 3.112.0)
       aws-sdk-kms (~> 1)
       aws-sigv4 (~> 1.1)
-    aws-sdk-sqs (1.35.0)
-      aws-sdk-core (~> 3, >= 3.109.0)
+    aws-sdk-sqs (1.38.0)
+      aws-sdk-core (~> 3, >= 3.112.0)
       aws-sigv4 (~> 1.1)
-    aws-sigv4 (1.2.2)
+    aws-sigv4 (1.2.3)
       aws-eventstream (~> 1, >= 1.0.2)
     jmespath (1.4.0)
     mime-types (3.3.1)
       mime-types-data (~> 3.2015)
-    mime-types-data (3.2020.1104)
-    mimemagic (0.3.5)
+    mime-types-data (3.2021.0225)
+    mimemagic (0.3.10)
+      nokogiri (~> 1)
+      rake
+    nokogiri (1.11.2-x86_64-darwin)
+      racc (~> 1.4)
     os (1.1.1)
+    racc (1.5.2)
     rake (13.0.3)
     rubyzip (2.3.0)
 
@@ -36,7 +41,7 @@ DEPENDENCIES
   aws-sdk-s3
   aws-sdk-sqs
   mime-types
-  mimemagic
+  mimemagic (~> 0.3.6)
   os
   rake (~> 13.0)
   rubyzip
diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index a6fc436..ee10327 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -414,7 +414,7 @@ def entry_paths_arr_to_html(entry_paths)
         name_arr = entry_path.split("/")
 
         name_arr.length.times do
-          return_string << '<div class="indent"">'
+          return_string << '<div class="indent">'
         end
 
         if entry_path[-1] == "/" # means directory
@@ -432,7 +432,7 @@ def entry_paths_arr_to_html(entry_paths)
 
     end
 
-    return return_string
+    return return_string.gsub("\"", "'")
 
   end
 

From 006432d2ff914d03fb306900e2606842c313485e Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Tue, 20 Apr 2021 15:47:34 -0500
Subject: [PATCH 04/20] updated error handling, and added script to simplify
 updating ECR image

---
 ecr-push.sh                 |  9 +++++++++
 lib/extractor.rb            | 35 +++++++++++++++++++++++------------
 lib/extractor/extraction.rb |  5 +++--
 3 files changed, 35 insertions(+), 14 deletions(-)
 create mode 100755 ecr-push.sh

diff --git a/ecr-push.sh b/ecr-push.sh
new file mode 100755
index 0000000..752efc3
--- /dev/null
+++ b/ecr-push.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+#
+# Builds a Docker image and pushes it to AWS ECR.
+#
+
+aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 721945215539.dkr.ecr.us-east-2.amazonaws.com
+docker build -t databank-archive-extractor-demo .
+docker tag databank-archive-extractor-demo:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest
+docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest
\ No newline at end of file
diff --git a/lib/extractor.rb b/lib/extractor.rb
index 429dd3a..427ebc2 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -9,7 +9,10 @@
 class Extractor
   def self.extract(bucket_name, object_key, binary_name, web_id)
     begin
-
+      status = ExtractionStatus::ERROR
+      error = Hash.new
+      s3_put_status = ExtractionStatus::SUCCESS
+      s3_put_error = ""
       region = 'us-east-2'
       s3_client = Aws::S3::Client.new(region: region)
       del_path = "./mnt/efs/#{bucket_name}_#{web_id}"
@@ -26,18 +29,20 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
             bucket: bucket_name,
             key: object_key,
         )
-        puts "Getting object #{object_key} from #{bucket_name}"
+        puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}"
       rescue StandardError => e
-        puts "Error getting object #{object_key} from S3 bucket #{bucket_name}: #{e.message}"
+        error = {"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"}
+        puts error
       end
 
       extraction = Extraction.new(binary_name, local_path, web_id)
       extraction.process
-      puts "status: #{extraction.status}"
-      puts "error: #{extraction.error}" if extraction.error == ExtractionStatus::ERROR
+      status = extraction.status
+      puts "status: #{status}"
+      puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR
+      error = error.merge(extraction.error)
       items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
-      retVal = {"web_id" => web_id, "status" => extraction.status, "error" => extraction.error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
-
+      retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
 
       s3_path = "messages/#{web_id}.json"
       begin
@@ -46,12 +51,18 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
              bucket: "databank-demo-main",
              key: s3_path,
          })
-        puts "Putting json response for object #{object_key} in S3 bucket #{bucket_name} with key #{s3_path}"
+        puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key #{s3_path}"
       rescue StandardError => e
-        puts "Error putting json response for object #{object_key} in S3 bucket #{bucket_name}: #{e.message}"
+        s3_put_status = ExtractionStatus::ERROR
+        s3_put_error = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}"
+        puts s3_put_error
       end
 
-      retVal = {"bucket_name" => bucket_name, "object_key" => s3_path}
+      if s3_put_status == ExtractionStatus::SUCCESS
+        retVal = {"bucket_name" => bucket_name, "object_key" => s3_path}
+      else
+        retVal = {"s3_status" => s3_put_status, "s3_put_report" =>s3_put_error}
+      end
 
       sqs = Aws::SQS::Client.new(region: region)
 
@@ -66,9 +77,9 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
            message_body: retVal.to_json,
            message_attributes: {}
          })
-        puts "Sending message in queue #{queue_name} for object #{object_key}"
+        puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}"
       rescue StandardError => e
-      puts "Error sending message in queue #{queue_name} for object #{object_key}: #{e.message}"
+      puts "Error sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}: #{e.message}"
       end
 
 
diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index ee10327..aeed1b2 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -20,6 +20,7 @@ def initialize(binary_name, storage_path, id)
     @binary_name = binary_name
     @storage_path = storage_path
     @id = id
+    @error = Hash.new
   end
 
   ALLOWED_CHAR_NUM = 1024 * 8
@@ -46,14 +47,14 @@ def process
           self.peek_type = PeekType::NONE
           report_problem('invalid encoding for peek text')
         rescue Exception => ex
-          report_problem("invalid encoding and problem characer: #{ex.class}, #{ex.message}")
+          report_problem("invalid encoding and problem character: #{ex.class}, #{ex.message}")
         end
       end
     end
   end
 
   def report_problem(report)
-    self.error = {"task_id" => self.id, "report" => report}
+    self.error = {"task_id" => self.id, "extraction_report" => report}
   end
 
   def extract_features

From 021a672e71307b471f9dd2a26bcec0715b0dab0c Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Wed, 21 Apr 2021 10:45:17 -0500
Subject: [PATCH 05/20] Added error messaging for processing the extraction

---
 ecr-push.sh      |  2 +-
 lib/extractor.rb | 26 +++++++++++++++++---------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/ecr-push.sh b/ecr-push.sh
index 752efc3..9ac4826 100755
--- a/ecr-push.sh
+++ b/ecr-push.sh
@@ -6,4 +6,4 @@
 aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 721945215539.dkr.ecr.us-east-2.amazonaws.com
 docker build -t databank-archive-extractor-demo .
 docker tag databank-archive-extractor-demo:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest
-docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest
\ No newline at end of file
+docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest
diff --git a/lib/extractor.rb b/lib/extractor.rb
index 427ebc2..6fb4360 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -14,6 +14,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
       s3_put_status = ExtractionStatus::SUCCESS
       s3_put_error = ""
       region = 'us-east-2'
+
       s3_client = Aws::S3::Client.new(region: region)
       del_path = "./mnt/efs/#{bucket_name}_#{web_id}"
       local_path = "#{del_path}/#{object_key}"
@@ -31,18 +32,25 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         )
         puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}"
       rescue StandardError => e
-        error = {"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"}
+        error.merge!({"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"})
         puts error
       end
 
-      extraction = Extraction.new(binary_name, local_path, web_id)
-      extraction.process
-      status = extraction.status
-      puts "status: #{status}"
-      puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR
-      error = error.merge(extraction.error)
-      items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
-      retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
+
+      begin
+        extraction = Extraction.new(binary_name, local_path, web_id)
+        extraction.process
+        status = extraction.status
+        puts "status: #{status}"
+        puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR
+        error.merge!(extraction.error)
+        items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
+        retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
+      rescue  StandardError => e
+        error.merge!({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"})
+        retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => error, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []}
+      end
+
 
       s3_path = "messages/#{web_id}.json"
       begin

From 5e8940e36dfd331b61b23c784c65594e0b152e7a Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Fri, 23 Apr 2021 11:01:34 -0500
Subject: [PATCH 06/20] Changed error reporting from a hash to an array and
 added error type

---
 lib/extractor.rb            | 34 +++++++++++++++++++---------------
 lib/extractor/error_type.rb |  6 ++++++
 lib/extractor/extraction.rb |  5 +++--
 3 files changed, 28 insertions(+), 17 deletions(-)
 create mode 100644 lib/extractor/error_type.rb

diff --git a/lib/extractor.rb b/lib/extractor.rb
index 6fb4360..66d4749 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -5,14 +5,15 @@
 
 require_relative 'extractor/extraction.rb'
 require_relative 'extractor/extraction_status.rb'
+require_relative 'extractor/error_type.rb'
 
 class Extractor
   def self.extract(bucket_name, object_key, binary_name, web_id)
     begin
       status = ExtractionStatus::ERROR
-      error = Hash.new
+      error = Array.new
       s3_put_status = ExtractionStatus::SUCCESS
-      s3_put_error = ""
+      s3_put_error = Array.new
       region = 'us-east-2'
 
       s3_client = Aws::S3::Client.new(region: region)
@@ -32,8 +33,9 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         )
         puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}"
       rescue StandardError => e
-        error.merge!({"task_id" => web_id, "s3_get_report" => "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"})
-        puts error
+        s3_error= "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"
+        error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error})
+        puts s3_error
       end
 
 
@@ -43,12 +45,14 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         status = extraction.status
         puts "status: #{status}"
         puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR
-        error.merge!(extraction.error)
+        error.concat(extraction.error)
         items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
-        retVal = {"web_id" => web_id, "status" => status, "error" => error, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
+        errors = error.map {|o| Hash[o.each_pair.to_a]}
+        retVal = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
       rescue  StandardError => e
-        error.merge!({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"})
-        retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => error, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []}
+        error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"})
+        errors = error.map {|o| Hash[o.each_pair.to_a]}
+        retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []}
       end
 
 
@@ -62,15 +66,15 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key #{s3_path}"
       rescue StandardError => e
         s3_put_status = ExtractionStatus::ERROR
-        s3_put_error = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}"
-        puts s3_put_error
+        s3_put_error_message = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}"
+        s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message})
+        puts s3_put_error_message
       end
 
-      if s3_put_status == ExtractionStatus::SUCCESS
-        retVal = {"bucket_name" => bucket_name, "object_key" => s3_path}
-      else
-        retVal = {"s3_status" => s3_put_status, "s3_put_report" =>s3_put_error}
-      end
+      s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]}
+
+      retVal = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors}
+
 
       sqs = Aws::SQS::Client.new(region: region)
 
diff --git a/lib/extractor/error_type.rb b/lib/extractor/error_type.rb
new file mode 100644
index 0000000..bafc541
--- /dev/null
+++ b/lib/extractor/error_type.rb
@@ -0,0 +1,6 @@
+class ErrorType
+  EXTRACTION = 'extraction_error'
+  PROCESSING = 'processing_error'
+  S3_PUT = 's3_put_error'
+  S3_GET = 's3_get_error'
+end
\ No newline at end of file
diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index aeed1b2..bb67297 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -10,6 +10,7 @@
 
 require_relative 'extraction_status.rb'
 require_relative 'peek_type.rb'
+require_relative 'error_type.rb'
 
 class Extraction
 
@@ -20,7 +21,7 @@ def initialize(binary_name, storage_path, id)
     @binary_name = binary_name
     @storage_path = storage_path
     @id = id
-    @error = Hash.new
+    @error = Array.new
   end
 
   ALLOWED_CHAR_NUM = 1024 * 8
@@ -54,7 +55,7 @@ def process
   end
 
   def report_problem(report)
-    self.error = {"task_id" => self.id, "extraction_report" => report}
+    self.error.push({"error_type" => ErrorType::EXTRACTION, "report" => report})
   end
 
   def extract_features

From 0908fd349270905170e3a0e6ea942e47b85b1f59 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Fri, 23 Apr 2021 14:38:06 -0500
Subject: [PATCH 07/20] Updated variable name to Ruby standards

---
 lib/extractor.rb | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/extractor.rb b/lib/extractor.rb
index 66d4749..cef38a2 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -48,18 +48,18 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         error.concat(extraction.error)
         items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
         errors = error.map {|o| Hash[o.each_pair.to_a]}
-        retVal = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
+        return_value = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
       rescue  StandardError => e
         error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"})
         errors = error.map {|o| Hash[o.each_pair.to_a]}
-        retVal = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []}
+        return_value = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []}
       end
 
 
       s3_path = "messages/#{web_id}.json"
       begin
         s3_client.put_object({
-             body: retVal.to_json,
+             body: return_value.to_json,
              bucket: "databank-demo-main",
              key: s3_path,
          })
@@ -73,7 +73,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
 
       s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]}
 
-      retVal = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors}
+      return_value = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors}
 
 
       sqs = Aws::SQS::Client.new(region: region)
@@ -86,7 +86,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
         # Create and send a message.
         sqs.send_message({
            queue_url: queue_url,
-           message_body: retVal.to_json,
+           message_body: return_value.to_json,
            message_attributes: {}
          })
         puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}"

From d942f2af4f2bda283ef98a19af2ff9cd9522cee9 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Mon, 26 Apr 2021 15:39:18 -0500
Subject: [PATCH 08/20] Reconfigured to get mime type from Databank

---
 lib/extractor.rb            |  4 ++--
 lib/extractor/extraction.rb | 22 +++++++++-------------
 lib/extractor/mime_type.rb  |  5 +++++
 3 files changed, 16 insertions(+), 15 deletions(-)
 create mode 100644 lib/extractor/mime_type.rb

diff --git a/lib/extractor.rb b/lib/extractor.rb
index cef38a2..1570025 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -8,7 +8,7 @@
 require_relative 'extractor/error_type.rb'
 
 class Extractor
-  def self.extract(bucket_name, object_key, binary_name, web_id)
+  def self.extract(bucket_name, object_key, binary_name, web_id, mime_type)
     begin
       status = ExtractionStatus::ERROR
       error = Array.new
@@ -40,7 +40,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id)
 
 
       begin
-        extraction = Extraction.new(binary_name, local_path, web_id)
+        extraction = Extraction.new(binary_name, local_path, web_id, mime_type)
         extraction.process
         status = extraction.status
         puts "status: #{status}"
diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index bb67297..9d5c94e 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -11,17 +11,19 @@
 require_relative 'extraction_status.rb'
 require_relative 'peek_type.rb'
 require_relative 'error_type.rb'
+require_relative 'mime_type.rb'
 
 class Extraction
 
-  attr_accessor :binary_name, :storage_path, :status, :peek_type, :peek_text, :id, :nested_items, :error
+  attr_accessor :binary_name, :storage_path, :status, :peek_type, :peek_text, :id, :nested_items, :error, :mime_type
 
-  def initialize(binary_name, storage_path, id)
+  def initialize(binary_name, storage_path, id, mime_type)
     @nested_items = Array.new
     @binary_name = binary_name
     @storage_path = storage_path
     @id = id
     @error = Array.new
+    @mime_type = mime_type
   end
 
   ALLOWED_CHAR_NUM = 1024 * 8
@@ -59,19 +61,16 @@ def report_problem(report)
   end
 
   def extract_features
-    mime_guess = top_level_mime || Extraction.mime_from_filename(self.binary_name) || 'application/octet-stream'
-
-    mime_parts = mime_guess.split("/")
+    mime_parts = @mime_type.split("/")
+    subtype = mime_parts[1].downcase
 
-    nonzip_archive_subtypes = ['x-7z-compressed', 'x-tar']
 
-    subtype = mime_parts[1].downcase
 
-    if subtype == 'zip'
+    if MimeType::ZIP.include?(subtype)
       return extract_zip
-    elsif nonzip_archive_subtypes.include?(subtype)
+    elsif MimeType::NON_ZIP_ARCHIVE.include?(subtype)
       return extract_archive
-    elsif self.binary_name.chars.last(6).join == 'tar.gz'
+    elsif MimeType::GZIP.include?(subtype)
       return extract_gzip
     else
       return extract_default
@@ -79,9 +78,6 @@ def extract_features
 
   end
 
-  def top_level_mime
-    Extraction.mime_from_path(self.storage_path)
-  end
 
   def self.mime_from_path(path)
     file_mime_response = MimeMagic.by_path(File.open("#{path}")).to_s
diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb
new file mode 100644
index 0000000..b915a34
--- /dev/null
+++ b/lib/extractor/mime_type.rb
@@ -0,0 +1,5 @@
+class MimeType
+  ZIP = ["x-zip-compressed", "zip"]
+  NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar"]
+  GZIP = ["x-gzip","gzip"]
+end
\ No newline at end of file

From d0125fee9a455892b66ce46a9f8f22c12387f077 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Wed, 28 Apr 2021 14:58:19 -0500
Subject: [PATCH 09/20] Corrected EFS path

---
 lib/extractor.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/extractor.rb b/lib/extractor.rb
index 1570025..fba17dc 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -17,7 +17,7 @@ def self.extract(bucket_name, object_key, binary_name, web_id, mime_type)
       region = 'us-east-2'
 
       s3_client = Aws::S3::Client.new(region: region)
-      del_path = "./mnt/efs/#{bucket_name}_#{web_id}"
+      del_path = "/mnt/efs/#{bucket_name}_#{web_id}"
       local_path = "#{del_path}/#{object_key}"
 
       dirname = File.dirname(local_path)

From 87b886ce91c1e1de43e40f9a4eab0625452ca2f0 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Mon, 3 May 2021 11:29:14 -0500
Subject: [PATCH 10/20] Added support for xz files

---
 lib/extractor/mime_type.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb
index b915a34..03df00b 100644
--- a/lib/extractor/mime_type.rb
+++ b/lib/extractor/mime_type.rb
@@ -1,5 +1,5 @@
 class MimeType
   ZIP = ["x-zip-compressed", "zip"]
-  NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar"]
+  NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz"]
   GZIP = ["x-gzip","gzip"]
 end
\ No newline at end of file

From 96e8816764e9030ec159a13d352d239bae0d6cc7 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Wed, 5 May 2021 08:09:17 -0500
Subject: [PATCH 11/20] Added support for rar mime type

---
 lib/extractor/extraction.rb | 22 ++++------------------
 lib/extractor/mime_type.rb  |  2 +-
 2 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index 9d5c94e..433c3de 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -198,7 +198,7 @@ def extract_archive
             if !is_ds_store(entry_path) && !is_mac_thing(entry_path)
               entry_paths << entry_path
 
-              if is_directory(entry.pathname)
+              if entry.directory? || is_directory(entry.pathname)
 
                 create_item(entry_path,
                             name_part(entry_path),
@@ -212,14 +212,7 @@ def extract_archive
                 extracted_entry_dir = File.dirname(extracted_entry_path)
                 FileUtils.mkdir_p extracted_entry_dir
 
-                entry_size = 0
-
-                File.open(extracted_entry_path, 'wb') do |entry_file|
-                  ar.read_data(1024) do |x|
-                    entry_file.write(x)
-                    entry_size = entry_size + x.length
-                  end
-                end
+                File.open(extracted_entry_path, 'wb')
 
                 raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path)
 
@@ -293,14 +286,7 @@ def extract_gzip
               extracted_entry_dir = File.dirname(extracted_entry_path)
               FileUtils.mkdir_p extracted_entry_dir
 
-              entry_size = 0
-
-              File.open(extracted_entry_path, 'wb') do |entry_file|
-                entry.read(1024) do |x|
-                  entry_file.write(x)
-                  entry_size = entry_size + x.length
-                end
-              end
+              File.open(extracted_entry_path, 'wb')
 
               raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path)
 
@@ -367,7 +353,7 @@ def valid_entry_path(entry_path)
   end
 
   def is_directory(path)
-    ends_in_slash(path) && !is_ds_store(path) && !is_mac_thing(path)
+    File.directory?(path) || (ends_in_slash(path) && !is_ds_store(path) && !is_mac_thing(path))
   end
 
   def is_mac_thing(path)
diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb
index 03df00b..4051a7e 100644
--- a/lib/extractor/mime_type.rb
+++ b/lib/extractor/mime_type.rb
@@ -1,5 +1,5 @@
 class MimeType
   ZIP = ["x-zip-compressed", "zip"]
-  NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz"]
+  NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz", "x-rar", "x-rar-compressed"]
   GZIP = ["x-gzip","gzip"]
 end
\ No newline at end of file

From b96319b4441b3d56efa5af5d5aeab5d1bb30a311 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Wed, 5 May 2021 14:31:43 -0500
Subject: [PATCH 12/20] Added support for gtar files

---
 lib/extractor/mime_type.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/extractor/mime_type.rb b/lib/extractor/mime_type.rb
index 4051a7e..3feaa3c 100644
--- a/lib/extractor/mime_type.rb
+++ b/lib/extractor/mime_type.rb
@@ -1,5 +1,5 @@
 class MimeType
   ZIP = ["x-zip-compressed", "zip"]
-  NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz", "x-rar", "x-rar-compressed"]
+  NON_ZIP_ARCHIVE = ["x-7z-compressed", "x-tar", "x-xz", "x-rar", "x-rar-compressed", "x-gtar"]
   GZIP = ["x-gzip","gzip"]
 end
\ No newline at end of file

From 5d185214c9a242e576e64c7de60bed3f419f77c1 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Fri, 7 May 2021 14:52:20 -0500
Subject: [PATCH 13/20] Corrected too many files open bug

---
 lib/extractor/extraction.rb | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index 433c3de..004ad73 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -80,7 +80,9 @@ def extract_features
 
 
   def self.mime_from_path(path)
-    file_mime_response = MimeMagic.by_path(File.open("#{path}")).to_s
+    file = File.open("#{path}")
+    file_mime_response = MimeMagic.by_path(file).to_s
+    file.close
 
     if file_mime_response.length > 0
       file_mime_response
@@ -212,7 +214,7 @@ def extract_archive
                 extracted_entry_dir = File.dirname(extracted_entry_path)
                 FileUtils.mkdir_p extracted_entry_dir
 
-                File.open(extracted_entry_path, 'wb')
+                file = File.open(extracted_entry_path, 'wb')
 
                 raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path)
 
@@ -226,7 +228,7 @@ def extract_archive
                             entry.size,
                             mime_guess,
                             false)
-
+                file.close
                 File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
               end
 
@@ -286,7 +288,7 @@ def extract_gzip
               extracted_entry_dir = File.dirname(extracted_entry_path)
               FileUtils.mkdir_p extracted_entry_dir
 
-              File.open(extracted_entry_path, 'wb')
+              file = File.open(extracted_entry_path, 'wb')
 
               raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path)
 
@@ -300,7 +302,7 @@ def extract_gzip
                           entry.size,
                           mime_guess,
                           false)
-
+              file.close
               File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
             end
 

From 7f4a63a2d111c8f8c0ff98e94517b50b5a1aadc2 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Wed, 23 Jun 2021 13:46:51 -0400
Subject: [PATCH 14/20] Fixed bug in processing tar archives

---
 lib/extractor/extraction.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index 004ad73..258735d 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -124,7 +124,7 @@ def extract_zip
             entry_path = valid_entry_path(entry.name)
 
 
-            if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path)
+            if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
 
               entry_paths << entry_path
 
@@ -197,7 +197,7 @@ def extract_archive
 
           if entry_path
 
-            if !is_ds_store(entry_path) && !is_mac_thing(entry_path)
+            if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
               entry_paths << entry_path
 
               if entry.directory? || is_directory(entry.pathname)

From b294f1d2a0932a564f8439f1793bfc81dc394457 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Thu, 30 Nov 2023 14:53:40 -0700
Subject: [PATCH 15/20] updated to add test coverage

---
 .idea/.gitignore                             |   8 +
 .idea/databank-archive-extractor.iml         |  47 +++
 .idea/inspectionProfiles/Project_Default.xml |   6 +
 .idea/misc.xml                               |   6 +
 .idea/modules.xml                            |   8 +
 .idea/vcs.xml                                |   6 +
 .ruby-version                                |   1 +
 Dockerfile                                   |   8 +-
 Gemfile                                      |  12 +-
 Gemfile.lock                                 | 100 ++++--
 Rakefile                                     |  11 +-
 bin/console                                  |  15 -
 bin/set-test-vars.rb                         |   5 +
 bin/setup                                    |   8 -
 config/settings.yml                          |   3 +
 config/settings/demo.yml                     |   8 +
 config/settings/prod.yml                     |   0
 config/settings/test.yml                     |   8 +
 docker/extractor/Dockerfile-test             |  27 ++
 lib/archive_extractor.rb                     | 131 +++++++
 lib/extractor.rb                             | 107 +-----
 lib/extractor/extraction.rb                  | 315 ++++++----------
 lib/extractor/extraction_type.rb             |   6 +
 test/archive_extractor_test.rb               | 158 ++++++++
 test/extraction_test.rb                      | 356 +++++++++++++++++++
 test/test.tar                                | Bin 0 -> 2048 bytes
 test/test.txt.gz                             | Bin 0 -> 37 bytes
 test/test.zip                                | Bin 0 -> 178 bytes
 test/test_helper.rb                          |  17 +
 29 files changed, 1011 insertions(+), 366 deletions(-)
 create mode 100644 .idea/.gitignore
 create mode 100644 .idea/databank-archive-extractor.iml
 create mode 100644 .idea/inspectionProfiles/Project_Default.xml
 create mode 100644 .idea/misc.xml
 create mode 100644 .idea/modules.xml
 create mode 100644 .idea/vcs.xml
 create mode 100644 .ruby-version
 delete mode 100755 bin/console
 create mode 100644 bin/set-test-vars.rb
 delete mode 100755 bin/setup
 create mode 100644 config/settings.yml
 create mode 100644 config/settings/demo.yml
 create mode 100644 config/settings/prod.yml
 create mode 100644 config/settings/test.yml
 create mode 100644 docker/extractor/Dockerfile-test
 create mode 100644 lib/archive_extractor.rb
 create mode 100644 lib/extractor/extraction_type.rb
 create mode 100644 test/archive_extractor_test.rb
 create mode 100644 test/extraction_test.rb
 create mode 100644 test/test.tar
 create mode 100644 test/test.txt.gz
 create mode 100644 test/test.zip
 create mode 100644 test/test_helper.rb

diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/databank-archive-extractor.iml b/.idea/databank-archive-extractor.iml
new file mode 100644
index 0000000..7a849e6
--- /dev/null
+++ b/.idea/databank-archive-extractor.iml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="RUBY_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/features" isTestSource="true" />
+      <sourceFolder url="file://$MODULE_DIR$/spec" isTestSource="true" />
+      <sourceFolder url="file://$MODULE_DIR$/test" isTestSource="true" />
+    </content>
+    <orderEntry type="jdk" jdkName="ruby-3.1.2-p20" jdkType="RUBY_SDK" />
+    <orderEntry type="sourceFolder" forTests="false" />
+    <orderEntry type="library" scope="PROVIDED" name="aws-eventstream (v1.2.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="aws-partitions (v1.854.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="aws-sdk-core (v3.187.1, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="aws-sdk-kms (v1.72.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="aws-sdk-s3 (v1.137.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="aws-sdk-sqs (v1.67.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="aws-sigv4 (v1.6.1, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="bundler (v2.3.22, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="concurrent-ruby (v1.2.2, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="config (v5.0.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="deep_merge (v1.2.2, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="docile (v1.4.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-configurable (v1.1.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-core (v1.0.1, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-inflector (v1.0.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-initializer (v3.1.1, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-logic (v1.5.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-schema (v1.13.3, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-types (v1.7.1, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="dry-validation (v1.10.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="jmespath (v1.6.2, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="mime-types (v3.5.1, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="mime-types-data (v3.2023.1003, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="mimemagic (v0.3.10, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="minitest (v5.20.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="nokogiri (v1.15.5, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="os (v1.1.4, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="racc (v1.7.3, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="rake (v13.1.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="rubyzip (v2.3.2, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="simplecov (v0.22.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="simplecov-html (v0.12.3, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="simplecov_json_formatter (v0.1.4, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="zeitwerk (v2.6.12, ruby-3.1.2-p20) [gem]" level="application" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..b0db9b0
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="Rubocop" enabled="false" level="WARNING" enabled_by_default="false" />
+  </profile>
+</component>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..1f18249
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="rbenv: 2.7.2" project-jdk-type="RUBY_SDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..1127073
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/databank-archive-extractor.iml" filepath="$PROJECT_DIR$/.idea/databank-archive-extractor.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/.ruby-version b/.ruby-version
new file mode 100644
index 0000000..ef538c2
--- /dev/null
+++ b/.ruby-version
@@ -0,0 +1 @@
+3.1.2
diff --git a/Dockerfile b/Dockerfile
index 2de759a..4f10663 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version.
-FROM ruby:2.7.2
+FROM ruby:3.1.2
 
 ENV RAILS_ENV=production
 ENV RAILS_LOG_TO_STDOUT=true
@@ -18,15 +18,11 @@ WORKDIR app
 # Copy the Gemfile as well as the Gemfile.lock and install gems.
 # This is a separate step so the dependencies will be cached.
 COPY Gemfile Gemfile.lock  ./
-RUN gem install bundler && bundle install --without development test --jobs 20 --retry 5
+RUN gem install bundler && bundle install
 
 # Copy the main application, except whatever is listed in .dockerignore.
 COPY . ./
 
-#RUN bin/rails assets:precompile
-
-EXPOSE 3000
-
 # This is the web server entry point. It will need to be overridden when
 # running the workers.
 CMD ["echo", "Error running task, please check the container override command!"]
diff --git a/Gemfile b/Gemfile
index 73c0c87..2013c04 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,16 +1,18 @@
 source "https://rubygems.org"
 git_source(:github) { |repo| "https://github.com/#{repo}.git" }
 
-ruby '2.7.2'
-
 # Use mime-types to determine mimetypes based on extension
 gem 'mime-types', require: 'mime/types/full'
 
 # Use rubyzip to read zip files
 gem 'rubyzip'
 
+gem 'config'
+
 # Use archive for non-zip archive files
-gem 'libarchive'
+# gem 'libarchive'
+# gem 'libarchive-ruby'
+gem 'ffi-libarchive'
 
 # Use os to interact with operating system
 gem 'os'
@@ -24,3 +26,7 @@ gem "aws-sdk-s3"
 
 gem "aws-sdk-sqs"
 
+gem 'minitest'
+
+gem 'simplecov'
+
diff --git a/Gemfile.lock b/Gemfile.lock
index e49612c..d6d7b8e 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,53 +1,99 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    aws-eventstream (1.1.1)
-    aws-partitions (1.436.0)
-    aws-sdk-core (3.113.0)
+    aws-eventstream (1.2.0)
+    aws-partitions (1.854.0)
+    aws-sdk-core (3.187.1)
       aws-eventstream (~> 1, >= 1.0.2)
-      aws-partitions (~> 1, >= 1.239.0)
+      aws-partitions (~> 1, >= 1.651.0)
+      aws-sigv4 (~> 1.5)
+      jmespath (~> 1, >= 1.6.1)
+    aws-sdk-kms (1.72.0)
+      aws-sdk-core (~> 3, >= 3.184.0)
       aws-sigv4 (~> 1.1)
-      jmespath (~> 1.0)
-    aws-sdk-kms (1.43.0)
-      aws-sdk-core (~> 3, >= 3.112.0)
-      aws-sigv4 (~> 1.1)
-    aws-sdk-s3 (1.93.0)
-      aws-sdk-core (~> 3, >= 3.112.0)
+    aws-sdk-s3 (1.137.0)
+      aws-sdk-core (~> 3, >= 3.181.0)
       aws-sdk-kms (~> 1)
+      aws-sigv4 (~> 1.6)
+    aws-sdk-sqs (1.67.0)
+      aws-sdk-core (~> 3, >= 3.184.0)
       aws-sigv4 (~> 1.1)
-    aws-sdk-sqs (1.38.0)
-      aws-sdk-core (~> 3, >= 3.112.0)
-      aws-sigv4 (~> 1.1)
-    aws-sigv4 (1.2.3)
+    aws-sigv4 (1.6.1)
       aws-eventstream (~> 1, >= 1.0.2)
-    jmespath (1.4.0)
-    mime-types (3.3.1)
+    concurrent-ruby (1.2.2)
+    config (5.0.0)
+      deep_merge (~> 1.2, >= 1.2.1)
+      dry-validation (~> 1.0, >= 1.0.0)
+    deep_merge (1.2.2)
+    docile (1.4.0)
+    dry-configurable (1.1.0)
+      dry-core (~> 1.0, < 2)
+      zeitwerk (~> 2.6)
+    dry-core (1.0.1)
+      concurrent-ruby (~> 1.0)
+      zeitwerk (~> 2.6)
+    dry-inflector (1.0.0)
+    dry-initializer (3.1.1)
+    dry-logic (1.5.0)
+      concurrent-ruby (~> 1.0)
+      dry-core (~> 1.0, < 2)
+      zeitwerk (~> 2.6)
+    dry-schema (1.13.3)
+      concurrent-ruby (~> 1.0)
+      dry-configurable (~> 1.0, >= 1.0.1)
+      dry-core (~> 1.0, < 2)
+      dry-initializer (~> 3.0)
+      dry-logic (>= 1.4, < 2)
+      dry-types (>= 1.7, < 2)
+      zeitwerk (~> 2.6)
+    dry-types (1.7.1)
+      concurrent-ruby (~> 1.0)
+      dry-core (~> 1.0)
+      dry-inflector (~> 1.0)
+      dry-logic (~> 1.4)
+      zeitwerk (~> 2.6)
+    dry-validation (1.10.0)
+      concurrent-ruby (~> 1.0)
+      dry-core (~> 1.0, < 2)
+      dry-initializer (~> 3.0)
+      dry-schema (>= 1.12, < 2)
+      zeitwerk (~> 2.6)
+    jmespath (1.6.2)
+    mime-types (3.5.1)
       mime-types-data (~> 3.2015)
-    mime-types-data (3.2021.0225)
+    mime-types-data (3.2023.1003)
     mimemagic (0.3.10)
       nokogiri (~> 1)
       rake
-    nokogiri (1.11.2-x86_64-darwin)
+    minitest (5.20.0)
+    nokogiri (1.15.5-x86_64-darwin)
       racc (~> 1.4)
-    os (1.1.1)
-    racc (1.5.2)
-    rake (13.0.3)
-    rubyzip (2.3.0)
+    os (1.1.4)
+    racc (1.7.3)
+    rake (13.1.0)
+    rubyzip (2.3.2)
+    simplecov (0.22.0)
+      docile (~> 1.1)
+      simplecov-html (~> 0.11)
+      simplecov_json_formatter (~> 0.1)
+    simplecov-html (0.12.3)
+    simplecov_json_formatter (0.1.4)
+    zeitwerk (2.6.12)
 
 PLATFORMS
-  x86_64-darwin-19
+  x86_64-darwin-21
 
 DEPENDENCIES
   aws-sdk-s3
   aws-sdk-sqs
+  config
   mime-types
   mimemagic (~> 0.3.6)
+  minitest
   os
   rake (~> 13.0)
   rubyzip
-
-RUBY VERSION
-   ruby 2.7.2p137
+  simplecov
 
 BUNDLED WITH
-   2.2.4
+   2.3.22
diff --git a/Rakefile b/Rakefile
index cd510a0..0abb012 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,4 +1,11 @@
 # frozen_string_literal: true
 
-require "bundler/gem_tasks"
-task default: %i[]
+require 'rake/testtask'
+require 'simplecov'
+require_relative 'bin/set-test-vars'
+
+Rake::TestTask.new(:test) do |t|
+  t.libs << 'lib' << 'test'
+  # t.libs << 'lib'
+  t.test_files = FileList['test/*_test.rb']
+end
diff --git a/bin/console b/bin/console
deleted file mode 100755
index 8096e50..0000000
--- a/bin/console
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-require "bundler/setup"
-require "databank/archive/extractor"
-
-# You can add fixtures and/or initialization code here to make experimenting
-# with your gem easier. You can also use a different console, if you like.
-
-# (If you use this, don't forget to add pry to your Gemfile!)
-# require "pry"
-# Pry.start
-
-require "irb"
-IRB.start(__FILE__)
diff --git a/bin/set-test-vars.rb b/bin/set-test-vars.rb
new file mode 100644
index 0000000..7959c7c
--- /dev/null
+++ b/bin/set-test-vars.rb
@@ -0,0 +1,5 @@
+#!/usr/bin/env ruby
+# Test bootstrap: RUBY_HOME is /extractor under Docker, otherwise this checkout's root (parent of bin/).
+ENV['RUBY_ENV'] = 'test'
+ENV['RUBY_HOME'] = ENV['IS_DOCKER'] == 'true' ? '/extractor' : File.expand_path('..', __dir__)
+ENV['RUBY_TEST_HOME'] = "#{ENV['RUBY_HOME']}/test"
diff --git a/bin/setup b/bin/setup
deleted file mode 100755
index dce67d8..0000000
--- a/bin/setup
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-IFS=$'\n\t'
-set -vx
-
-bundle install
-
-# Do any other automated setup that you need to do here
diff --git a/config/settings.yml b/config/settings.yml
new file mode 100644
index 0000000..ed17934
--- /dev/null
+++ b/config/settings.yml
@@ -0,0 +1,3 @@
+aws:
+    region: "us-east-2"
+
diff --git a/config/settings/demo.yml b/config/settings/demo.yml
new file mode 100644
index 0000000..18367ed
--- /dev/null
+++ b/config/settings/demo.yml
@@ -0,0 +1,8 @@
+aws:
+    efs:
+        mount_point: "/mnt/efs/"
+    sqs:
+        queue_name: "extractor-to-databank-demo"
+        queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-demo"
+    s3:
+        json_bucket: "databank-demo-main"
diff --git a/config/settings/prod.yml b/config/settings/prod.yml
new file mode 100644
index 0000000..e69de29
diff --git a/config/settings/test.yml b/config/settings/test.yml
new file mode 100644
index 0000000..db955db
--- /dev/null
+++ b/config/settings/test.yml
@@ -0,0 +1,8 @@
+aws:
+    efs:
+        mount_point: "test/efs/"
+    sqs:
+        queue_name: "extractor-to-databank-test"
+        queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-test"
+    s3:
+        json_bucket: "databank-test-main"
\ No newline at end of file
diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test
new file mode 100644
index 0000000..8baa77c
--- /dev/null
+++ b/docker/extractor/Dockerfile-test
@@ -0,0 +1,27 @@
+FROM ruby:3.1.2
+#FROM --platform=linux/arm64 ruby:3.1.2
+
+ENV RAILS_ENV=test
+ENV RAILS_LOG_TO_STDOUT=true
+ENV RUBY_HOME=/extractor
+ENV IS_DOCKER=true
+
+RUN apt-get update && apt-get install -y \
+  build-essential \
+  git \
+  libpq-dev \
+  libarchive-dev
+
+# Copy only the Gemfile (the Gemfile.lock copy below is commented out) and install gems.
+# This is a separate step so the dependencies will be cached.
+RUN mkdir extractor
+WORKDIR extractor
+
+#COPY Gemfile Gemfile.lock  ./
+COPY Gemfile ./
+RUN gem install bundler && bundle install
+
+# Copy the main application, except whatever is listed in .dockerignore.
+COPY . ./
+
+CMD ["rake", "test"]
\ No newline at end of file
diff --git a/lib/archive_extractor.rb b/lib/archive_extractor.rb
new file mode 100644
index 0000000..35c411d
--- /dev/null
+++ b/lib/archive_extractor.rb
@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+require 'aws-sdk-sqs'
+require 'aws-sdk-s3'
+require 'fileutils'
+require 'json'
+require 'config'
+require 'logger'
+
+
+require_relative 'extractor/extraction'
+require_relative 'extractor/extraction_status'
+require_relative 'extractor/error_type'
+
+class ArchiveExtractor
+  attr_accessor :s3, :sqs, :bucket_name, :object_key, :binary_name, :web_id, :mime_type, :extraction
+  Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV']))
+  LOGGER = Logger.new(STDOUT)
+
+  def initialize(bucket_name, object_key, binary_name, web_id, mime_type, sqs, s3)
+    @bucket_name = bucket_name
+    @object_key = object_key
+    @binary_name = binary_name
+    @web_id = web_id
+    @mime_type = mime_type
+    @sqs = sqs
+    @s3 = s3
+  end
+
+  # Downloads the S3 object under the EFS mount point, runs the extraction, stores the
+  # JSON result at messages/<web_id>.json and sends the outcome to the SQS queue.
+  # The scratch directory is removed afterwards, even when one of the steps raises.
+  def extract
+    error = []
+
+    del_path = "#{Settings.aws.efs.mount_point}#{@bucket_name}_#{@web_id}"
+    local_path = "#{del_path}/#{@object_key}"
+
+    dirname = File.dirname(local_path)
+    FileUtils.mkdir_p(dirname) unless File.directory?(dirname)
+
+    get_object(local_path, error)
+
+    extraction = Extraction.new(@binary_name, local_path, @web_id, @mime_type)
+    return_value = perform_extraction(extraction, error)
+    s3_path = "messages/#{@web_id}.json"
+    s3_put_status, s3_put_error = put_json_response(return_value, s3_path)
+
+    s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]}
+
+    return_value = {"bucket_name" => @bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors}
+    send_sqs_message(return_value)
+  ensure
+    # dirname/del_path are still nil when an earlier line raised before assigning them;
+    # rm_rf(nil) would raise TypeError here and hide the original exception.
+    FileUtils.rm_rf(dirname, :secure => true) if dirname
+    FileUtils.rm_rf(del_path, :secure => true) if del_path
+  end
+
+  def get_object(local_path, error)
+    begin
+      @s3.get_object({
+                      response_target: local_path,
+                      bucket: @bucket_name,
+                      key: @object_key,
+                     })
+      LOGGER.info("Getting object #{@object_key} with ID #{@web_id} from #{@bucket_name}")
+    rescue StandardError => e
+      s3_error = "Error getting object #{@object_key} with ID #{@web_id} from S3 bucket #{@bucket_name}: #{e.message}"
+      LOGGER.error(s3_error)
+      error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error})
+    end
+    return error
+  end
+
+  def perform_extraction(extraction, error)
+    begin
+      extraction.process
+      status = extraction.status
+      LOGGER.info("status: #{status}")
+      LOGGER.error("error: #{extraction.error}") if status == ExtractionStatus::ERROR
+      error.concat(extraction.error)
+      items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
+      errors = error.map {|o| Hash[o.each_pair.to_a]}
+      return_value = {"web_id" => @web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
+    rescue  StandardError => e
+      error.push({"task_id" => @web_id, "extraction_process_report" => "Error extracting #{@object_key} with ID #{@web_id}: #{e.message}"})
+      errors = error.map {|o| Hash[o.each_pair.to_a]}
+      return_value = {"web_id" => @web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => nil, "nested_items" => []}
+    end
+    return return_value
+  end
+
+  def send_sqs_message(return_value)
+    # Send a message to a queue.
+    queue_name = Settings.aws.sqs.queue_name
+    queue_url = Settings.aws.sqs.queue_url
+
+    begin
+      # Create and send a message.
+      @sqs.send_message({
+                          queue_url: queue_url,
+                          message_body: return_value.to_json,
+                          message_attributes: {}
+                        })
+      LOGGER.info("Sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}")
+    rescue StandardError => e
+      LOGGER.error("Error sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}: #{e.message}")
+    end
+  end
+
+  def put_json_response(return_value, s3_path)
+    s3_put_error = []
+    begin
+      @s3.put_object({
+                       body: return_value.to_json,
+                       bucket: Settings.aws.s3.json_bucket,
+                       key: s3_path,
+                     })
+      LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name} with key #{s3_path}")
+      s3_put_status = ExtractionStatus::SUCCESS
+    rescue StandardError => e
+      s3_put_status = ExtractionStatus::ERROR
+      s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name}: #{e.message}"
+      s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message})
+      LOGGER.error(s3_put_error_message)
+    end
+    return s3_put_status, s3_put_error
+  end
+
+
+end
diff --git a/lib/extractor.rb b/lib/extractor.rb
index fba17dc..94f7176 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -1,105 +1,18 @@
 require 'aws-sdk-sqs'
 require 'aws-sdk-s3'
-require 'fileutils'
-require 'json'
+require 'config'
 
-require_relative 'extractor/extraction.rb'
-require_relative 'extractor/extraction_status.rb'
-require_relative 'extractor/error_type.rb'
+require_relative 'archive_extractor'
 
 class Extractor
-  def self.extract(bucket_name, object_key, binary_name, web_id, mime_type)
-    begin
-      status = ExtractionStatus::ERROR
-      error = Array.new
-      s3_put_status = ExtractionStatus::SUCCESS
-      s3_put_error = Array.new
-      region = 'us-east-2'
-
-      s3_client = Aws::S3::Client.new(region: region)
-      del_path = "/mnt/efs/#{bucket_name}_#{web_id}"
-      local_path = "#{del_path}/#{object_key}"
-
-      dirname = File.dirname(local_path)
-      unless File.directory?(dirname)
-        FileUtils.mkdir_p(dirname)
-      end
-
-      begin
-        s3_client.get_object(
-            response_target: local_path,
-            bucket: bucket_name,
-            key: object_key,
-        )
-        puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}"
-      rescue StandardError => e
-        s3_error= "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"
-        error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error})
-        puts s3_error
-      end
-
-
-      begin
-        extraction = Extraction.new(binary_name, local_path, web_id, mime_type)
-        extraction.process
-        status = extraction.status
-        puts "status: #{status}"
-        puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR
-        error.concat(extraction.error)
-        items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
-        errors = error.map {|o| Hash[o.each_pair.to_a]}
-        return_value = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
-      rescue  StandardError => e
-        error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"})
-        errors = error.map {|o| Hash[o.each_pair.to_a]}
-        return_value = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []}
-      end
-
-
-      s3_path = "messages/#{web_id}.json"
-      begin
-        s3_client.put_object({
-             body: return_value.to_json,
-             bucket: "databank-demo-main",
-             key: s3_path,
-         })
-        puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key #{s3_path}"
-      rescue StandardError => e
-        s3_put_status = ExtractionStatus::ERROR
-        s3_put_error_message = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}"
-        s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message})
-        puts s3_put_error_message
-      end
-
-      s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]}
+  Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV']))
 
-      return_value = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors}
-
-
-      sqs = Aws::SQS::Client.new(region: region)
-
-      # Send a message to a queue.
-      queue_name = "extractor-to-databank-demo"
-      queue_url = sqs.get_queue_url(queue_name: queue_name).queue_url
-
-      begin
-        # Create and send a message.
-        sqs.send_message({
-           queue_url: queue_url,
-           message_body: return_value.to_json,
-           message_attributes: {}
-         })
-        puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}"
-      rescue StandardError => e
-      puts "Error sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}: #{e.message}"
-      end
-
-
-    ensure
-      FileUtils.rm_rf(dirname, :secure => true)
-      FileUtils.rm_rf(del_path, :secure => true)
-
-    end
-   end
+  # Entry point: builds S3 and SQS clients for the configured region and runs ArchiveExtractor#extract on the object.
+  def self.extract(bucket_name, object_key, binary_name, web_id, mime_type)
+    aws_region = Settings.aws.region
+    s3 = Aws::S3::Client.new(region: aws_region)
+    sqs = Aws::SQS::Client.new(region: aws_region)
+    ArchiveExtractor.new(bucket_name, object_key, binary_name, web_id, mime_type, sqs, s3).extract
+  end
 
 end
\ No newline at end of file
diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index 258735d..f5eaaae 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -5,49 +5,51 @@
 require 'mimemagic/overlay'
 require 'zip'
 require 'zlib'
-require 'libarchive'
+require 'ffi-libarchive'
 require 'rubygems/package'
+require 'config'
+require 'logger'
 
-require_relative 'extraction_status.rb'
-require_relative 'peek_type.rb'
-require_relative 'error_type.rb'
-require_relative 'mime_type.rb'
+require_relative 'extraction_status'
+require_relative 'extraction_type'
+require_relative 'peek_type'
+require_relative 'error_type'
+require_relative 'mime_type'
 
 class Extraction
 
   attr_accessor :binary_name, :storage_path, :status, :peek_type, :peek_text, :id, :nested_items, :error, :mime_type
-
+  ALLOWED_CHAR_NUM = 1024 * 8
+  ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8
+  LOGGER = Logger.new(STDOUT)
   def initialize(binary_name, storage_path, id, mime_type)
-    @nested_items = Array.new
     @binary_name = binary_name
     @storage_path = storage_path
     @id = id
-    @error = Array.new
     @mime_type = mime_type
+    @nested_items = []
+    @error = []
   end
 
-  ALLOWED_CHAR_NUM = 1024 * 8
-  ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8
-
   def process
     begin
       features_extracted = extract_features
       if features_extracted
-        self.status = ExtractionStatus::SUCCESS
+        @status = ExtractionStatus::SUCCESS
       else
-        self.status = ExtractionStatus::ERROR
+        @status = ExtractionStatus::ERROR
       end
     rescue StandardError => error
-      self.status = ExtractionStatus::ERROR
-      self.peek_type = PeekType::NONE
+      @status = ExtractionStatus::ERROR
+      @peek_type = PeekType::NONE
       report_problem(error.message)
     ensure
-      if self.peek_text && self.peek_text.encoding.name != 'UTF-8'
+      if @peek_text && @peek_text.encoding.name != 'UTF-8'
         begin
-          self.peek_text.encode('UTF-8')
+          @peek_text.encode('UTF-8')
         rescue Encoding::UndefinedConversionError
-          self.peek_text = nil
-          self.peek_type = PeekType::NONE
+          @peek_text = nil
+          @peek_type = PeekType::NONE
           report_problem('invalid encoding for peek text')
         rescue Exception => ex
           report_problem("invalid encoding and problem character: #{ex.class}, #{ex.message}")
@@ -57,15 +59,13 @@ def process
   end
 
   def report_problem(report)
-    self.error.push({"error_type" => ErrorType::EXTRACTION, "report" => report})
+    @error.push({"error_type" => ErrorType::EXTRACTION, "report" => report})
   end
 
   def extract_features
     mime_parts = @mime_type.split("/")
     subtype = mime_parts[1].downcase
 
-
-
     if MimeType::ZIP.include?(subtype)
       return extract_zip
     elsif MimeType::NON_ZIP_ARCHIVE.include?(subtype)
@@ -75,11 +75,10 @@ def extract_features
     else
       return extract_default
     end
-
   end
 
 
-  def self.mime_from_path(path)
+  def mime_from_path(path)
     file = File.open("#{path}")
     file_mime_response = MimeMagic.by_path(file).to_s
     file.close
@@ -97,7 +96,7 @@ def self.mime_from_path(path)
     end
   end
 
-  def self.mime_from_filename(filename)
+  def mime_from_filename(filename)
     mime_guesses = MIME::Types.type_for(filename).first.content_type
     if mime_guesses.length > 0
       mime_guesses
@@ -107,78 +106,27 @@ def self.mime_from_filename(filename)
   end
 
   def create_item(item_path, item_name, item_size, media_type, is_directory)
-    item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type, "is_directory" => is_directory}
+    item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type,
+            "is_directory" => is_directory}
     @nested_items.push(item)
-
   end
 
   def extract_zip
     begin
-      puts "Extracting zip file #{binary_name}"
+      LOGGER.info("Extracting zip file #{@binary_name}")
       entry_paths = []
-      Zip::File.open(self.storage_path) do |zip_file|
+      Zip::File.open(@storage_path) do |zip_file|
         zip_file.each do |entry|
-
           if entry.name_safe?
-
-            entry_path = valid_entry_path(entry.name)
-
-
-            if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
-
-              entry_paths << entry_path
-
-              if is_directory(entry.name)
-
-                create_item(entry_path,
-                            name_part(entry_path),
-                            entry.size,
-                            'directory',
-                            true)
-
-              else
-
-                storage_dir = File.dirname(storage_path)
-                extracted_entry_path = File.join(storage_dir, entry_path)
-                extracted_entry_dir = File.dirname(extracted_entry_path)
-                FileUtils.mkdir_p extracted_entry_dir
-
-                raise Exception.new("extracted entry somehow already there?!!?!") if File.exist?(extracted_entry_path)
-
-                entry.extract(extracted_entry_path)
-
-                raise Exception.new("extracting entry not working!") unless File.exist?(extracted_entry_path)
-
-                mime_guess = Extraction.mime_from_path(extracted_entry_path) ||
-                    Extraction.mime_from_filename(entry.name) ||
-                    'application/octet-stream'
-
-                create_item(entry_path,
-                            name_part(entry_path),
-                            entry.size,
-                            mime_guess,
-                            false)
-                File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
-              end
-
-            end
+            entry_paths = extract_entry(entry, entry.name, entry_paths, ExtractionType::ZIP)
           end
         end
       end
-
-
-      if entry_paths.length > 0
-        self.peek_type = PeekType::LISTING
-        self.peek_text = entry_paths_arr_to_html(entry_paths)
-      else
-        self.peek_type = PeekType::NONE
-        report_problem("no items found for zip listing for task #{self.id}")
-      end
-
+      handle_entry_paths(entry_paths)
       return true
     rescue StandardError => ex
-      self.status = ExtractionStatus::ERROR
-      self.peek_type = PeekType::NONE
+      @status = ExtractionStatus::ERROR
+      @peek_type = PeekType::NONE
       report_problem("problem extracting zip listing for task: #{ex.message}")
       #return false
       raise ex
@@ -187,161 +135,112 @@ def extract_zip
 
   def extract_archive
     begin
-      puts "Extracting archive file #{binary_name}"
+      LOGGER.info("Extracting archive file #{@binary_name}")
       entry_paths = []
-
       Archive.read_open_filename(self.storage_path) do |ar|
         while entry = ar.next_header
-
-          entry_path = valid_entry_path(entry.pathname)
-
-          if entry_path
-
-            if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
-              entry_paths << entry_path
-
-              if entry.directory? || is_directory(entry.pathname)
-
-                create_item(entry_path,
-                            name_part(entry_path),
-                            entry.size,
-                            'directory',
-                            true)
-              else
-
-                storage_dir = File.dirname(storage_path)
-                extracted_entry_path = File.join(storage_dir, entry_path)
-                extracted_entry_dir = File.dirname(extracted_entry_path)
-                FileUtils.mkdir_p extracted_entry_dir
-
-                file = File.open(extracted_entry_path, 'wb')
-
-                raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path)
-
-                mime_guess = Extraction.mime_from_path(extracted_entry_path) ||
-                    mime_from_filename(entry.name) ||
-                    'application/octet-stream'
-
-
-                create_item(entry_path,
-                            name_part(entry_path),
-                            entry.size,
-                            mime_guess,
-                            false)
-                file.close
-                File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
-              end
-
-            end
-
-          end
+          entry_paths = extract_entry(entry, entry.pathname, entry_paths, ExtractionType::ARCHIVE)
         end
       end
-
-      if entry_paths.length > 0
-        self.peek_type = PeekType::LISTING
-        self.peek_text = entry_paths_arr_to_html(entry_paths)
-        return true
-      else
-        self.peek_type = PeekType::NONE
-        report_problem("no items found for archive listing for task #{self.id}")
-        return false
-      end
+      handle_entry_paths(entry_paths)
 
     rescue StandardError => ex
-      self.status = ExtractionStatus::ERROR
-      self.peek_type = PeekType::NONE
-
-      report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}")
+      LOGGER.error(ex)
+      @status = ExtractionStatus::ERROR
+      @peek_type = PeekType::NONE
+      report_problem("problem extracting extract listing for task #{@id}: #{ex.message}")
       return false
     end
   end
 
   def extract_gzip
     begin
-      puts "Extracting gzip file #{binary_name}"
+      LOGGER.info("Extracting gzip file #{@binary_name}")
       entry_paths = []
 
-      tar_extract = Gem::Package::TarReader.new(Zlib::GzipReader.open(self.storage_path))
-      tar_extract.rewind # The extract has to be rewinded after every iteration
+      tar_extract = Gem::Package::TarReader.new(Zlib::GzipReader.open(@storage_path))
+      tar_extract.rewind # The extract has to be rewound after every iteration
       tar_extract.each do |entry|
+        entry_paths = extract_entry(entry, entry.full_name, entry_paths, ExtractionType::GZIP)
+      end
+      handle_entry_paths(entry_paths) # its boolean result is this method's return value on the success path
 
-        entry_path = valid_entry_path(entry.full_name)
-        if entry_path
-
-          if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
-
-
-            entry_paths << entry_path
-
-            if entry.directory?
-
-              create_item(entry_path,
-                          name_part(entry_path),
-                          entry.size,
-                          'directory',
-                          true)
-            else
-
-              storage_dir = File.dirname(storage_path)
-              extracted_entry_path = File.join(storage_dir, entry_path)
-              extracted_entry_dir = File.dirname(extracted_entry_path)
-              FileUtils.mkdir_p extracted_entry_dir
-
-              file = File.open(extracted_entry_path, 'wb')
-
-              raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path)
-
-              mime_guess = Extraction.mime_from_path(extracted_entry_path) ||
-                  mime_from_filename(entry.name) ||
-                  'application/octet-stream'
+    rescue StandardError => ex
+      @status = ExtractionStatus::ERROR
+      @peek_type = PeekType::NONE
 
+      report_problem("problem extracting extract listing for task #{@id}: #{ex.message}")
+      return false
+    end
+  ensure
+    tar_extract&.close # tar_extract is still nil when opening the gzip/tar reader raised; avoid a NoMethodError here
+  end
 
-              create_item(entry_path,
-                          name_part(entry_path),
-                          entry.size,
-                          mime_guess,
-                          false)
-              file.close
-              File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
-            end
+  # Records one archive entry as a nested item and appends its path to entry_paths.
+  # Entries rejected by valid_entry_path, is_ds_store, is_mac_thing or is_mac_tar_thing are skipped.
+  # For file entries a temporary file is created under storage_path's directory for MIME guessing, then deleted.
+  # Returns entry_paths; raises RuntimeError when the temporary file already exists or was not created.
+  def extract_entry(entry, entry_name, entry_paths, type)
+    entry_path = valid_entry_path(entry_name)
+    if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
+      entry_paths << entry_path
+      if entry.directory? || is_directory(entry_name)
+        create_item(entry_path, name_part(entry_path), entry.size, 'directory', true)
+      else
+        storage_dir = File.dirname(storage_path)
+        extracted_entry_path = File.join(storage_dir, entry_path)
+        extracted_entry_dir = File.dirname(extracted_entry_path)
+        FileUtils.mkdir_p extracted_entry_dir
 
-          end
+        raise "extracted entry somehow already there?!!?!" if File.exist?(extracted_entry_path)
 
+        file = nil
+        case type
+        when ExtractionType::ZIP
+          entry.extract(extracted_entry_path)
+        else
+          file = File.open(extracted_entry_path, 'wb') # empty placeholder: nothing is written to it in this method
         end
+        raise("extracting #{type} entry not working!") unless File.exist?(extracted_entry_path)
+
+        mime_guess = mime_from_path(extracted_entry_path) ||
+          mime_from_filename(entry_name) ||
+          'application/octet-stream'
+
+        create_item(entry_path,
+                    name_part(entry_path),
+                    entry.size,
+                    mime_guess,
+                    false)
+        file.close if file
+        File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
       end
+    end
+    entry_paths
+  end
 
-      if entry_paths.length > 0
-        self.peek_type = PeekType::LISTING
-        self.peek_text = entry_paths_arr_to_html(entry_paths)
-        return true
-      else
-        self.peek_type = PeekType::NONE
-        report_problem("no items found for archive listing for task #{self.id}")
-        return false
-      end
-
-    rescue StandardError => ex
-      self.status = ExtractionStatus::ERROR
-      self.peek_type = PeekType::NONE
-
-      report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}")
+  # Sets the listing peek from entry_paths and returns true; with no entries, reports a problem and returns false.
+  def handle_entry_paths(entry_paths)
+    if entry_paths.length > 0
+      @peek_type = PeekType::LISTING
+      @peek_text = entry_paths_arr_to_html(entry_paths)
+      return true
+    else
+      @peek_type = PeekType::NONE
+      report_problem("no items found for archive listing for task #{@id}")
       return false
-
-      tar_extract.close
     end
-
   end
 
   def extract_default
-    puts "Default extraction for #{binary_name}"
+    LOGGER.info("Default extraction for #{@binary_name}")
     begin
-      self.peek_type = PeekType::NONE
+      @peek_type = PeekType::NONE
       return true
     rescue StandardError => ex
-      self.status = ExtractionStatus::ERROR
-      self.peek_type = PeekType::NONE
-      report_problem("problem creating default peek for task #{self.id}")
+      @status = ExtractionStatus::ERROR
+      @peek_type = PeekType::NONE
+      report_problem("problem creating default peek for task #{@id}: #{ex}")
       return false
     end
   end
@@ -391,7 +290,7 @@ def name_part(path)
   def entry_paths_arr_to_html(entry_paths)
     return_string = '<span class="glyphicon glyphicon-folder-open"></span> '
 
-    return_string << self.binary_name
+    return_string << @binary_name
 
     entry_paths.each do |entry_path|
 
diff --git a/lib/extractor/extraction_type.rb b/lib/extractor/extraction_type.rb
new file mode 100644
index 0000000..cbe1283
--- /dev/null
+++ b/lib/extractor/extraction_type.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+class ExtractionType
+  ZIP = 'zip' # passed by extract_zip; extract_entry extracts the entry itself for this type
+  GZIP = 'gzip' # passed by extract_gzip
+  ARCHIVE = 'archive' # passed by extract_archive
+end
diff --git a/test/archive_extractor_test.rb b/test/archive_extractor_test.rb
new file mode 100644
index 0000000..1ed4313
--- /dev/null
+++ b/test/archive_extractor_test.rb
@@ -0,0 +1,158 @@
+# frozen_string_literal: true
+require_relative 'test_helper'
+
+class TestArchiveExtractor < Minitest::Test
+  Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test'))
+  def setup
+    bucket_name = 'test-bucket'
+    object_key = 'test-key'
+    binary_name = 'test'
+    web_id = 'test-id'
+    mime_type = 'application/zip'
+    @sqs = Minitest::Mock.new
+    @s3 = Minitest::Mock.new
+    @archive_extractor = ArchiveExtractor.new(bucket_name, object_key, binary_name, web_id, mime_type, @sqs, @s3)
+  end
+
+  def test_extract
+    # setup
+    @archive_extractor.binary_name = 'test.zip'
+    @archive_extractor.web_id = 'test-zip'
+    @archive_extractor.mime_type = 'application/zip'
+    @archive_extractor.object_key = 'test.zip'
+    del_path = "#{Settings.aws.efs.mount_point}#{@archive_extractor.bucket_name}_#{@archive_extractor.web_id}"
+    local_path = "#{del_path}/#{@archive_extractor.object_key}"
+    file_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+    dirname = File.dirname(local_path)
+    unless File.directory?(dirname)
+      FileUtils.mkdir_p(dirname)
+    end
+    FileUtils.cp(file_path, local_path)
+    @s3.expect(:get_object, nil, [{response_target: local_path, bucket: @archive_extractor.bucket_name,
+                                                   key: @archive_extractor.object_key}])
+    peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test.zip<div class='indent'><span class='glyphicon glyphicon-file'></span> test.txt</div>"
+    items = [{'item_name' => 'test.txt', 'item_path' => 'test.txt', 'item_size' => 12, 'media_type' => 'text/plain', 'is_directory' => false}]
+    return_value = {'web_id' => 'test-zip', 'status' => ExtractionStatus::SUCCESS, 'error' => [], 'peek_type' => PeekType::LISTING, 'peek_text' => peek_text, 'nested_items' => items}
+    s3_path = 'messages/test-zip.json'
+    @s3.expect(:put_object, [], [{body: return_value.to_json, bucket: Settings.aws.s3.json_bucket, key: s3_path}])
+    return_value = {'bucket_name' => 'test-bucket', 'object_key' => s3_path, 's3_status' => ExtractionStatus::SUCCESS, 'error' => []}
+    @sqs.expect(:send_message, nil, [{queue_url: Settings.aws.sqs.queue_url,
+                                      message_body: return_value.to_json,
+                                      message_attributes:{}}])
+
+    # test
+    @archive_extractor.extract
+
+    # verify
+    assert_mock(@s3)
+    assert_mock(@sqs)
+  end
+
+  def test_get_object
+    # setup
+    local_path = 'test/path'
+    @s3.expect(:get_object, nil, [{response_target: local_path, bucket: @archive_extractor.bucket_name,
+                                                    key: @archive_extractor.object_key}])
+    # test
+    error = @archive_extractor.get_object(local_path, [])
+
+    # verify
+    assert_mock(@s3)
+    assert_empty(error)
+  end
+
+  def test_get_object_error
+    # setup: the stub callable receives get_object's arguments, so it must accept (and ignore) them
+    stub_s3 = Aws::S3::Client.new(region: Settings.aws.region)
+    @archive_extractor.s3 = stub_s3
+    local_path = "test/path"
+    raises_exception = ->(*_args) { raise StandardError }
+
+    # test and verify
+    stub_s3.stub :get_object, raises_exception do
+      error = @archive_extractor.get_object(local_path, [])
+      assert(error.first.value?(ErrorType::S3_GET))
+    end
+  end
+
+  def test_perform_extraction
+    # setup
+    binary_name = 'test.zip'
+    web_id = 'test-zip'
+    mime_type = 'application/zip'
+    local_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+    extraction = Extraction.new(binary_name, local_path, web_id, mime_type)
+
+    #test
+    return_value = @archive_extractor.perform_extraction(extraction, [])
+
+    # verify
+    assert(return_value.value?(PeekType::LISTING))
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test.zip<div class='indent'><span class='glyphicon glyphicon-file'></span> test.txt</div>"
+    assert(return_value.value?(exp_peek_text))
+
+  end
+
+  def test_perform_extraction_error
+    # setup
+    binary_name = 'test.zip'
+    web_id = 'test-zip'
+    mime_type = 'application/zip'
+    local_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+    stub_extraction = Extraction.new(binary_name, local_path, web_id, mime_type)
+    raises_exception = -> { raise StandardError.new }
+
+    # test and verify
+    stub_extraction.stub :process, raises_exception do
+      return_value = @archive_extractor.perform_extraction(stub_extraction, [])
+      assert(return_value.value?(PeekType::NONE))
+      assert(return_value.value?(ExtractionStatus::ERROR))
+    end
+  end
+
+  def test_send_sqs_message
+    # setup
+    return_value = {'test' => 'retVal'}
+    @sqs.expect(:send_message, nil, [{queue_url: Settings.aws.sqs.queue_url,
+                                                      message_body: return_value.to_json,
+                                                      message_attributes:{}}])
+
+    # test
+    @archive_extractor.send_sqs_message(return_value)
+
+    # verify
+    assert_mock(@sqs)
+  end
+
+  def test_put_json_response
+    # setup
+    return_value = {'test' => 'retVal'}
+    s3_path = 'test/s3/key'
+    @s3.expect(:put_object, nil, [{body: return_value.to_json, bucket: Settings.aws.s3.json_bucket, key: s3_path}])
+
+    # test
+    s3_put_status, s3_put_error = @archive_extractor.put_json_response(return_value, s3_path)
+
+    # verify
+    assert_mock(@s3)
+    assert_equal(ExtractionStatus::SUCCESS, s3_put_status)
+    assert_empty(s3_put_error)
+  end
+
+  def test_put_json_response_error
+    # setup: the stub callable receives put_object's arguments, so it must accept (and ignore) them
+    return_value = {'test' => 'error'}
+    s3_path = 'test/s3/error'
+    stub_s3 = Aws::S3::Client.new(region: Settings.aws.region)
+    @archive_extractor.s3 = stub_s3
+    raises_exception = ->(*_args) { raise StandardError }
+
+    # test and verify
+    stub_s3.stub :put_object, raises_exception do
+      s3_put_status, s3_put_error = @archive_extractor.put_json_response(return_value, s3_path)
+      assert_equal(ExtractionStatus::ERROR, s3_put_status)
+      refute_empty(s3_put_error)
+    end
+  end
+end
+
diff --git a/test/extraction_test.rb b/test/extraction_test.rb
new file mode 100644
index 0000000..7af66de
--- /dev/null
+++ b/test/extraction_test.rb
@@ -0,0 +1,356 @@
+# frozen_string_literal: true
+require_relative 'test_helper'
+
+class TestExtraction < Minitest::Test
+  Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test'))
+  def setup
+    binary_name = 'test-binary'
+    web_id = 'test-id'
+    storage_path = "#{Settings.aws.efs.mount_point}test-bucket_#{web_id}/test-key"
+    mime_type = 'application/zip'
+    @extraction = Extraction.new(binary_name, storage_path, web_id, mime_type)
+  end
+
+  def test_process
+    # setup: point the extraction at the gzip fixture under RUBY_HOME/test
+    @extraction.binary_name = 'test.txt.gz'
+    @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz"
+    @extraction.id = 'test-gzip'
+    @extraction.mime_type = 'application/gzip'
+
+    # test
+    @extraction.process
+
+    # verify: processing reports SUCCESS and a listing-type peek
+    assert_equal(ExtractionStatus::SUCCESS, @extraction.status)
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+  end
+
+  def test_report_problem
+    # setup
+    report = 'Test report'
+
+    # test
+    @extraction.report_problem(report)
+
+    # verify: the report is recorded as an EXTRACTION-type entry in the error list
+    error = @extraction.error
+    assert_includes(error, {'error_type' => ErrorType::EXTRACTION, 'report' => report})
+  end
+
+  def test_extract_features_gzip
+    # setup
+    @extraction.binary_name = 'test.txt.gz'
+    @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz"
+    @extraction.id = 'test-gzip'
+    @extraction.mime_type = 'application/gzip'
+
+    # test
+    @extraction.extract_features
+
+    # verify
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test.txt.gz<div class='indent'><span class='glyphicon glyphicon-file'></span> testing\n</div>"
+    assert_equal(exp_peek_text, @extraction.peek_text)
+  end
+
+  def test_extract_features_zip
+    # setup
+    @extraction.binary_name = 'test.zip'
+    @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+    @extraction.id = 'test-zip'
+    @extraction.mime_type = 'application/zip'
+
+    # test
+    @extraction.extract_features
+
+    # verify
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test.zip<div class='indent'><span class='glyphicon glyphicon-file'></span> test.txt</div>"
+    assert_equal(exp_peek_text, @extraction.peek_text)
+  end
+
+  def test_extract_features_default
+    # setup
+    @extraction.binary_name = 'test'
+    @extraction.storage_path = "#{ENV['RUBY_HOME']}/test"
+    @extraction.id = 'test-default'
+    @extraction.mime_type = 'application/directory'
+
+    # test
+    @extraction.extract_features
+
+    # verify
+    assert_equal(PeekType::NONE, @extraction.peek_type)
+  end
+
+  def test_mime_from_path
+    # setup
+    ruby_path = "#{ENV['RUBY_HOME']}/bin/set-test-vars.rb"
+
+    # test
+    ruby_mime = @extraction.mime_from_path(ruby_path)
+
+    # verify
+    assert_equal('application/x-ruby', ruby_mime)
+  end
+
+  def test_mime_from_filename
+    # setup
+    zip_filename = 'test.zip'
+
+    # test
+    zip_mime = @extraction.mime_from_filename(zip_filename)
+
+    # verify
+    assert_equal('application/zip', zip_mime)
+  end
+
+  def test_create_item
+    # setup: attributes describing a single directory entry
+    item_path = 'test/item/path/thing'
+    item_name = 'thing'
+    item_size = 123
+    media_type = 'directory'
+    is_directory = true
+
+    # test
+    @extraction.create_item(item_path, item_name, item_size, media_type, is_directory)
+
+    # verify: a hash with exactly these attributes was appended to nested_items
+    nested_items = @extraction.nested_items
+    assert_includes(nested_items, {'item_name' => item_name, 'item_path' => item_path, 'item_size' => item_size,
+                                   'media_type' => media_type, 'is_directory' => is_directory})
+  end
+
+  def test_extract_zip
+    # setup
+    @extraction.binary_name = 'test.zip'
+    @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+    @extraction.id = 'test-zip'
+    @extraction.mime_type = 'application/zip'
+
+    # test
+    @extraction.extract_zip
+
+    # verify
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test.zip<div class='indent'><span class='glyphicon glyphicon-file'></span> test.txt</div>"
+    assert_equal(exp_peek_text, @extraction.peek_text)
+  end
+
+  def test_extract_archive
+    # setup
+    @extraction.binary_name = 'test.tar'
+    @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.tar"
+    @extraction.id = 'test-tar'
+    @extraction.mime_type = 'application/x-tar'
+    @extraction.peek_type = nil
+
+    # test
+    @extraction.extract_archive
+
+    # verify
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test.tar<div class='indent'><span class='glyphicon glyphicon-file'></span> test.txt</div>"
+    assert_equal(exp_peek_text, @extraction.peek_text)
+  end
+
+  def test_extract_gzip
+    # setup
+    @extraction.binary_name = 'test.txt.gz'
+    @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz"
+    @extraction.id = 'test-gzip'
+    @extraction.mime_type = 'application/gzip'
+
+    # test
+    @extraction.extract_gzip
+
+    # verify
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test.txt.gz<div class='indent'><span class='glyphicon glyphicon-file'></span> testing\n</div>"
+    assert_equal(exp_peek_text, @extraction.peek_text)
+  end
+
+  def test_extract_entry
+    # setup: a mock archive entry that reports itself as a 123-byte non-directory
+    mock_entry = Minitest::Mock.new
+    entry_name = "#{ENV['RUBY_HOME']}/bin/set-test-vars.rb"
+    type = ExtractionType::GZIP
+    mock_entry.expect(:directory?, false)
+    mock_entry.expect(:size, 123)
+
+    # test
+    entry_paths = @extraction.extract_entry(mock_entry, entry_name, [], type)
+
+    # verify: the entry path is collected and a matching nested item is recorded
+    assert_mock(mock_entry)
+    assert_includes(entry_paths, entry_name)
+    expect_item = {'item_name' => 'set-test-vars.rb', 'item_path' => entry_name, 'item_size' => 123,
+                   'media_type' => 'application/x-ruby', 'is_directory' => false}
+    assert_includes(@extraction.nested_items, expect_item)
+
+  end
+
+  def test_handle_entry_paths
+    # setup
+    entry_paths = ['test/path']
+
+    # test
+    resp = @extraction.handle_entry_paths(entry_paths)
+
+    # verify
+    assert(resp)
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test-binary<div class='indent'><div class='indent'><span class='glyphicon glyphicon-file'></span> path</div></div>"
+    assert_equal(exp_peek_text, @extraction.peek_text)
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+  end
+
+  def test_handle_entry_paths_empty
+    # setup: an archive that produced no entries
+    entry_paths = []
+
+    # test
+    resp = @extraction.handle_entry_paths(entry_paths)
+
+    # verify: returns false, sets no peek, and records an EXTRACTION error naming the task id
+    assert_equal(false, resp)
+    assert_equal(PeekType::NONE, @extraction.peek_type)
+    assert_includes(@extraction.error, {'error_type' => ErrorType::EXTRACTION,
+                                        'report' => "no items found for archive listing for task #{@extraction.id}"})
+  end
+
+  def test_extract_default
+    # test
+    @extraction.extract_default
+    # verify
+    peek_type = @extraction.peek_type
+    assert_equal(PeekType::NONE, peek_type)
+  end
+
+  def test_valid_entry_path
+    # setup
+    valid_path = 'test/path'
+    invalid_path = ""
+
+    # test: a plain path, the same path with a trailing slash, and an empty string
+    path = @extraction.valid_entry_path(valid_path)
+    path_slash = @extraction.valid_entry_path("#{valid_path}/")
+    path_nil = @extraction.valid_entry_path(invalid_path)
+
+    # verify: the trailing slash is expected to be dropped and the empty string to yield nil
+    assert_equal(valid_path, path)
+    assert_equal(valid_path, path_slash)
+    assert_nil(path_nil)
+  end
+
+  def test_is_directory
+    # setup: RUBY_HOME plus a plain path, a trailing-slash path, a '._' path and a .DS_Store path
+    ruby_home = ENV['RUBY_HOME']
+    object_path = 'test/path'
+    slash_path = 'test/path/'
+    mac_path = 'this/is/a/mac/._path'
+    ds_store_path = 'test/path/.DS_Store'
+
+    # test
+    ruby_home_dir = @extraction.is_directory(ruby_home)
+    object_path_dir = @extraction.is_directory(object_path)
+    slash_path_dir = @extraction.is_directory(slash_path)
+    mac_path_dir = @extraction.is_directory(mac_path)
+    ds_store_path_dir = @extraction.is_directory(ds_store_path)
+
+    # verify: only RUBY_HOME and the trailing-slash path are expected to count as directories
+    assert_equal(true, ruby_home_dir)
+    assert_equal(true, slash_path_dir)
+    assert_equal(false, object_path_dir)
+    assert_equal(false, mac_path_dir)
+    assert_equal(false, ds_store_path_dir)
+  end
+
+  def test_is_mac_thing
+    # setup
+    mac_path = 'this/is/a/mac/path/__MACOSX'
+    path = 'this/is/not/a/mac/path'
+    # test
+    mac = @extraction.is_mac_thing(mac_path)
+    not_mac = @extraction.is_mac_thing(path)
+    # verify
+    assert_equal(true, mac)
+    assert_equal(false, not_mac)
+  end
+
+  def test_is_mac_tar_thing
+    # setup
+    mac_path = 'this/is/a/mac/._path'
+    paxheader_mac_path = 'PaxHeader/this/is/a/mac/path'
+    longlink_mac_path = 'this/is/a/mac/path/@LongLink'
+    path = 'this/is/not/a/mac/path'
+    # test
+    mac_underscore = @extraction.is_mac_tar_thing(mac_path)
+    mac_paxheader = @extraction.is_mac_tar_thing(paxheader_mac_path)
+    mac_longlink = @extraction.is_mac_tar_thing(longlink_mac_path)
+    not_mac = @extraction.is_mac_tar_thing(path)
+    # verify
+    assert_equal(true, mac_underscore)
+    assert_equal(true, mac_paxheader)
+    assert_equal(true, mac_longlink)
+    assert_equal(false, not_mac)
+  end
+
+  def test_ends_in_slash
+    # setup
+    path_ends_in_slash = 'test/path/'
+    path_does_not_end_in_slash = 'test/path'
+
+    # test
+    ends_in_slash = @extraction.ends_in_slash(path_ends_in_slash)
+    does_not_end_in_slash = @extraction.ends_in_slash(path_does_not_end_in_slash)
+
+    # verify
+    assert_equal(true, ends_in_slash)
+    assert_equal(false, does_not_end_in_slash)
+  end
+
+  def test_is_ds_store
+    # setup
+    ds_store_path = 'test/path/.DS_Store'
+    path = 'test/path'
+
+    # test
+    ds_store = @extraction.is_ds_store(ds_store_path)
+    not_ds_store = @extraction.is_ds_store(path)
+
+    # verify
+    assert_equal(true, ds_store)
+    assert_equal(false, not_ds_store)
+  end
+
+  def test_name_part
+    # setup
+    path = 'test/path'
+    name = 'test'
+    invalid_path = ""
+
+    # test
+    path_name = @extraction.name_part(path)
+    test_name = @extraction.name_part(name)
+    invalid_name = @extraction.name_part(invalid_path)
+
+    # verify
+    assert_equal('path', path_name)
+    assert_equal('test', test_name)
+    assert_nil(invalid_name)
+  end
+
+  def test_entry_paths_arr_to_html
+    # setup
+    entry_paths = ['test/path']
+
+    # test
+    return_string = @extraction.entry_paths_arr_to_html(entry_paths)
+
+    # verify
+    exp_peek_text = "<span class='glyphicon glyphicon-folder-open'></span> test-binary<div class='indent'><div class='indent'><span class='glyphicon glyphicon-file'></span> path</div></div>"
+    assert_equal(exp_peek_text, return_string)
+  end
+end
diff --git a/test/test.tar b/test/test.tar
new file mode 100644
index 0000000000000000000000000000000000000000..95dc7f68c459dda5ffd483192a2ebf9466d22ba4
GIT binary patch
literal 2048
zcmeH^OAdrE2!^}%6r8|7OWX4pqmBz*SWM>lLu}&4%(96K*zgFDpAVUe^zs$g&=4^(
zgHxYVzC8MxYe|h5h-08kJ~9DPh$%S0hMMlxXOZDhX-gSD+bVxSXxlvBQ_rAYQjbG~
dXTv)yqW@p_1;~f)oeE8r5ikNqzzDP;@B_ZL8-xG=

literal 0
HcmV?d00001

diff --git a/test/test.txt.gz b/test/test.txt.gz
new file mode 100644
index 0000000000000000000000000000000000000000..4f821bca15aaf64a3b012c94d5250ab42f815c0d
GIT binary patch
literal 37
tcmb2|=HPg9JSCNZxg@o?M6aZxghAU=SMTH*@8?Vme*$7|b1*P4008)4466VD

literal 0
HcmV?d00001

diff --git a/test/test.zip b/test/test.zip
new file mode 100644
index 0000000000000000000000000000000000000000..d048dc0c6b971c347314fb3484e3520cb098f0c6
GIT binary patch
literal 178
zcmWIWW@h1H0D%X29l>lNAIf-uY!K#PkYOlEEiTb3sVE5z;bdT5BE2#3v8s7;X$3a}
zBg=P21_l-ppgNG6%)E33LnC9Z0B=SnIc8kuNdQ$dFaY&2ENKL>5T>(2Oh+?0z?+o~
Pq>d2?{eUzGGcW)E3K=0p

literal 0
HcmV?d00001

diff --git a/test/test_helper.rb b/test/test_helper.rb
new file mode 100644
index 0000000..ec88121
--- /dev/null
+++ b/test/test_helper.rb
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+require 'simplecov'
+SimpleCov.start
+
+require 'minitest/autorun'
+require 'config'
+require 'csv'
+require 'json'
+require_relative '../lib/archive_extractor'
+require_relative '../lib/extractor'
+require_relative '../lib/extractor/error_type'
+require_relative '../lib/extractor/extraction'
+require_relative '../lib/extractor/extraction_status'
+require_relative '../lib/extractor/extraction_type'
+require_relative '../lib/extractor/mime_type'
+require_relative '../lib/extractor/peek_type'
\ No newline at end of file

From b007740f74b9249035df11954274e240257583c8 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Thu, 30 Nov 2023 14:57:47 -0700
Subject: [PATCH 16/20] added github action

---
 .github/workflows/test.yml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 .github/workflows/test.yml

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..e7555c0
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,20 @@
+on:
+  push:
+    branches:
+    - test
+    - demo
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout the repository
+        uses: actions/checkout@v4
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./docker/extractor/Dockerfile-test
+          tags: extractor-test
\ No newline at end of file

From 33e9f2cffc8ae6b8233dcc174973130fcc55cb40 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Thu, 30 Nov 2023 15:01:15 -0700
Subject: [PATCH 17/20] updated test dockerfile

---
 docker/extractor/Dockerfile-test | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test
index 8baa77c..5b209d0 100644
--- a/docker/extractor/Dockerfile-test
+++ b/docker/extractor/Dockerfile-test
@@ -24,4 +24,5 @@ RUN gem install bundler && bundle install
 # Copy the main application, except whatever is listed in .dockerignore.
 COPY . ./
 
-CMD ["rake", "test"]
\ No newline at end of file
+RUN rake test
+#CMD ["rake", "test"]
\ No newline at end of file

From d5c61b0b14818c427bf39bf5d6198374c5e5cd6e Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Fri, 1 Dec 2023 11:00:32 -0700
Subject: [PATCH 18/20] updated to add prod values and fix log statements

---
 .github/workflows/test.yml           | 3 ++-
 .idea/databank-archive-extractor.iml | 2 ++
 Gemfile.lock                         | 4 ++++
 config/settings/prod.yml             | 8 ++++++++
 docker/extractor/Dockerfile-test     | 4 ++--
 lib/archive_extractor.rb             | 8 +++++---
 6 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e7555c0..e3b181f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -17,4 +17,5 @@ jobs:
         with:
           context: .
           file: ./docker/extractor/Dockerfile-test
-          tags: extractor-test
\ No newline at end of file
+          tags: extractor-test
+          platforms: linux/arm64
\ No newline at end of file
diff --git a/.idea/databank-archive-extractor.iml b/.idea/databank-archive-extractor.iml
index 7a849e6..cc5e1e8 100644
--- a/.idea/databank-archive-extractor.iml
+++ b/.idea/databank-archive-extractor.iml
@@ -29,6 +29,8 @@
     <orderEntry type="library" scope="PROVIDED" name="dry-schema (v1.13.3, ruby-3.1.2-p20) [gem]" level="application" />
     <orderEntry type="library" scope="PROVIDED" name="dry-types (v1.7.1, ruby-3.1.2-p20) [gem]" level="application" />
     <orderEntry type="library" scope="PROVIDED" name="dry-validation (v1.10.0, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="ffi (v1.16.3, ruby-3.1.2-p20) [gem]" level="application" />
+    <orderEntry type="library" scope="PROVIDED" name="ffi-libarchive (v1.1.13, ruby-3.1.2-p20) [gem]" level="application" />
     <orderEntry type="library" scope="PROVIDED" name="jmespath (v1.6.2, ruby-3.1.2-p20) [gem]" level="application" />
     <orderEntry type="library" scope="PROVIDED" name="mime-types (v3.5.1, ruby-3.1.2-p20) [gem]" level="application" />
     <orderEntry type="library" scope="PROVIDED" name="mime-types-data (v3.2023.1003, ruby-3.1.2-p20) [gem]" level="application" />
diff --git a/Gemfile.lock b/Gemfile.lock
index d6d7b8e..033c25b 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -58,6 +58,9 @@ GEM
       dry-initializer (~> 3.0)
       dry-schema (>= 1.12, < 2)
       zeitwerk (~> 2.6)
+    ffi (1.16.3)
+    ffi-libarchive (1.1.13)
+      ffi (~> 1.0)
     jmespath (1.6.2)
     mime-types (3.5.1)
       mime-types-data (~> 3.2015)
@@ -87,6 +90,7 @@ DEPENDENCIES
   aws-sdk-s3
   aws-sdk-sqs
   config
+  ffi-libarchive
   mime-types
   mimemagic (~> 0.3.6)
   minitest
diff --git a/config/settings/prod.yml b/config/settings/prod.yml
index e69de29..40c24e7 100644
--- a/config/settings/prod.yml
+++ b/config/settings/prod.yml
@@ -0,0 +1,8 @@
+aws:
+    efs:
+        mount_point: "/mnt/efs/"
+    sqs:
+        queue_name: "extractor-to-databank-prod"
+        queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-prod"
+    s3:
+        json_bucket: "databank-main"
\ No newline at end of file
diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test
index 5b209d0..f5ea02c 100644
--- a/docker/extractor/Dockerfile-test
+++ b/docker/extractor/Dockerfile-test
@@ -1,5 +1,5 @@
-FROM ruby:3.1.2
-#FROM --platform=linux/arm64 ruby:3.1.2
+FROM --platform=linux/arm64 ruby:3.1.2
+#FROM ruby:3.1.2
 
 ENV RAILS_ENV=test
 ENV RAILS_LOG_TO_STDOUT=true
diff --git a/lib/archive_extractor.rb b/lib/archive_extractor.rb
index 35c411d..094aa02 100644
--- a/lib/archive_extractor.rb
+++ b/lib/archive_extractor.rb
@@ -110,17 +110,19 @@ def send_sqs_message(return_value)
 
   def put_json_response(return_value, s3_path)
     s3_put_error = []
+    json_bucket = Settings.aws.s3.json_bucket
     begin
       @s3.put_object({
                        body: return_value.to_json,
-                       bucket: Settings.aws.s3.json_bucket,
+                       bucket: json_bucket,
                        key: s3_path,
                      })
-      LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name} with key #{s3_path}")
+      LOGGER.info(return_value.to_json)
+      LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket} with key #{s3_path}")
       s3_put_status = ExtractionStatus::SUCCESS
     rescue StandardError => e
       s3_put_status = ExtractionStatus::ERROR
-      s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{@bucket_name}: #{e.message}"
+      s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket}: #{e.message}"
       s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message})
       LOGGER.error(s3_put_error_message)
     end

From c664c3f3d27a1e28c9803133e620daa317b3decc Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Fri, 1 Dec 2023 13:32:27 -0700
Subject: [PATCH 19/20] updated dockerfiles

---
 Dockerfile                       | 7 ++++---
 Gemfile                          | 2 --
 docker/extractor/Dockerfile-test | 6 +++---
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 4f10663..774004e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,10 @@
 # N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version.
 FROM ruby:3.1.2
 
-ENV RAILS_ENV=production
+ENV RUBY_ENV=demo
 ENV RAILS_LOG_TO_STDOUT=true
 ENV RAILS_SERVE_STATIC_FILES=true
+ENV RUBY_HOME=/extractor
 
 RUN apt-get update && apt-get install -y \
   build-essential \
@@ -11,9 +12,9 @@ RUN apt-get update && apt-get install -y \
   libpq-dev \
   libarchive-dev
 
-RUN mkdir app
-WORKDIR app
 
+RUN mkdir extractor
+WORKDIR extractor
 
 # Copy the Gemfile as well as the Gemfile.lock and install gems.
 # This is a separate step so the dependencies will be cached.
diff --git a/Gemfile b/Gemfile
index 2013c04..0b08bd5 100644
--- a/Gemfile
+++ b/Gemfile
@@ -10,8 +10,6 @@ gem 'rubyzip'
 gem 'config'
 
 # Use archive for non-zip archive files
-# gem 'libarchive'
-# gem 'libarchive-ruby'
 gem 'ffi-libarchive'
 
 # Use os to interact with operating system
diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test
index f5ea02c..0ca4179 100644
--- a/docker/extractor/Dockerfile-test
+++ b/docker/extractor/Dockerfile-test
@@ -1,7 +1,7 @@
 FROM --platform=linux/arm64 ruby:3.1.2
 #FROM ruby:3.1.2
 
-ENV RAILS_ENV=test
+ENV RUBY_ENV=test
 ENV RAILS_LOG_TO_STDOUT=true
 ENV RUBY_HOME=/extractor
 ENV IS_DOCKER=true
@@ -24,5 +24,5 @@ RUN gem install bundler && bundle install
 # Copy the main application, except whatever is listed in .dockerignore.
 COPY . ./
 
-RUN rake test
-#CMD ["rake", "test"]
\ No newline at end of file
+#RUN rake test
+CMD ["rake", "test"]
\ No newline at end of file

From 8ff3e480df1595ae37300481ece191c0344639c2 Mon Sep 17 00:00:00 2001
From: Gen Schmitt <gschmitt@illinois.edu>
Date: Mon, 4 Dec 2023 10:01:27 -0700
Subject: [PATCH 20/20] updated to remove demo configuration

---
 Dockerfile  | 2 +-
 ecr-push.sh | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 774004e..6fd469e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 # N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version.
 FROM ruby:3.1.2
 
-ENV RUBY_ENV=demo
+ENV RUBY_ENV=prod
 ENV RAILS_LOG_TO_STDOUT=true
 ENV RAILS_SERVE_STATIC_FILES=true
 ENV RUBY_HOME=/extractor
diff --git a/ecr-push.sh b/ecr-push.sh
index 9ac4826..1488dd0 100755
--- a/ecr-push.sh
+++ b/ecr-push.sh
@@ -4,6 +4,6 @@
 #
 
 aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 721945215539.dkr.ecr.us-east-2.amazonaws.com
-docker build -t databank-archive-extractor-demo .
-docker tag databank-archive-extractor-demo:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest
-docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-demo:latest
+docker build -t databank-archive-extractor-prod .
+docker tag databank-archive-extractor-prod:latest 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-prod:latest
+docker push 721945215539.dkr.ecr.us-east-2.amazonaws.com/databank-archive-extractor-prod:latest