diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..525771d
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,28 @@
+# Builds the linux/arm64 test image from docker/extractor/Dockerfile-test on
+# every push to test, demo or prod. The image is built only; nothing is pushed.
+on:
+  push:
+    branches:
+      - test
+      - demo
+      - prod
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout the repository
+        uses: actions/checkout@v4
+      # The hosted runner is x86_64; QEMU emulation is required to execute the
+      # RUN steps of an arm64 image during the build.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./docker/extractor/Dockerfile-test
+          tags: extractor-test
+          platforms: linux/arm64
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 9106b2a..fe3b9cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
/pkg/
/spec/reports/
/tmp/
+/.idea/
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..73f69e0
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
diff --git a/.idea/databank-archive-extractor.iml b/.idea/databank-archive-extractor.iml
new file mode 100644
index 0000000..f279c8f
--- /dev/null
+++ b/.idea/databank-archive-extractor.iml
@@ -0,0 +1,49 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..b0db9b0
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..1f18249
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..1127073
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.ruby-version b/.ruby-version
new file mode 100644
index 0000000..ef538c2
--- /dev/null
+++ b/.ruby-version
@@ -0,0 +1 @@
+3.1.2
diff --git a/Dockerfile b/Dockerfile
index 2de759a..6fd469e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,10 @@
# N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version.
-FROM ruby:2.7.2
+FROM ruby:3.1.2
-ENV RAILS_ENV=production
+ENV RUBY_ENV=prod
ENV RAILS_LOG_TO_STDOUT=true
ENV RAILS_SERVE_STATIC_FILES=true
+ENV RUBY_HOME=/extractor
RUN apt-get update && apt-get install -y \
build-essential \
@@ -11,22 +12,18 @@ RUN apt-get update && apt-get install -y \
libpq-dev \
libarchive-dev
-RUN mkdir app
-WORKDIR app
+RUN mkdir extractor
+WORKDIR extractor
# Copy the Gemfile as well as the Gemfile.lock and install gems.
# This is a separate step so the dependencies will be cached.
COPY Gemfile Gemfile.lock ./
-RUN gem install bundler && bundle install --without development test --jobs 20 --retry 5
+RUN gem install bundler && bundle install
# Copy the main application, except whatever is listed in .dockerignore.
COPY . ./
-#RUN bin/rails assets:precompile
-
-EXPOSE 3000
-
# This is the web server entry point. It will need to be overridden when
# running the workers.
CMD ["echo", "Error running task, please check the container override command!"]
diff --git a/Gemfile b/Gemfile
index 73c0c87..0b08bd5 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,16 +1,16 @@
source "https://rubygems.org"
git_source(:github) { |repo| "https://github.com/#{repo}.git" }
-ruby '2.7.2'
-
# Use mime-types to determine mimetypes based on extension
gem 'mime-types', require: 'mime/types/full'
# Use rubyzip to read zip files
gem 'rubyzip'
+gem 'config'
+
# Use archive for non-zip archive files
-gem 'libarchive'
+gem 'ffi-libarchive'
# Use os to interact with operating system
gem 'os'
@@ -24,3 +24,7 @@ gem "aws-sdk-s3"
gem "aws-sdk-sqs"
+gem 'minitest'
+
+gem 'simplecov'
+
diff --git a/Gemfile.lock b/Gemfile.lock
index c50f5a1..a291f6f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,53 +1,103 @@
GEM
remote: https://rubygems.org/
specs:
- aws-eventstream (1.1.0)
- aws-partitions (1.416.0)
- aws-sdk-core (3.111.0)
- aws-eventstream (~> 1, >= 1.0.2)
- aws-partitions (~> 1, >= 1.239.0)
- aws-sigv4 (~> 1.1)
- jmespath (~> 1.0)
- aws-sdk-kms (1.41.0)
- aws-sdk-core (~> 3, >= 3.109.0)
+ aws-eventstream (1.3.0)
+ aws-partitions (1.862.0)
+ aws-sdk-core (3.190.0)
+ aws-eventstream (~> 1, >= 1.3.0)
+ aws-partitions (~> 1, >= 1.651.0)
+ aws-sigv4 (~> 1.8)
+ jmespath (~> 1, >= 1.6.1)
+ aws-sdk-kms (1.74.0)
+ aws-sdk-core (~> 3, >= 3.188.0)
aws-sigv4 (~> 1.1)
- aws-sdk-s3 (1.87.0)
- aws-sdk-core (~> 3, >= 3.109.0)
+ aws-sdk-s3 (1.141.0)
+ aws-sdk-core (~> 3, >= 3.189.0)
aws-sdk-kms (~> 1)
+ aws-sigv4 (~> 1.8)
+ aws-sdk-sqs (1.69.0)
+ aws-sdk-core (~> 3, >= 3.188.0)
aws-sigv4 (~> 1.1)
- aws-sdk-sqs (1.35.0)
- aws-sdk-core (~> 3, >= 3.109.0)
- aws-sigv4 (~> 1.1)
- aws-sigv4 (1.2.2)
+ aws-sigv4 (1.8.0)
aws-eventstream (~> 1, >= 1.0.2)
- jmespath (1.4.0)
- mime-types (3.3.1)
+ concurrent-ruby (1.2.2)
+ config (5.0.0)
+ deep_merge (~> 1.2, >= 1.2.1)
+ dry-validation (~> 1.0, >= 1.0.0)
+ deep_merge (1.2.2)
+ docile (1.4.0)
+ dry-configurable (1.1.0)
+ dry-core (~> 1.0, < 2)
+ zeitwerk (~> 2.6)
+ dry-core (1.0.1)
+ concurrent-ruby (~> 1.0)
+ zeitwerk (~> 2.6)
+ dry-inflector (1.0.0)
+ dry-initializer (3.1.1)
+ dry-logic (1.5.0)
+ concurrent-ruby (~> 1.0)
+ dry-core (~> 1.0, < 2)
+ zeitwerk (~> 2.6)
+ dry-schema (1.13.3)
+ concurrent-ruby (~> 1.0)
+ dry-configurable (~> 1.0, >= 1.0.1)
+ dry-core (~> 1.0, < 2)
+ dry-initializer (~> 3.0)
+ dry-logic (>= 1.4, < 2)
+ dry-types (>= 1.7, < 2)
+ zeitwerk (~> 2.6)
+ dry-types (1.7.1)
+ concurrent-ruby (~> 1.0)
+ dry-core (~> 1.0)
+ dry-inflector (~> 1.0)
+ dry-logic (~> 1.4)
+ zeitwerk (~> 2.6)
+ dry-validation (1.10.0)
+ concurrent-ruby (~> 1.0)
+ dry-core (~> 1.0, < 2)
+ dry-initializer (~> 3.0)
+ dry-schema (>= 1.12, < 2)
+ zeitwerk (~> 2.6)
+ ffi (1.16.3)
+ ffi-libarchive (1.1.13)
+ ffi (~> 1.0)
+ jmespath (1.6.2)
+ mime-types (3.5.1)
mime-types-data (~> 3.2015)
- mime-types-data (3.2020.1104)
+ mime-types-data (3.2023.1003)
mimemagic (0.3.10)
nokogiri (~> 1)
rake
- nokogiri (1.11.2-x86_64-darwin)
+ minitest (5.20.0)
+ nokogiri (1.15.5-x86_64-darwin)
racc (~> 1.4)
- os (1.1.1)
- racc (1.5.2)
- rake (13.0.3)
- rubyzip (2.3.0)
+ os (1.1.4)
+ racc (1.7.3)
+ rake (13.1.0)
+ rubyzip (2.3.2)
+ simplecov (0.22.0)
+ docile (~> 1.1)
+ simplecov-html (~> 0.11)
+ simplecov_json_formatter (~> 0.1)
+ simplecov-html (0.12.3)
+ simplecov_json_formatter (0.1.4)
+ zeitwerk (2.6.12)
PLATFORMS
- x86_64-darwin-19
+ x86_64-darwin-21
DEPENDENCIES
aws-sdk-s3
aws-sdk-sqs
+ config
+ ffi-libarchive
mime-types
mimemagic (~> 0.3.6)
+ minitest
os
rake (~> 13.0)
rubyzip
-
-RUBY VERSION
- ruby 2.7.2p137
+ simplecov
BUNDLED WITH
- 2.2.4
+ 2.3.22
diff --git a/Rakefile b/Rakefile
index cd510a0..0abb012 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,4 +1,11 @@
# frozen_string_literal: true
-require "bundler/gem_tasks"
-task default: %i[]
+require 'rake/testtask'
+require 'simplecov'
+require_relative 'bin/set-test-vars'
+
+# `rake test` runs every test/*_test.rb with lib/ and test/ on the load path.
+Rake::TestTask.new(:test) do |test_task|
+ test_task.libs.push('lib', 'test')
+ test_task.test_files = FileList['test/*_test.rb']
+end
diff --git a/bin/console b/bin/console
deleted file mode 100755
index 8096e50..0000000
--- a/bin/console
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env ruby
-# frozen_string_literal: true
-
-require "bundler/setup"
-require "databank/archive/extractor"
-
-# You can add fixtures and/or initialization code here to make experimenting
-# with your gem easier. You can also use a different console, if you like.
-
-# (If you use this, don't forget to add pry to your Gemfile!)
-# require "pry"
-# Pry.start
-
-require "irb"
-IRB.start(__FILE__)
diff --git a/bin/set-test-vars.rb b/bin/set-test-vars.rb
new file mode 100644
index 0000000..7959c7c
--- /dev/null
+++ b/bin/set-test-vars.rb
@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby
+
+# Environment for `rake test` (loaded by the Rakefile).
+# RUBY_HOME is the checkout root: fixed to /extractor when IS_DOCKER=true,
+# otherwise derived from this file's location so it works on any machine.
+ENV['RUBY_ENV'] = 'test'
+ENV['RUBY_HOME'] = ENV['IS_DOCKER'] == 'true' ? '/extractor' : File.expand_path('..', __dir__)
+ENV['RUBY_TEST_HOME'] = "#{ENV['RUBY_HOME']}/test"
diff --git a/bin/setup b/bin/setup
deleted file mode 100755
index dce67d8..0000000
--- a/bin/setup
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-IFS=$'\n\t'
-set -vx
-
-bundle install
-
-# Do any other automated setup that you need to do here
diff --git a/config/settings.yml b/config/settings.yml
new file mode 100644
index 0000000..ed17934
--- /dev/null
+++ b/config/settings.yml
@@ -0,0 +1,3 @@
+aws:
+ region: "us-east-2"
+
diff --git a/config/settings/demo.yml b/config/settings/demo.yml
new file mode 100644
index 0000000..18367ed
--- /dev/null
+++ b/config/settings/demo.yml
@@ -0,0 +1,8 @@
+aws:
+ efs:
+ mount_point: "/mnt/efs/"
+ sqs:
+ queue_name: "extractor-to-databank-demo"
+ queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-demo"
+ s3:
+ json_bucket: "databank-demo-main"
diff --git a/config/settings/prod.yml b/config/settings/prod.yml
new file mode 100644
index 0000000..40c24e7
--- /dev/null
+++ b/config/settings/prod.yml
@@ -0,0 +1,8 @@
+aws:
+ efs:
+ mount_point: "/mnt/efs/"
+ sqs:
+ queue_name: "extractor-to-databank-prod"
+ queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-prod"
+ s3:
+ json_bucket: "databank-main"
\ No newline at end of file
diff --git a/config/settings/test.yml b/config/settings/test.yml
new file mode 100644
index 0000000..db955db
--- /dev/null
+++ b/config/settings/test.yml
@@ -0,0 +1,8 @@
+aws:
+ efs:
+ mount_point: "test/efs/"
+ sqs:
+ queue_name: "extractor-to-databank-test"
+ queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-test"
+ s3:
+ json_bucket: "databank-test-main"
\ No newline at end of file
diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test
new file mode 100644
index 0000000..0ca4179
--- /dev/null
+++ b/docker/extractor/Dockerfile-test
@@ -0,0 +1,31 @@
+# Test image: installs the gems and runs the test suite (`rake test`) by default.
+# Built for linux/arm64, the platform the CI workflow requests.
+FROM --platform=linux/arm64 ruby:3.1.2
+
+ENV RUBY_ENV=test
+ENV RAILS_LOG_TO_STDOUT=true
+ENV RUBY_HOME=/extractor
+ENV IS_DOCKER=true
+
+# Remove the apt package lists in the same layer so they do not bloat the image.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    git \
+    libpq-dev \
+    libarchive-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# An absolute WORKDIR creates the directory itself and keeps it equal to
+# RUBY_HOME whatever the base image's working directory is.
+WORKDIR /extractor
+
+# Install gems in a separate step so the dependencies will be cached.
+# NOTE(review): Gemfile.lock is not copied for this step (unchanged from the
+# original) - confirm that resolving fresh versions here is intended.
+COPY Gemfile ./
+RUN gem install bundler && bundle install
+
+# Copy the main application, except whatever is listed in .dockerignore.
+COPY . ./
+
+CMD ["rake", "test"]
\ No newline at end of file
diff --git a/lib/archive_extractor.rb b/lib/archive_extractor.rb
new file mode 100644
index 0000000..094aa02
--- /dev/null
+++ b/lib/archive_extractor.rb
@@ -0,0 +1,139 @@
+# frozen_string_literal: true
+require 'aws-sdk-sqs'
+require 'aws-sdk-s3'
+require 'fileutils'
+require 'json'
+require 'config'
+require 'logger'
+
+
+require_relative 'extractor/extraction'
+require_relative 'extractor/extraction_status'
+require_relative 'extractor/error_type'
+
+# Runs one extraction job: downloads an S3 object below the configured EFS
+# mount point, extracts its features, stores the JSON report in the configured
+# bucket and reports the outcome of that upload on the configured SQS queue.
+class ArchiveExtractor
+  attr_accessor :s3, :sqs, :bucket_name, :object_key, :binary_name, :web_id, :mime_type, :extraction
+  Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV']))
+  LOGGER = Logger.new(STDOUT)
+
+  # sqs and s3 are injected client objects; this class only calls
+  # send_message, get_object and put_object on them.
+  def initialize(bucket_name, object_key, binary_name, web_id, mime_type, sqs, s3)
+    @bucket_name = bucket_name
+    @object_key = object_key
+    @binary_name = binary_name
+    @web_id = web_id
+    @mime_type = mime_type
+    @sqs = sqs
+    @s3 = s3
+  end
+
+  # Runs the whole job and returns the result of send_sqs_message. The per-job
+  # working directory is removed afterwards, whether or not a step raised.
+  def extract
+    error = []
+
+    del_path = "#{Settings.aws.efs.mount_point}#{@bucket_name}_#{@web_id}"
+    local_path = "#{del_path}/#{@object_key}"
+    FileUtils.mkdir_p(File.dirname(local_path))
+
+    get_object(local_path, error)
+
+    extraction = Extraction.new(@binary_name, local_path, @web_id, @mime_type)
+    return_value = perform_extraction(extraction, error)
+    s3_path = "messages/#{@web_id}.json"
+    s3_put_status, s3_put_error = put_json_response(return_value, s3_path)
+
+    s3_put_errors = s3_put_error.map { |o| Hash[o.each_pair.to_a] }
+
+    return_value = {"bucket_name" => @bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors}
+    send_sqs_message(return_value)
+  ensure
+    # del_path is still nil if the settings lookup above raised; passing nil to
+    # rm_rf would raise from this ensure and hide the original error. The
+    # download directory is created below del_path, so removing del_path alone
+    # is the whole cleanup, and a key containing ".." can no longer make this
+    # delete a directory outside the job's working directory.
+    FileUtils.rm_rf(del_path, :secure => true) if del_path
+  end
+
+  # Downloads the object to local_path. A failure is logged and appended to
+  # error instead of being raised; error is returned either way.
+  def get_object(local_path, error)
+    @s3.get_object({
+      response_target: local_path,
+      bucket: @bucket_name,
+      key: @object_key,
+    })
+    LOGGER.info("Getting object #{@object_key} with ID #{@web_id} from #{@bucket_name}")
+    error
+  rescue StandardError => e
+    s3_error = "Error getting object #{@object_key} with ID #{@web_id} from S3 bucket #{@bucket_name}: #{e.message}"
+    LOGGER.error(s3_error)
+    error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error})
+    error
+  end
+
+  # Runs extraction.process and returns the report hash that is later
+  # serialized to JSON. A StandardError is converted into an ERROR report.
+  def perform_extraction(extraction, error)
+    extraction.process
+    status = extraction.status
+    LOGGER.info("status: #{status}")
+    LOGGER.error("error: #{extraction.error}") if status == ExtractionStatus::ERROR
+    error.concat(extraction.error)
+    items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
+    errors = error.map { |o| Hash[o.each_pair.to_a] }
+    {"web_id" => @web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
+  rescue StandardError => e
+    # NOTE(review): this entry uses different keys from the other error hashes
+    # ("error_type"/"report"); kept unchanged because consumers may rely on it.
+    error.push({"task_id" => @web_id, "extraction_process_report" => "Error extracting #{@object_key} with ID #{@web_id}: #{e.message}"})
+    errors = error.map { |o| Hash[o.each_pair.to_a] }
+    {"web_id" => @web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => nil, "nested_items" => []}
+  end
+
+  # Sends return_value as JSON to the configured queue. A send failure is
+  # logged, not raised.
+  def send_sqs_message(return_value)
+    queue_name = Settings.aws.sqs.queue_name
+    queue_url = Settings.aws.sqs.queue_url
+
+    begin
+      @sqs.send_message({
+        queue_url: queue_url,
+        message_body: return_value.to_json,
+        message_attributes: {}
+      })
+      LOGGER.info("Sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}")
+    rescue StandardError => e
+      LOGGER.error("Error sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}: #{e.message}")
+    end
+  end
+
+  # Stores return_value as JSON under s3_path in the configured bucket.
+  # Returns [status, errors]; errors stays empty when the upload succeeds.
+  def put_json_response(return_value, s3_path)
+    s3_put_error = []
+    json_bucket = Settings.aws.s3.json_bucket
+    begin
+      @s3.put_object({
+        body: return_value.to_json,
+        bucket: json_bucket,
+        key: s3_path,
+      })
+      LOGGER.info(return_value.to_json)
+      LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket} with key #{s3_path}")
+      s3_put_status = ExtractionStatus::SUCCESS
+    rescue StandardError => e
+      s3_put_status = ExtractionStatus::ERROR
+      s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket}: #{e.message}"
+      s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message})
+      LOGGER.error(s3_put_error_message)
+    end
+    [s3_put_status, s3_put_error]
+  end
+end
diff --git a/lib/extractor.rb b/lib/extractor.rb
index c1bbe0f..8392f9e 100644
--- a/lib/extractor.rb
+++ b/lib/extractor.rb
@@ -1,102 +1,17 @@
require 'aws-sdk-sqs'
require 'aws-sdk-s3'
-require 'fileutils'
-require 'json'
+require 'config'
-require_relative 'extractor/extraction.rb'
-require_relative 'extractor/extraction_status.rb'
-require_relative 'extractor/error_type.rb'
+require_relative 'archive_extractor'
class Extractor
- def self.extract(bucket_name, object_key, binary_name, web_id, mime_type)
- begin
- status = ExtractionStatus::ERROR
- error = Array.new
- s3_put_status = ExtractionStatus::SUCCESS
- s3_put_error = Array.new
- local_path = "./#{binary_name}"
- s3_path = "messages/#{web_id}.json"
-
- region = 'us-east-2'
- s3_client = Aws::S3::Client.new(region: region)
- del_path = "/mnt/efs/#{bucket_name}_#{web_id}"
- local_path = "#{del_path}/#{object_key}"
-
- dirname = File.dirname(local_path)
- unless File.directory?(dirname)
- FileUtils.mkdir_p(dirname)
- end
-
- begin
- s3_client.get_object(
- response_target: local_path,
- bucket: bucket_name,
- key: object_key,
- )
- puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}"
- rescue StandardError => e
- s3_error = "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}"
- error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error})
- puts s3_error
- end
-
- begin
- extraction = Extraction.new(binary_name, local_path, web_id, mime_type)
- extraction.process
- status = extraction.status
- puts "status: #{status}"
- puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR
- error.concat(extraction.error)
- items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] }
- errors = error.map {|o| Hash[o.each_pair.to_a]}
- return_value = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items}
- rescue
- error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"})
- errors = error.map {|o| Hash[o.each_pair.to_a]}
- return_value = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []}
- end
-
- begin
- s3_client.put_object({
- body: return_value.to_json,
- bucket: "databank-main",
- key: s3_path,
- })
- puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key #{s3_path}"
- rescue StandardError => e
- s3_put_status = ExtractionStatus::ERROR
- s3_put_error_message = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}"
- s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message})
- puts s3_put_error_message
- end
-
-
- s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]}
- return_value = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors}
-
- sqs = Aws::SQS::Client.new(region: region)
-
- # Send a message to a queue.
- queue_name = "extractor-to-databank-prod"
- queue_url = sqs.get_queue_url(queue_name: queue_name).queue_url
-
- begin
- # Create and send a message.
- sqs.send_message({
- queue_url: queue_url,
- message_body: return_value.to_json,
- message_attributes: {}
- })
- puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}"
- rescue StandardError => e
- puts "Error sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}: #{e.message}"
- end
-
- ensure
- FileUtils.rm_rf(dirname, :secure => true)
- FileUtils.rm_rf(del_path, :secure => true)
-
- end
- end
+ Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV']))
+ # Builds the S3 and SQS clients for the configured region and runs one extraction job.
+ def self.extract(bucket_name, object_key, binary_name, web_id, mime_type)
+ aws_region = Settings.aws.region
+ storage_client = Aws::S3::Client.new(region: aws_region)
+ queue_client = Aws::SQS::Client.new(region: aws_region)
+ ArchiveExtractor.new(bucket_name, object_key, binary_name, web_id, mime_type, queue_client, storage_client).extract
+ end
end
\ No newline at end of file
diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb
index 168782f..07da439 100644
--- a/lib/extractor/extraction.rb
+++ b/lib/extractor/extraction.rb
@@ -5,49 +5,52 @@
require 'mimemagic/overlay'
require 'zip'
require 'zlib'
-require 'libarchive'
+require 'ffi-libarchive'
require 'rubygems/package'
+require 'config'
+require 'logger'
-require_relative 'extraction_status.rb'
-require_relative 'peek_type.rb'
-require_relative 'error_type.rb'
-require_relative 'mime_type.rb'
+require_relative 'extraction_status'
+require_relative 'extraction_type'
+require_relative 'peek_type'
+require_relative 'error_type'
+require_relative 'mime_type'
class Extraction
attr_accessor :binary_name, :storage_path, :status, :peek_type, :peek_text, :id, :nested_items, :error, :mime_type
+ ALLOWED_CHAR_NUM = 1024 * 8
+ ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8
+ LOGGER = Logger.new(STDOUT)
def initialize(binary_name, storage_path, id, mime_type)
- @nested_items = Array.new
@binary_name = binary_name
@storage_path = storage_path
@id = id
- @error = Array.new
@mime_type = mime_type
+ @nested_items = []
+ @error = []
end
- ALLOWED_CHAR_NUM = 1024 * 8
- ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8
-
def process
begin
features_extracted = extract_features
if features_extracted
- self.status = ExtractionStatus::SUCCESS
+ @status = ExtractionStatus::SUCCESS
else
- self.status = ExtractionStatus::ERROR
+ @status = ExtractionStatus::ERROR
end
rescue StandardError => error
- self.status = ExtractionStatus::ERROR
- self.peek_type = PeekType::NONE
+ @status = ExtractionStatus::ERROR
+ @peek_type = PeekType::NONE
report_problem(error.message)
ensure
- if self.peek_text && self.peek_text.encoding.name != 'UTF-8'
+ if @peek_text && @peek_text.encoding.name != 'UTF-8'
begin
- self.peek_text.encode('UTF-8')
+ @peek_text.encode('UTF-8')
rescue Encoding::UndefinedConversionError
- self.peek_text = nil
- self.peek_type = PeekType::NONE
+ @peek_text = nil
+ @peek_type = PeekType::NONE
report_problem('invalid encoding for peek text')
rescue Exception => ex
report_problem("invalid encoding and problem character: #{ex.class}, #{ex.message}")
@@ -57,7 +60,7 @@ def process
end
def report_problem(report)
- self.error.push({"error_type" => ErrorType::EXTRACTION, "report" => report})
+ @error.push({"error_type" => ErrorType::EXTRACTION, "report" => report})
end
def extract_features
@@ -73,10 +76,9 @@ def extract_features
else
return extract_default
end
-
end
- def self.mime_from_path(path)
+ def mime_from_path(path)
file = File.open("#{path}")
file_mime_response = MimeMagic.by_path(file).to_s
file.close
@@ -94,7 +96,7 @@ def self.mime_from_path(path)
end
end
- def self.mime_from_filename(filename)
+ def mime_from_filename(filename)
mime_guesses = MIME::Types.type_for(filename).first.content_type
if mime_guesses.length > 0
mime_guesses
@@ -104,79 +106,27 @@ def self.mime_from_filename(filename)
end
def create_item(item_path, item_name, item_size, media_type, is_directory)
- item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type, "is_directory" => is_directory}
+ item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type,
+ "is_directory" => is_directory}
@nested_items.push(item)
-
end
def extract_zip
begin
- puts "Extracting zip file #{binary_name}"
+ LOGGER.info("Extracting zip file #{@binary_name}")
entry_paths = []
- Zip::File.open(self.storage_path) do |zip_file|
+ Zip::File.open(@storage_path) do |zip_file|
zip_file.each do |entry|
-
if entry.name_safe?
-
- entry_path = valid_entry_path(entry.name)
-
-
- if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
-
- entry_paths << entry_path
-
- if is_directory(entry.name)
-
- create_item(entry_path,
- name_part(entry_path),
- entry.size,
- 'directory',
- true)
-
- else
-
- storage_dir = File.dirname(storage_path)
- extracted_entry_path = File.join(storage_dir, entry_path)
- extracted_entry_dir = File.dirname(extracted_entry_path)
- FileUtils.mkdir_p extracted_entry_dir
-
- raise Exception.new("extracted entry somehow already there?!!?!") if File.exist?(extracted_entry_path)
-
- entry.extract(extracted_entry_path)
-
- raise Exception.new("extracting entry not working!") unless File.exist?(extracted_entry_path)
-
- mime_guess = Extraction.mime_from_path(extracted_entry_path) ||
- Extraction.mime_from_filename(entry.name) ||
- 'application/octet-stream'
-
- create_item(entry_path,
- name_part(entry_path),
- entry.size,
- mime_guess,
- false)
- File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
- end
-
-
- end
+ entry_paths = extract_entry(entry, entry.name, entry_paths, ExtractionType::ZIP)
end
end
end
-
-
- if entry_paths.length > 0
- self.peek_type = PeekType::LISTING
- self.peek_text = entry_paths_arr_to_html(entry_paths)
- else
- self.peek_type = PeekType::NONE
- report_problem("no items found for zip listing for task #{self.id}")
- end
-
+ handle_entry_paths(entry_paths)
return true
rescue StandardError => ex
- self.status = ExtractionStatus::ERROR
- self.peek_type = PeekType::NONE
+ @status = ExtractionStatus::ERROR
+ @peek_type = PeekType::NONE
report_problem("problem extracting zip listing for task: #{ex.message}")
raise ex
@@ -185,161 +135,112 @@ def extract_zip
+ # Lists the entries of an archive opened with Archive.read_open_filename.
+ # Each header is passed to extract_entry; the collected paths go to
+ # handle_entry_paths, whose result is returned. A StandardError is logged,
+ # recorded with report_problem, and makes the method return false.
def extract_archive
begin
- puts "Extracting archive file #{binary_name}"
+ LOGGER.info("Extracting archive file #{@binary_name}")
entry_paths = []
-
Archive.read_open_filename(self.storage_path) do |ar|
while entry = ar.next_header
-
- entry_path = valid_entry_path(entry.pathname)
-
- if entry_path
-
- if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
- entry_paths << entry_path
-
- if entry.directory? || is_directory(entry.pathname)
-
- create_item(entry_path,
- name_part(entry_path),
- entry.size,
- 'directory',
- true)
- else
-
- storage_dir = File.dirname(storage_path)
- extracted_entry_path = File.join(storage_dir, entry_path)
- extracted_entry_dir = File.dirname(extracted_entry_path)
- FileUtils.mkdir_p extracted_entry_dir
-
- file = File.open(extracted_entry_path, 'wb')
-
- raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path)
-
- mime_guess = Extraction.mime_from_path(extracted_entry_path) ||
- mime_from_filename(entry.name) ||
- 'application/octet-stream'
-
-
- create_item(entry_path,
- name_part(entry_path),
- entry.size,
- mime_guess,
- false)
- file.close
- File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
- end
-
- end
-
- end
+ entry_paths = extract_entry(entry, entry.pathname, entry_paths, ExtractionType::ARCHIVE)
end
end
-
- if entry_paths.length > 0
- self.peek_type = PeekType::LISTING
- self.peek_text = entry_paths_arr_to_html(entry_paths)
- return true
- else
- self.peek_type = PeekType::NONE
- report_problem("no items found for archive listing for task #{self.id}")
- return false
- end
+ handle_entry_paths(entry_paths)
rescue StandardError => ex
- self.status = ExtractionStatus::ERROR
- self.peek_type = PeekType::NONE
-
- report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}")
+ LOGGER.error(ex)
+ @status = ExtractionStatus::ERROR
+ @peek_type = PeekType::NONE
+ report_problem("problem extracting extract listing for task #{@id}: #{ex.message}")
return false
end
end
+ # Lists the entries of a gzipped tar file. Each entry is passed to
+ # extract_entry; the collected paths go to handle_entry_paths, whose result is
+ # returned. A StandardError is recorded with report_problem and makes the
+ # method return false. The gzip stream is always closed afterwards.
def extract_gzip
begin
- puts "Extracting gzip file #{binary_name}"
+ LOGGER.info("Extracting gzip file #{@binary_name}")
entry_paths = []
- tar_extract = Gem::Package::TarReader.new(Zlib::GzipReader.open(self.storage_path))
- tar_extract.rewind # The extract has to be rewinded after every iteration
+ gzip_io = Zlib::GzipReader.open(@storage_path)
+ tar_extract = Gem::Package::TarReader.new(gzip_io)
+ tar_extract.rewind # The extract has to be rewound after every iteration
tar_extract.each do |entry|
+ entry_paths = extract_entry(entry, entry.full_name, entry_paths, ExtractionType::GZIP)
+ end
+ handle_entry_paths(entry_paths)
- entry_path = valid_entry_path(entry.full_name)
- if entry_path
-
- if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
-# puts entry.full_name
-
- entry_paths << entry_path
-
- if entry.directory?
-
- create_item(entry_path,
- name_part(entry_path),
- entry.size,
- 'directory',
- true)
- else
-
- storage_dir = File.dirname(storage_path)
- extracted_entry_path = File.join(storage_dir, entry_path)
- extracted_entry_dir = File.dirname(extracted_entry_path)
- FileUtils.mkdir_p extracted_entry_dir
-
- file = File.open(extracted_entry_path, 'wb')
-
- raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path)
-
- mime_guess = Extraction.mime_from_path(extracted_entry_path) ||
- mime_from_filename(entry.name) ||
- 'application/octet-stream'
+ rescue StandardError => ex
+ @status = ExtractionStatus::ERROR
+ @peek_type = PeekType::NONE
+ report_problem("problem extracting extract listing for task #{@id}: #{ex.message}")
+ return false
+ end
+ ensure
+ # Both variables are still nil when opening the file raised, so guard each close.
+ # TarReader#close leaves the wrapped IO open, so the gzip stream is closed here as well.
+ tar_extract&.close
+ gzip_io.close if gzip_io && !gzip_io.closed?
+ end
- create_item(entry_path,
- name_part(entry_path),
- entry.size,
- mime_guess,
- false)
- file.close
- File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
- end
+ # Records one archive entry as a nested item and appends its path to
+ # entry_paths, which is returned. Entries rejected by valid_entry_path,
+ # is_ds_store, is_mac_thing or is_mac_tar_thing are skipped. For a file entry
+ # a temporary file is created next to the archive (extracted for ZIP, empty
+ # otherwise), used for the MIME lookup, and removed again before returning.
+ def extract_entry(entry, entry_name, entry_paths, type)
+ entry_path = valid_entry_path(entry_name)
+ if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path)
+ entry_paths << entry_path
+ if entry.directory? || is_directory(entry_name)
+ create_item(entry_path,
+ name_part(entry_path),
+ entry.size,
+ 'directory',
+ true)
+ else
+ storage_dir = File.dirname(storage_path)
+ extracted_entry_path = File.join(storage_dir, entry_path)
+ extracted_entry_dir = File.dirname(extracted_entry_path)
+ FileUtils.mkdir_p extracted_entry_dir
- end
+ # Raised as RuntimeError rather than Exception so the callers' `rescue StandardError` reports it.
+ # Checked before the begin/ensure below so a pre-existing file is never deleted.
+ raise("extracted entry somehow already there?!!?!") if File.exist?(extracted_entry_path)
+ file = nil
+ begin
+ case type
+ when ExtractionType::ZIP
+ entry.extract(extracted_entry_path)
+ else
+ file = File.open(extracted_entry_path, 'wb')
end
+ raise("extracting #{type} entry not working!") unless File.exist?(extracted_entry_path)
+
+ mime_guess = mime_from_path(extracted_entry_path) ||
+ mime_from_filename(entry_name) ||
+ 'application/octet-stream'
+
+ create_item(entry_path,
+ name_part(entry_path),
+ entry.size,
+ mime_guess,
+ false)
+ ensure
+ # Runs on failure too, so no open handle or temporary file is left behind.
+ file.close if file
+ File.delete(extracted_entry_path) if File.exist?(extracted_entry_path)
+ end
end
+ end
+ entry_paths
+ end
- if entry_paths.length > 0
- self.peek_type = PeekType::LISTING
- self.peek_text = entry_paths_arr_to_html(entry_paths)
- return true
- else
- self.peek_type = PeekType::NONE
- report_problem("no items found for archive listing for task #{self.id}")
- return false
- end
-
- rescue StandardError => ex
- self.status = ExtractionStatus::ERROR
- self.peek_type = PeekType::NONE
-
- report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}")
+ # Stores the listing built from entry_paths as the peek text and returns true.
+ # When entry_paths is empty, records a problem, sets the peek type to NONE
+ # and returns false.
+ def handle_entry_paths(entry_paths)
+ if entry_paths.length > 0
+ @peek_type = PeekType::LISTING
+ @peek_text = entry_paths_arr_to_html(entry_paths)
+ LOGGER.debug(@peek_text) # was a bare `puts`; now logged like the other messages of this class
+ return true
+ else
+ @peek_type = PeekType::NONE
+ report_problem("no items found for archive listing for task #{@id}")
return false
-
- tar_extract.close
end
-
end
+ # Called from the else branch of extract_features. Logs the file name, sets
+ # the peek type to NONE and returns true; no listing is produced.
def extract_default
- puts "Default extraction for #{binary_name}"
+ LOGGER.info("Default extraction for #{@binary_name}")
begin
- self.peek_type = PeekType::NONE
+ @peek_type = PeekType::NONE
return true
rescue StandardError => ex
+ # NOTE(review): the begin body is only an assignment and a return, so this
+ # branch looks unreachable in practice - confirm before relying on it.
- self.status = ExtractionStatus::ERROR
- self.peek_type = PeekType::NONE
- report_problem("problem creating default peek for task #{self.id}")
+ @status = ExtractionStatus::ERROR
+ @peek_type = PeekType::NONE
+ report_problem("problem creating default peek for task #{@id}: #{ex}")
return false
end
end
@@ -389,7 +290,7 @@ def name_part(path)
def entry_paths_arr_to_html(entry_paths)
return_string = ' '
- return_string << self.binary_name
+ return_string << @binary_name
entry_paths.each do |entry_path|
diff --git a/lib/extractor/extraction_type.rb b/lib/extractor/extraction_type.rb
new file mode 100644
index 0000000..cbe1283
--- /dev/null
+++ b/lib/extractor/extraction_type.rb
@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+# String identifiers for the kind of extraction being performed; callers
+# compare a `type` argument against these constants.
+class ExtractionType
+  ARCHIVE = 'archive'
+  GZIP = 'gzip'
+  ZIP = 'zip'
+end
diff --git a/test/archive_extractor_test.rb b/test/archive_extractor_test.rb
new file mode 100644
index 0000000..1ed4313
--- /dev/null
+++ b/test/archive_extractor_test.rb
@@ -0,0 +1,158 @@
+# frozen_string_literal: true
+require_relative 'test_helper'
+
+class TestArchiveExtractor < Minitest::Test
+ Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test'))
+  # Builds an ArchiveExtractor wired to mock SQS and S3 clients before each test.
+  def setup
+    @sqs = Minitest::Mock.new
+    @s3 = Minitest::Mock.new
+    @archive_extractor = ArchiveExtractor.new('test-bucket', 'test-key', 'test', 'test-id',
+                                              'application/zip', @sqs, @s3)
+  end
+
+ # End-to-end check of ArchiveExtractor#extract for the zip fixture: the
+ # fixture is staged at the local path, the S3 and SQS mocks are primed with
+ # the exact calls expected, and both mocks are verified afterwards.
+ def test_extract
+ # setup
+ @archive_extractor.binary_name = 'test.zip'
+ @archive_extractor.web_id = 'test-zip'
+ @archive_extractor.mime_type = 'application/zip'
+ @archive_extractor.object_key = 'test.zip'
+ del_path = "#{Settings.aws.efs.mount_point}#{@archive_extractor.bucket_name}_#{@archive_extractor.web_id}"
+ local_path = "#{del_path}/#{@archive_extractor.object_key}"
+ file_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+ dirname = File.dirname(local_path)
+ unless File.directory?(dirname)
+ FileUtils.mkdir_p(dirname)
+ end
+ # The mocked get_object returns nil and writes nothing, so the fixture is
+ # copied to the download target up front.
+ FileUtils.cp(file_path, local_path)
+ @s3.expect(:get_object, nil, [{response_target: local_path, bucket: @archive_extractor.bucket_name,
+ key: @archive_extractor.object_key}])
+ peek_text = " test.zip
test.txt
"
+ items = [{'item_name' => 'test.txt', 'item_path' => 'test.txt', 'item_size' => 12, 'media_type' => 'text/plain', 'is_directory' => false}]
+ # JSON document expected to be written to the JSON bucket.
+ return_value = {'web_id' => 'test-zip', 'status' => ExtractionStatus::SUCCESS, 'error' => [], 'peek_type' => PeekType::LISTING, 'peek_text' => peek_text, 'nested_items' => items}
+ s3_path = 'messages/test-zip.json'
+ @s3.expect(:put_object, [], [{body: return_value.to_json, bucket: Settings.aws.s3.json_bucket, key: s3_path}])
+ # return_value is reused: this is the SQS message body expected afterwards.
+ return_value = {'bucket_name' => 'test-bucket', 'object_key' => s3_path, 's3_status' => ExtractionStatus::SUCCESS, 'error' => []}
+ @sqs.expect(:send_message, nil, [{queue_url: Settings.aws.sqs.queue_url,
+ message_body: return_value.to_json,
+ message_attributes:{}}])
+
+ # test
+ @archive_extractor.extract
+
+ # verify
+ assert_mock(@s3)
+ assert_mock(@sqs)
+ end
+
+  # get_object should request the extractor's bucket/key from S3, targeting the
+  # given local path, and return an empty error list.
+  def test_get_object
+    target = 'test/path'
+    expected_request = { response_target: target,
+                         bucket: @archive_extractor.bucket_name,
+                         key: @archive_extractor.object_key }
+    @s3.expect(:get_object, nil, [expected_request])
+
+    errors = @archive_extractor.get_object(target, [])
+
+    assert_mock(@s3)
+    assert_empty(errors)
+  end
+
+  # get_object should report an ErrorType::S3_GET entry when the S3 client
+  # raises while fetching the object.
+  def test_get_object_error
+    # setup
+    stub_s3 = Aws::S3::Client.new(region: Settings.aws.region)
+    @archive_extractor.s3 = stub_s3
+    local_path = 'test/path'
+    # Minitest's stub forwards the caller's arguments to the callable, so the
+    # lambda must accept them; a zero-arity lambda would fail with
+    # ArgumentError instead of raising the intended StandardError.
+    raises_exception = ->(*_args, **_opts) { raise StandardError.new }
+
+    # test and verify
+    stub_s3.stub :get_object, raises_exception do
+      error = @archive_extractor.get_object(local_path, [])
+      assert(error.first.value?(ErrorType::S3_GET))
+    end
+  end
+
+ # perform_extraction on the zip fixture should return a hash whose values
+ # include the LISTING peek type and the expected peek text.
+ def test_perform_extraction
+ # setup
+ binary_name = 'test.zip'
+ web_id = 'test-zip'
+ mime_type = 'application/zip'
+ local_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+ extraction = Extraction.new(binary_name, local_path, web_id, mime_type)
+
+ # test
+ return_value = @archive_extractor.perform_extraction(extraction, [])
+
+ # verify
+ assert(return_value.value?(PeekType::LISTING))
+ exp_peek_text = " test.zip test.txt
"
+ assert(return_value.value?(exp_peek_text))
+
+ end
+
+  # When Extraction#process raises, perform_extraction should still return a
+  # result carrying the NONE peek type and the ERROR status.
+  def test_perform_extraction_error
+    fixture = "#{ENV['RUBY_HOME']}/test/test.zip"
+    failing_extraction = Extraction.new('test.zip', fixture, 'test-zip', 'application/zip')
+    blow_up = -> { raise StandardError.new }
+
+    failing_extraction.stub :process, blow_up do
+      result = @archive_extractor.perform_extraction(failing_extraction, [])
+      assert(result.value?(PeekType::NONE))
+      assert(result.value?(ExtractionStatus::ERROR))
+    end
+  end
+
+  # send_sqs_message should post the JSON-encoded payload to the configured
+  # queue with empty message attributes.
+  def test_send_sqs_message
+    payload = { 'test' => 'retVal' }
+    expected_message = { queue_url: Settings.aws.sqs.queue_url,
+                         message_body: payload.to_json,
+                         message_attributes: {} }
+    @sqs.expect(:send_message, nil, [expected_message])
+
+    @archive_extractor.send_sqs_message(payload)
+
+    assert_mock(@sqs)
+  end
+
+  # put_json_response should write the JSON payload to the JSON bucket under
+  # the given key and return a SUCCESS status with no errors.
+  def test_put_json_response
+    payload = { 'test' => 'retVal' }
+    key = 'test/s3/key'
+    @s3.expect(:put_object, nil,
+               [{ body: payload.to_json, bucket: Settings.aws.s3.json_bucket, key: key }])
+
+    status, errors = @archive_extractor.put_json_response(payload, key)
+
+    assert_mock(@s3)
+    assert_equal(ExtractionStatus::SUCCESS, status)
+    assert_empty(errors)
+  end
+
+  # put_json_response should return an ERROR status and a non-empty error list
+  # when the S3 client raises while writing the object.
+  def test_put_json_response_error
+    # setup
+    return_value = { 'test' => 'error' }
+    s3_path = 'test/s3/error'
+    stub_s3 = Aws::S3::Client.new(region: Settings.aws.region)
+    @archive_extractor.s3 = stub_s3
+    # Minitest's stub forwards the caller's arguments to the callable, so the
+    # lambda must accept them; a zero-arity lambda would fail with
+    # ArgumentError instead of raising the intended StandardError.
+    raises_exception = ->(*_args, **_opts) { raise StandardError.new }
+
+    # test and verify
+    stub_s3.stub :put_object, raises_exception do
+      s3_put_status, s3_put_error = @archive_extractor.put_json_response(return_value, s3_path)
+      assert_equal(ExtractionStatus::ERROR, s3_put_status)
+      refute_empty(s3_put_error)
+    end
+  end
+end
+
diff --git a/test/extraction_test.rb b/test/extraction_test.rb
new file mode 100644
index 0000000..7af66de
--- /dev/null
+++ b/test/extraction_test.rb
@@ -0,0 +1,356 @@
+# frozen_string_literal: true
+require_relative 'test_helper'
+
+class TestExtraction < Minitest::Test
+ Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test'))
+  # Creates a fresh Extraction pointed at a path under the EFS mount point
+  # before each test.
+  def setup
+    id = 'test-id'
+    path = "#{Settings.aws.efs.mount_point}test-bucket_#{id}/test-key"
+    @extraction = Extraction.new('test-binary', path, id, 'application/zip')
+  end
+
+  # Processing the gzip fixture should finish with SUCCESS and a LISTING peek.
+  def test_process
+    {
+      binary_name: 'test.txt.gz',
+      storage_path: "#{ENV['RUBY_HOME']}/test/test.txt.gz",
+      id: 'test-gzip',
+      mime_type: 'application/gzip'
+    }.each { |attribute, value| @extraction.public_send("#{attribute}=", value) }
+
+    @extraction.process
+
+    assert_equal(ExtractionStatus::SUCCESS, @extraction.status)
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+  end
+
+  # report_problem should append an EXTRACTION error entry carrying the report.
+  def test_report_problem
+    message = 'Test report'
+
+    @extraction.report_problem(message)
+
+    expected_entry = { 'error_type' => ErrorType::EXTRACTION, 'report' => message }
+    assert_includes(@extraction.error, expected_entry)
+  end
+
+ # extract_features on the gzip fixture should produce a LISTING peek with
+ # the expected peek text.
+ def test_extract_features_gzip
+ # setup
+ @extraction.binary_name = 'test.txt.gz'
+ @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz"
+ @extraction.id = 'test-gzip'
+ @extraction.mime_type = 'application/gzip'
+
+ # test
+ @extraction.extract_features
+
+ # verify
+ assert_equal(PeekType::LISTING, @extraction.peek_type)
+ exp_peek_text = " test.txt.gz testing\n
"
+ assert_equal(exp_peek_text, @extraction.peek_text)
+ end
+
+ # extract_features on the zip fixture should produce a LISTING peek with
+ # the expected peek text.
+ def test_extract_features_zip
+ # setup
+ @extraction.binary_name = 'test.zip'
+ @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+ @extraction.id = 'test-zip'
+ @extraction.mime_type = 'application/zip'
+
+ # test
+ @extraction.extract_features
+
+ # verify
+ assert_equal(PeekType::LISTING, @extraction.peek_type)
+ exp_peek_text = " test.zip test.txt
"
+ assert_equal(exp_peek_text, @extraction.peek_text)
+ end
+
+  # With the 'application/directory' mime type, extract_features should leave
+  # the peek type at NONE.
+  def test_extract_features_default
+    {
+      binary_name: 'test',
+      storage_path: "#{ENV['RUBY_HOME']}/test",
+      id: 'test-default',
+      mime_type: 'application/directory'
+    }.each { |attribute, value| @extraction.public_send("#{attribute}=", value) }
+
+    @extraction.extract_features
+
+    assert_equal(PeekType::NONE, @extraction.peek_type)
+  end
+
+  # mime_from_path should identify the Ruby helper script as application/x-ruby.
+  def test_mime_from_path
+    detected = @extraction.mime_from_path("#{ENV['RUBY_HOME']}/bin/set-test-vars.rb")
+
+    assert_equal('application/x-ruby', detected)
+  end
+
+  # mime_from_filename should map a .zip filename to application/zip.
+  def test_mime_from_filename
+    detected = @extraction.mime_from_filename('test.zip')
+
+    assert_equal('application/zip', detected)
+  end
+
+  # create_item should add a hash describing the item to nested_items.
+  def test_create_item
+    expected = { 'item_name' => 'thing',
+                 'item_path' => 'test/item/path/thing',
+                 'item_size' => 123,
+                 'media_type' => 'directory',
+                 'is_directory' => true }
+
+    @extraction.create_item(expected['item_path'], expected['item_name'], expected['item_size'],
+                            expected['media_type'], expected['is_directory'])
+
+    assert_includes(@extraction.nested_items, expected)
+  end
+
+ # extract_zip on the zip fixture should produce a LISTING peek with the
+ # expected peek text.
+ def test_extract_zip
+ # setup
+ @extraction.binary_name = 'test.zip'
+ @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip"
+ @extraction.id = 'test-zip'
+ @extraction.mime_type = 'application/zip'
+
+ # test
+ @extraction.extract_zip
+
+ # verify
+ assert_equal(PeekType::LISTING, @extraction.peek_type)
+ exp_peek_text = " test.zip test.txt
"
+ assert_equal(exp_peek_text, @extraction.peek_text)
+ end
+
+ # extract_archive on the tar fixture should produce a LISTING peek with the
+ # expected peek text.
+ def test_extract_archive
+ # setup
+ @extraction.binary_name = 'test.tar'
+ @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.tar"
+ @extraction.id = 'test-tar'
+ @extraction.mime_type = 'application/x-tar'
+ # Clear the peek type so the assertion below reflects what extract_archive set.
+ @extraction.peek_type = nil
+
+ # test
+ @extraction.extract_archive
+
+ # verify
+ assert_equal(PeekType::LISTING, @extraction.peek_type)
+ exp_peek_text = " test.tar test.txt
"
+ assert_equal(exp_peek_text, @extraction.peek_text)
+ end
+
+ # extract_gzip on the gzip fixture should produce a LISTING peek with the
+ # expected peek text.
+ def test_extract_gzip
+ # setup
+ @extraction.binary_name = 'test.txt.gz'
+ @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz"
+ @extraction.id = 'test-gzip'
+ @extraction.mime_type = 'application/gzip'
+
+ # test
+ @extraction.extract_gzip
+
+ # verify
+ assert_equal(PeekType::LISTING, @extraction.peek_type)
+ exp_peek_text = " test.txt.gz testing\n
"
+ assert_equal(exp_peek_text, @extraction.peek_text)
+ end
+
+  # extract_entry with a mocked, non-directory entry should return the entry
+  # name among the paths and register a matching nested item.
+  def test_extract_entry
+    script_path = "#{ENV['RUBY_HOME']}/bin/set-test-vars.rb"
+    entry = Minitest::Mock.new
+    entry.expect(:directory?, false)
+    entry.expect(:size, 123)
+
+    collected = @extraction.extract_entry(entry, script_path, [], ExtractionType::GZIP)
+
+    assert_mock(entry)
+    assert_includes(collected, script_path)
+    assert_includes(@extraction.nested_items,
+                    { 'item_name' => 'set-test-vars.rb', 'item_path' => script_path, 'item_size' => 123,
+                      'media_type' => 'application/x-ruby', 'is_directory' => false })
+  end
+
+  # A non-empty path list should yield a truthy result, the expected peek text
+  # and the LISTING peek type.
+  def test_handle_entry_paths
+    listed = @extraction.handle_entry_paths(['test/path'])
+
+    assert(listed)
+    assert_equal(" test-binary", @extraction.peek_text)
+    assert_equal(PeekType::LISTING, @extraction.peek_type)
+  end
+
+  # An empty path list should return false, set the NONE peek type and record
+  # the "no items found" problem for the task.
+  def test_handle_entry_paths_empty
+    outcome = @extraction.handle_entry_paths([])
+
+    assert_equal(false, outcome)
+    assert_equal(PeekType::NONE, @extraction.peek_type)
+    expected_problem = { 'error_type' => ErrorType::EXTRACTION,
+                         'report' => "no items found for archive listing for task #{@extraction.id}" }
+    assert_includes(@extraction.error, expected_problem)
+  end
+
+  # extract_default should leave the peek type at NONE.
+  def test_extract_default
+    @extraction.extract_default
+
+    assert_equal(PeekType::NONE, @extraction.peek_type)
+  end
+
+  # valid_entry_path should return the path unchanged, strip a trailing slash,
+  # and return nil for an empty string.
+  def test_valid_entry_path
+    base = 'test/path'
+
+    assert_equal(base, @extraction.valid_entry_path(base))
+    assert_equal(base, @extraction.valid_entry_path("#{base}/"))
+    assert_nil(@extraction.valid_entry_path(''))
+  end
+
+  # is_directory should be true for RUBY_HOME and for a path ending in a
+  # slash, and false for a plain path, a '._' path and a .DS_Store path.
+  def test_is_directory
+    cases = [
+      [ENV['RUBY_HOME'], true],
+      ['test/path', false],
+      ['test/path/', true],
+      ['this/is/a/mac/._path', false],
+      ['test/path/.DS_Store', false]
+    ]
+
+    # Evaluate every path first, then assert, so all calls happen up front.
+    results = cases.map { |path, _expected| @extraction.is_directory(path) }
+
+    cases.each_with_index do |(_path, expected), index|
+      assert_equal(expected, results[index])
+    end
+  end
+
+  # is_mac_thing should be true for a path containing __MACOSX and false for
+  # an ordinary path.
+  def test_is_mac_thing
+    assert_equal(true, @extraction.is_mac_thing('this/is/a/mac/path/__MACOSX'))
+    assert_equal(false, @extraction.is_mac_thing('this/is/not/a/mac/path'))
+  end
+
+  # is_mac_tar_thing should be true for '._', PaxHeader and @LongLink paths
+  # and false for an ordinary path.
+  def test_is_mac_tar_thing
+    expectations = [
+      ['this/is/a/mac/._path', true],
+      ['PaxHeader/this/is/a/mac/path', true],
+      ['this/is/a/mac/path/@LongLink', true],
+      ['this/is/not/a/mac/path', false]
+    ]
+
+    expectations.each do |path, expected|
+      assert_equal(expected, @extraction.is_mac_tar_thing(path))
+    end
+  end
+
+  # ends_in_slash should be true only when the path has a trailing slash.
+  def test_ends_in_slash
+    assert_equal(true, @extraction.ends_in_slash('test/path/'))
+    assert_equal(false, @extraction.ends_in_slash('test/path'))
+  end
+
+  # is_ds_store should be true for a .DS_Store path and false otherwise.
+  def test_is_ds_store
+    assert_equal(true, @extraction.is_ds_store('test/path/.DS_Store'))
+    assert_equal(false, @extraction.is_ds_store('test/path'))
+  end
+
+  # name_part should return the last path segment, the name itself when there
+  # is no separator, and nil for an empty string.
+  def test_name_part
+    assert_equal('path', @extraction.name_part('test/path'))
+    assert_equal('test', @extraction.name_part('test'))
+    assert_nil(@extraction.name_part(''))
+  end
+
+  # entry_paths_arr_to_html should render the expected listing text for a
+  # single entry path.
+  def test_entry_paths_arr_to_html
+    rendered = @extraction.entry_paths_arr_to_html(['test/path'])
+
+    assert_equal(" test-binary", rendered)
+  end
+end
diff --git a/test/test.tar b/test/test.tar
new file mode 100644
index 0000000..95dc7f6
Binary files /dev/null and b/test/test.tar differ
diff --git a/test/test.txt.gz b/test/test.txt.gz
new file mode 100644
index 0000000..4f821bc
Binary files /dev/null and b/test/test.txt.gz differ
diff --git a/test/test.zip b/test/test.zip
new file mode 100644
index 0000000..d048dc0
Binary files /dev/null and b/test/test.zip differ
diff --git a/test/test_helper.rb b/test/test_helper.rb
new file mode 100644
index 0000000..ec88121
--- /dev/null
+++ b/test/test_helper.rb
@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+
+require 'simplecov'
+SimpleCov.start
+
+require 'minitest/autorun'
+require 'config'
+require 'csv'
+require 'json'
+require_relative '../lib/archive_extractor'
+require_relative '../lib/extractor'
+require_relative '../lib/extractor/error_type'
+require_relative '../lib/extractor/extraction'
+require_relative '../lib/extractor/extraction_status'
+require_relative '../lib/extractor/extraction_type'
+require_relative '../lib/extractor/mime_type'
+require_relative '../lib/extractor/peek_type'
\ No newline at end of file