diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..525771d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,22 @@ +on: + push: + branches: + - test + - demo + - prod + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout the repository + uses: actions/checkout@v4 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: . + file: ./docker/extractor/Dockerfile-test + tags: extractor-test + platforms: linux/arm64 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 9106b2a..fe3b9cd 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ /pkg/ /spec/reports/ /tmp/ +/.idea/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..73f69e0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/databank-archive-extractor.iml b/.idea/databank-archive-extractor.iml new file mode 100644 index 0000000..f279c8f --- /dev/null +++ b/.idea/databank-archive-extractor.iml @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..b0db9b0 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..1f18249 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..1127073 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.ruby-version b/.ruby-version new file mode 100644 index 0000000..ef538c2 --- /dev/null +++ b/.ruby-version @@ -0,0 +1 @@ +3.1.2 diff --git a/Dockerfile b/Dockerfile index 2de759a..6fd469e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,10 @@ # N.B.: this must match the Ruby version in the Gemfile, and /.ruby-version. -FROM ruby:2.7.2 +FROM ruby:3.1.2 -ENV RAILS_ENV=production +ENV RUBY_ENV=prod ENV RAILS_LOG_TO_STDOUT=true ENV RAILS_SERVE_STATIC_FILES=true +ENV RUBY_HOME=/extractor RUN apt-get update && apt-get install -y \ build-essential \ @@ -11,22 +12,18 @@ RUN apt-get update && apt-get install -y \ libpq-dev \ libarchive-dev -RUN mkdir app -WORKDIR app +RUN mkdir extractor +WORKDIR extractor # Copy the Gemfile as well as the Gemfile.lock and install gems. # This is a separate step so the dependencies will be cached. COPY Gemfile Gemfile.lock ./ -RUN gem install bundler && bundle install --without development test --jobs 20 --retry 5 +RUN gem install bundler && bundle install # Copy the main application, except whatever is listed in .dockerignore. COPY . ./ -#RUN bin/rails assets:precompile - -EXPOSE 3000 - # This is the web server entry point. It will need to be overridden when # running the workers. CMD ["echo", "Error running task, please check the container override command!"] diff --git a/Gemfile b/Gemfile index 73c0c87..0b08bd5 100644 --- a/Gemfile +++ b/Gemfile @@ -1,16 +1,16 @@ source "https://rubygems.org" git_source(:github) { |repo| "https://github.com/#{repo}.git" } -ruby '2.7.2' - # Use mime-types to determine mimetypes based on extension gem 'mime-types', require: 'mime/types/full' # Use rubyzip to read zip files gem 'rubyzip' +gem 'config' + # Use archive for non-zip archive files -gem 'libarchive' +gem 'ffi-libarchive' # Use os to interact with operating system gem 'os' @@ -24,3 +24,7 @@ gem "aws-sdk-s3" gem "aws-sdk-sqs" +gem 'minitest' + +gem 'simplecov' + diff --git a/Gemfile.lock b/Gemfile.lock index c50f5a1..a291f6f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,53 +1,103 @@ GEM remote: https://rubygems.org/ specs: - aws-eventstream (1.1.0) - aws-partitions (1.416.0) - aws-sdk-core (3.111.0) - aws-eventstream (~> 1, >= 1.0.2) - aws-partitions (~> 1, >= 1.239.0) - aws-sigv4 (~> 1.1) - jmespath (~> 1.0) - aws-sdk-kms (1.41.0) - aws-sdk-core (~> 3, >= 3.109.0) + aws-eventstream (1.3.0) + aws-partitions (1.862.0) + aws-sdk-core (3.190.0) + aws-eventstream (~> 1, >= 1.3.0) + aws-partitions (~> 1, >= 1.651.0) + aws-sigv4 (~> 1.8) + jmespath (~> 1, >= 1.6.1) + aws-sdk-kms (1.74.0) + aws-sdk-core (~> 3, >= 3.188.0) aws-sigv4 (~> 1.1) - aws-sdk-s3 (1.87.0) - aws-sdk-core (~> 3, >= 3.109.0) + aws-sdk-s3 (1.141.0) + aws-sdk-core (~> 3, >= 3.189.0) aws-sdk-kms (~> 1) + aws-sigv4 (~> 1.8) + aws-sdk-sqs (1.69.0) + aws-sdk-core (~> 3, >= 3.188.0) aws-sigv4 (~> 1.1) - aws-sdk-sqs (1.35.0) - aws-sdk-core (~> 3, >= 3.109.0) - aws-sigv4 (~> 1.1) - aws-sigv4 (1.2.2) + aws-sigv4 (1.8.0) aws-eventstream (~> 1, >= 1.0.2) - jmespath (1.4.0) - mime-types (3.3.1) + concurrent-ruby (1.2.2) + config (5.0.0) + deep_merge (~> 1.2, >= 1.2.1) + dry-validation (~> 1.0, >= 1.0.0) + deep_merge (1.2.2) + docile (1.4.0) + dry-configurable (1.1.0) + dry-core (~> 1.0, < 2) + zeitwerk (~> 2.6) + dry-core (1.0.1) + concurrent-ruby (~> 1.0) + zeitwerk (~> 2.6) + dry-inflector (1.0.0) + dry-initializer (3.1.1) + dry-logic (1.5.0) + concurrent-ruby (~> 1.0) + dry-core (~> 1.0, < 2) + zeitwerk (~> 2.6) + dry-schema (1.13.3) + concurrent-ruby (~> 1.0) + dry-configurable (~> 1.0, >= 1.0.1) + dry-core (~> 1.0, < 2) + dry-initializer (~> 3.0) + dry-logic (>= 1.4, < 2) + dry-types (>= 1.7, < 2) + zeitwerk (~> 2.6) + dry-types (1.7.1) + concurrent-ruby (~> 1.0) + dry-core (~> 1.0) + dry-inflector (~> 1.0) + dry-logic (~> 1.4) + zeitwerk (~> 2.6) + dry-validation (1.10.0) + concurrent-ruby (~> 1.0) + dry-core (~> 1.0, < 2) + dry-initializer (~> 3.0) + dry-schema (>= 1.12, < 2) + zeitwerk (~> 2.6) + ffi (1.16.3) + ffi-libarchive (1.1.13) + ffi (~> 1.0) + jmespath (1.6.2) + mime-types (3.5.1) mime-types-data (~> 3.2015) - mime-types-data (3.2020.1104) + mime-types-data (3.2023.1003) mimemagic (0.3.10) nokogiri (~> 1) rake - nokogiri (1.11.2-x86_64-darwin) + minitest (5.20.0) + nokogiri (1.15.5-x86_64-darwin) racc (~> 1.4) - os (1.1.1) - racc (1.5.2) - rake (13.0.3) - rubyzip (2.3.0) + os (1.1.4) + racc (1.7.3) + rake (13.1.0) + rubyzip (2.3.2) + simplecov (0.22.0) + docile (~> 1.1) + simplecov-html (~> 0.11) + simplecov_json_formatter (~> 0.1) + simplecov-html (0.12.3) + simplecov_json_formatter (0.1.4) + zeitwerk (2.6.12) PLATFORMS - x86_64-darwin-19 + x86_64-darwin-21 DEPENDENCIES aws-sdk-s3 aws-sdk-sqs + config + ffi-libarchive mime-types mimemagic (~> 0.3.6) + minitest os rake (~> 13.0) rubyzip - -RUBY VERSION - ruby 2.7.2p137 + simplecov BUNDLED WITH - 2.2.4 + 2.3.22 diff --git a/Rakefile b/Rakefile index cd510a0..0abb012 100644 --- a/Rakefile +++ b/Rakefile @@ -1,4 +1,11 @@ # frozen_string_literal: true -require "bundler/gem_tasks" -task default: %i[] +require 'rake/testtask' +require 'simplecov' +require_relative 'bin/set-test-vars' + +Rake::TestTask.new(:test) do |t| + t.libs << 'lib' << 'test' + # t.libs << 'lib' + t.test_files = FileList['test/*_test.rb'] +end diff --git a/bin/console b/bin/console deleted file mode 100755 index 8096e50..0000000 --- a/bin/console +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env ruby -# frozen_string_literal: true - -require "bundler/setup" -require "databank/archive/extractor" - -# You can add fixtures and/or initialization code here to make experimenting -# with your gem easier. You can also use a different console, if you like. - -# (If you use this, don't forget to add pry to your Gemfile!) -# require "pry" -# Pry.start - -require "irb" -IRB.start(__FILE__) diff --git a/bin/set-test-vars.rb b/bin/set-test-vars.rb new file mode 100644 index 0000000..7959c7c --- /dev/null +++ b/bin/set-test-vars.rb @@ -0,0 +1,5 @@ +#!/usr/bin/env ruby + +ENV['RUBY_ENV'] = 'test' +ENV['RUBY_HOME'] = ENV['IS_DOCKER'] == 'true' ? '/extractor' : '/Users/gschmitt/workspace/databank-archive-extractor' +ENV['RUBY_TEST_HOME'] = "#{ENV['RUBY_HOME']}/test" diff --git a/bin/setup b/bin/setup deleted file mode 100755 index dce67d8..0000000 --- a/bin/setup +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail -IFS=$'\n\t' -set -vx - -bundle install - -# Do any other automated setup that you need to do here diff --git a/config/settings.yml b/config/settings.yml new file mode 100644 index 0000000..ed17934 --- /dev/null +++ b/config/settings.yml @@ -0,0 +1,3 @@ +aws: + region: "us-east-2" + diff --git a/config/settings/demo.yml b/config/settings/demo.yml new file mode 100644 index 0000000..18367ed --- /dev/null +++ b/config/settings/demo.yml @@ -0,0 +1,8 @@ +aws: + efs: + mount_point: "/mnt/efs/" + sqs: + queue_name: "extractor-to-databank-demo" + queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-demo" + s3: + json_bucket: "databank-demo-main" diff --git a/config/settings/prod.yml b/config/settings/prod.yml new file mode 100644 index 0000000..40c24e7 --- /dev/null +++ b/config/settings/prod.yml @@ -0,0 +1,8 @@ +aws: + efs: + mount_point: "/mnt/efs/" + sqs: + queue_name: "extractor-to-databank-prod" + queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-prod" + s3: + json_bucket: "databank-main" \ No newline at end of file diff --git a/config/settings/test.yml b/config/settings/test.yml new file mode 100644 index 0000000..db955db --- /dev/null +++ b/config/settings/test.yml @@ -0,0 +1,8 @@ +aws: + efs: + mount_point: "test/efs/" + sqs: + queue_name: "extractor-to-databank-test" + queue_url: "https://sqs.us-east-2.amazonaws.com/721945215539/extractor-to-databank-test" + s3: + json_bucket: "databank-test-main" \ No newline at end of file diff --git a/docker/extractor/Dockerfile-test b/docker/extractor/Dockerfile-test new file mode 100644 index 0000000..0ca4179 --- /dev/null +++ b/docker/extractor/Dockerfile-test @@ -0,0 +1,28 @@ +FROM --platform=linux/arm64 ruby:3.1.2 +#FROM ruby:3.1.2 + +ENV RUBY_ENV=test +ENV RAILS_LOG_TO_STDOUT=true +ENV RUBY_HOME=/extractor +ENV IS_DOCKER=true + +RUN apt-get update && apt-get install -y \ + build-essential \ + git \ + libpq-dev \ + libarchive-dev + +# Copy the Gemfile as well as the Gemfile.lock and install gems. +# This is a separate step so the dependencies will be cached. +RUN mkdir extractor +WORKDIR extractor + +#COPY Gemfile Gemfile.lock ./ +COPY Gemfile ./ +RUN gem install bundler && bundle install + +# Copy the main application, except whatever is listed in .dockerignore. +COPY . ./ + +#RUN rake test +CMD ["rake", "test"] \ No newline at end of file diff --git a/lib/archive_extractor.rb b/lib/archive_extractor.rb new file mode 100644 index 0000000..094aa02 --- /dev/null +++ b/lib/archive_extractor.rb @@ -0,0 +1,133 @@ +# frozen_string_literal: true +require 'aws-sdk-sqs' +require 'aws-sdk-s3' +require 'fileutils' +require 'json' +require 'config' +require 'logger' + + +require_relative 'extractor/extraction' +require_relative 'extractor/extraction_status' +require_relative 'extractor/error_type' + +class ArchiveExtractor + attr_accessor :s3, :sqs, :bucket_name, :object_key, :binary_name, :web_id, :mime_type, :extraction + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV'])) + LOGGER = Logger.new(STDOUT) + + def initialize(bucket_name, object_key, binary_name, web_id, mime_type, sqs, s3) + @bucket_name = bucket_name + @object_key = object_key + @binary_name = binary_name + @web_id = web_id + @mime_type = mime_type + @sqs = sqs + @s3 = s3 + end + + def extract + begin + error = [] + + del_path = "#{Settings.aws.efs.mount_point}#{@bucket_name}_#{@web_id}" + local_path = "#{del_path}/#{@object_key}" + + dirname = File.dirname(local_path) + unless File.directory?(dirname) + FileUtils.mkdir_p(dirname) + end + + get_object(local_path, error) + + extraction = Extraction.new(@binary_name, local_path, @web_id, @mime_type) + return_value = perform_extraction(extraction, error) + s3_path = "messages/#{@web_id}.json" + s3_put_status, s3_put_error = put_json_response(return_value, s3_path) + + s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]} + + return_value = {"bucket_name" => @bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors} + send_sqs_message(return_value) + + ensure + FileUtils.rm_rf(dirname, :secure => true) + FileUtils.rm_rf(del_path, :secure => true) + end + end + + def get_object(local_path, error) + begin + @s3.get_object({ + response_target: local_path, + bucket: @bucket_name, + key: @object_key, + }) + LOGGER.info("Getting object #{@object_key} with ID #{@web_id} from #{@bucket_name}") + rescue StandardError => e + s3_error = "Error getting object #{@object_key} with ID #{@web_id} from S3 bucket #{@bucket_name}: #{e.message}" + LOGGER.error(s3_error) + error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error}) + end + return error + end + + def perform_extraction(extraction, error) + begin + extraction.process + status = extraction.status + LOGGER.info("status: #{status}") + LOGGER.error("error: #{extraction.error}") if status == ExtractionStatus::ERROR + error.concat(extraction.error) + items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } + errors = error.map {|o| Hash[o.each_pair.to_a]} + return_value = {"web_id" => @web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} + rescue StandardError => e + error.push({"task_id" => @web_id, "extraction_process_report" => "Error extracting #{@object_key} with ID #{@web_id}: #{e.message}"}) + errors = error.map {|o| Hash[o.each_pair.to_a]} + return_value = {"web_id" => @web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => nil, "nested_items" => []} + end + return return_value + end + + def send_sqs_message(return_value) + # Send a message to a queue. + queue_name = Settings.aws.sqs.queue_name + queue_url = Settings.aws.sqs.queue_url + + begin + # Create and send a message. + @sqs.send_message({ + queue_url: queue_url, + message_body: return_value.to_json, + message_attributes: {} + }) + LOGGER.info("Sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}") + rescue StandardError => e + LOGGER.error("Error sending message in queue #{queue_name} for object #{@object_key} with ID #{@web_id}: #{e.message}") + end + end + + def put_json_response(return_value, s3_path) + s3_put_error = [] + json_bucket = Settings.aws.s3.json_bucket + begin + @s3.put_object({ + body: return_value.to_json, + bucket: json_bucket, + key: s3_path, + }) + LOGGER.info(return_value.to_json) + LOGGER.info("Putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket} with key #{s3_path}") + s3_put_status = ExtractionStatus::SUCCESS + rescue StandardError => e + s3_put_status = ExtractionStatus::ERROR + s3_put_error_message = "Error putting json response for object #{@object_key} with ID #{@web_id} in S3 bucket #{json_bucket}: #{e.message}" + s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message}) + LOGGER.error(s3_put_error_message) + end + return s3_put_status, s3_put_error + end + + +end diff --git a/lib/extractor.rb b/lib/extractor.rb index c1bbe0f..8392f9e 100644 --- a/lib/extractor.rb +++ b/lib/extractor.rb @@ -1,102 +1,17 @@ require 'aws-sdk-sqs' require 'aws-sdk-s3' -require 'fileutils' -require 'json' +require 'config' -require_relative 'extractor/extraction.rb' -require_relative 'extractor/extraction_status.rb' -require_relative 'extractor/error_type.rb' +require_relative 'archive_extractor' class Extractor - def self.extract(bucket_name, object_key, binary_name, web_id, mime_type) - begin - status = ExtractionStatus::ERROR - error = Array.new - s3_put_status = ExtractionStatus::SUCCESS - s3_put_error = Array.new - local_path = "./#{binary_name}" - s3_path = "messages/#{web_id}.json" - - region = 'us-east-2' - s3_client = Aws::S3::Client.new(region: region) - del_path = "/mnt/efs/#{bucket_name}_#{web_id}" - local_path = "#{del_path}/#{object_key}" - - dirname = File.dirname(local_path) - unless File.directory?(dirname) - FileUtils.mkdir_p(dirname) - end - - begin - s3_client.get_object( - response_target: local_path, - bucket: bucket_name, - key: object_key, - ) - puts "Getting object #{object_key} with ID #{web_id} from #{bucket_name}" - rescue StandardError => e - s3_error = "Error getting object #{object_key} with ID #{web_id} from S3 bucket #{bucket_name}: #{e.message}" - error.push({"error_type" => ErrorType::S3_GET, "report" => s3_error}) - puts s3_error - end - - begin - extraction = Extraction.new(binary_name, local_path, web_id, mime_type) - extraction.process - status = extraction.status - puts "status: #{status}" - puts "error: #{extraction.error}" if status == ExtractionStatus::ERROR - error.concat(extraction.error) - items = extraction.nested_items.map { |o| Hash[o.each_pair.to_a] } - errors = error.map {|o| Hash[o.each_pair.to_a]} - return_value = {"web_id" => web_id, "status" => status, "error" => errors, "peek_type" => extraction.peek_type, "peek_text" => extraction.peek_text, "nested_items" => items} - rescue - error.push({"task_id" => web_id, "extraction_process_report" => "Error extracting #{object_key} with ID #{web_id}: #{e.message}"}) - errors = error.map {|o| Hash[o.each_pair.to_a]} - return_value = {"web_id" => web_id, "status" => ExtractionStatus::ERROR, "error" => errors, "peek_type" => PeekType::NONE, "peek_text" => null, "nested_items" => []} - end - - begin - s3_client.put_object({ - body: return_value.to_json, - bucket: "databank-main", - key: s3_path, - }) - puts "Putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name} with key #{s3_path}" - rescue StandardError => e - s3_put_status = ExtractionStatus::ERROR - s3_put_error_message = "Error putting json response for object #{object_key} with ID #{web_id} in S3 bucket #{bucket_name}: #{e.message}" - s3_put_error.push({"error_type" => ErrorType::S3_PUT, "report" => s3_put_error_message}) - puts s3_put_error_message - end - - - s3_put_errors = s3_put_error.map {|o| Hash[o.each_pair.to_a]} - return_value = {"bucket_name" => bucket_name, "object_key" => s3_path, "s3_status" => s3_put_status, "error" => s3_put_errors} - - sqs = Aws::SQS::Client.new(region: region) - - # Send a message to a queue. - queue_name = "extractor-to-databank-prod" - queue_url = sqs.get_queue_url(queue_name: queue_name).queue_url - - begin - # Create and send a message. - sqs.send_message({ - queue_url: queue_url, - message_body: return_value.to_json, - message_attributes: {} - }) - puts "Sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}" - rescue StandardError => e - puts "Error sending message in queue #{queue_name} for object #{object_key} with ID #{web_id}: #{e.message}" - end - - ensure - FileUtils.rm_rf(dirname, :secure => true) - FileUtils.rm_rf(del_path, :secure => true) - - end - end + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", ENV['RUBY_ENV'])) + def self.extract(bucket_name, object_key, binary_name, web_id, mime_type) + region = Settings.aws.region + s3_client = Aws::S3::Client.new(region: region) + sqs = Aws::SQS::Client.new(region: region) + archive_extractor = ArchiveExtractor.new(bucket_name, object_key, binary_name, web_id, mime_type, sqs, s3_client) + archive_extractor.extract + end end \ No newline at end of file diff --git a/lib/extractor/extraction.rb b/lib/extractor/extraction.rb index 168782f..07da439 100644 --- a/lib/extractor/extraction.rb +++ b/lib/extractor/extraction.rb @@ -5,49 +5,52 @@ require 'mimemagic/overlay' require 'zip' require 'zlib' -require 'libarchive' +require 'ffi-libarchive' require 'rubygems/package' +require 'config' +require 'logger' -require_relative 'extraction_status.rb' -require_relative 'peek_type.rb' -require_relative 'error_type.rb' -require_relative 'mime_type.rb' +require_relative 'extraction_status' +require_relative 'extraction_type' +require_relative 'peek_type' +require_relative 'error_type' +require_relative 'mime_type' class Extraction attr_accessor :binary_name, :storage_path, :status, :peek_type, :peek_text, :id, :nested_items, :error, :mime_type + ALLOWED_CHAR_NUM = 1024 * 8 + ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8 + LOGGER = Logger.new(STDOUT) def initialize(binary_name, storage_path, id, mime_type) - @nested_items = Array.new @binary_name = binary_name @storage_path = storage_path @id = id - @error = Array.new @mime_type = mime_type + @nested_items = [] + @error = [] end - ALLOWED_CHAR_NUM = 1024 * 8 - ALLOWED_DISPLAY_BYTES = ALLOWED_CHAR_NUM * 8 - def process begin features_extracted = extract_features if features_extracted - self.status = ExtractionStatus::SUCCESS + @status = ExtractionStatus::SUCCESS else - self.status = ExtractionStatus::ERROR + @status = ExtractionStatus::ERROR end rescue StandardError => error - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE report_problem(error.message) ensure - if self.peek_text && self.peek_text.encoding.name != 'UTF-8' + if @peek_text && @peek_text.encoding.name != 'UTF-8' begin - self.peek_text.encode('UTF-8') + @peek_text.encode('UTF-8') rescue Encoding::UndefinedConversionError - self.peek_text = nil - self.peek_type = PeekType::NONE + @peek_text = nil + @peek_type = PeekType::NONE report_problem('invalid encoding for peek text') rescue Exception => ex report_problem("invalid encoding and problem character: #{ex.class}, #{ex.message}") @@ -57,7 +60,7 @@ def process end def report_problem(report) - self.error.push({"error_type" => ErrorType::EXTRACTION, "report" => report}) + @error.push({"error_type" => ErrorType::EXTRACTION, "report" => report}) end def extract_features @@ -73,10 +76,9 @@ def extract_features else return extract_default end - end - def self.mime_from_path(path) + def mime_from_path(path) file = File.open("#{path}") file_mime_response = MimeMagic.by_path(file).to_s file.close @@ -94,7 +96,7 @@ def self.mime_from_path(path) end end - def self.mime_from_filename(filename) + def mime_from_filename(filename) mime_guesses = MIME::Types.type_for(filename).first.content_type if mime_guesses.length > 0 mime_guesses @@ -104,79 +106,27 @@ def self.mime_from_filename(filename) end def create_item(item_path, item_name, item_size, media_type, is_directory) - item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type, "is_directory" => is_directory} + item = {"item_name" => item_name, "item_path" => item_path, "item_size" => item_size, "media_type" => media_type, + "is_directory" => is_directory} @nested_items.push(item) - end def extract_zip begin - puts "Extracting zip file #{binary_name}" + LOGGER.info("Extracting zip file #{@binary_name}") entry_paths = [] - Zip::File.open(self.storage_path) do |zip_file| + Zip::File.open(@storage_path) do |zip_file| zip_file.each do |entry| - if entry.name_safe? - - entry_path = valid_entry_path(entry.name) - - - if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) - - entry_paths << entry_path - - if is_directory(entry.name) - - create_item(entry_path, - name_part(entry_path), - entry.size, - 'directory', - true) - - else - - storage_dir = File.dirname(storage_path) - extracted_entry_path = File.join(storage_dir, entry_path) - extracted_entry_dir = File.dirname(extracted_entry_path) - FileUtils.mkdir_p extracted_entry_dir - - raise Exception.new("extracted entry somehow already there?!!?!") if File.exist?(extracted_entry_path) - - entry.extract(extracted_entry_path) - - raise Exception.new("extracting entry not working!") unless File.exist?(extracted_entry_path) - - mime_guess = Extraction.mime_from_path(extracted_entry_path) || - Extraction.mime_from_filename(entry.name) || - 'application/octet-stream' - - create_item(entry_path, - name_part(entry_path), - entry.size, - mime_guess, - false) - File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) - end - - - end + entry_paths = extract_entry(entry, entry.name, entry_paths, ExtractionType::ZIP) end end end - - - if entry_paths.length > 0 - self.peek_type = PeekType::LISTING - self.peek_text = entry_paths_arr_to_html(entry_paths) - else - self.peek_type = PeekType::NONE - report_problem("no items found for zip listing for task #{self.id}") - end - + handle_entry_paths(entry_paths) return true rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE report_problem("problem extracting zip listing for task: #{ex.message}") raise ex @@ -185,161 +135,112 @@ def extract_zip def extract_archive begin - puts "Extracting archive file #{binary_name}" + LOGGER.info("Extracting archive file #{@binary_name}") entry_paths = [] - Archive.read_open_filename(self.storage_path) do |ar| while entry = ar.next_header - - entry_path = valid_entry_path(entry.pathname) - - if entry_path - - if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) - entry_paths << entry_path - - if entry.directory? || is_directory(entry.pathname) - - create_item(entry_path, - name_part(entry_path), - entry.size, - 'directory', - true) - else - - storage_dir = File.dirname(storage_path) - extracted_entry_path = File.join(storage_dir, entry_path) - extracted_entry_dir = File.dirname(extracted_entry_path) - FileUtils.mkdir_p extracted_entry_dir - - file = File.open(extracted_entry_path, 'wb') - - raise("extracting non-zip entry not working!") unless File.exist?(extracted_entry_path) - - mime_guess = Extraction.mime_from_path(extracted_entry_path) || - mime_from_filename(entry.name) || - 'application/octet-stream' - - - create_item(entry_path, - name_part(entry_path), - entry.size, - mime_guess, - false) - file.close - File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) - end - - end - - end + entry_paths = extract_entry(entry, entry.pathname, entry_paths, ExtractionType::ARCHIVE) end end - - if entry_paths.length > 0 - self.peek_type = PeekType::LISTING - self.peek_text = entry_paths_arr_to_html(entry_paths) - return true - else - self.peek_type = PeekType::NONE - report_problem("no items found for archive listing for task #{self.id}") - return false - end + handle_entry_paths(entry_paths) rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE - - report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}") + LOGGER.error(ex) + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE + report_problem("problem extracting extract listing for task #{@id}: #{ex.message}") return false end end def extract_gzip begin - puts "Extracting gzip file #{binary_name}" + LOGGER.info("Extracting gzip file #{@binary_name}") entry_paths = [] - tar_extract = Gem::Package::TarReader.new(Zlib::GzipReader.open(self.storage_path)) - tar_extract.rewind # The extract has to be rewinded after every iteration + tar_extract = Gem::Package::TarReader.new(Zlib::GzipReader.open(@storage_path)) + tar_extract.rewind # The extract has to be rewound after every iteration tar_extract.each do |entry| + entry_paths = extract_entry(entry, entry.full_name, entry_paths, ExtractionType::GZIP) + end + handle_entry_paths(entry_paths) - entry_path = valid_entry_path(entry.full_name) - if entry_path - - if !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) -# puts entry.full_name - - entry_paths << entry_path - - if entry.directory? - - create_item(entry_path, - name_part(entry_path), - entry.size, - 'directory', - true) - else - - storage_dir = File.dirname(storage_path) - extracted_entry_path = File.join(storage_dir, entry_path) - extracted_entry_dir = File.dirname(extracted_entry_path) - FileUtils.mkdir_p extracted_entry_dir - - file = File.open(extracted_entry_path, 'wb') - - raise("extracting gzip entry not working!") unless File.exist?(extracted_entry_path) - - mime_guess = Extraction.mime_from_path(extracted_entry_path) || - mime_from_filename(entry.name) || - 'application/octet-stream' + rescue StandardError => ex + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE + report_problem("problem extracting extract listing for task #{@id}: #{ex.message}") + return false + end + ensure + tar_extract.close + end - create_item(entry_path, - name_part(entry_path), - entry.size, - mime_guess, - false) - file.close - File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) - end + def extract_entry(entry, entry_name, entry_paths, type) + entry_path = valid_entry_path(entry_name) + if entry_path && !is_ds_store(entry_path) && !is_mac_thing(entry_path) && !is_mac_tar_thing(entry_path) + entry_paths << entry_path + if entry.directory? || is_directory(entry_name) + create_item(entry_path, + name_part(entry_path), + entry.size, + 'directory', + true) + else + storage_dir = File.dirname(storage_path) + extracted_entry_path = File.join(storage_dir, entry_path) + extracted_entry_dir = File.dirname(extracted_entry_path) + FileUtils.mkdir_p extracted_entry_dir - end + raise Exception.new("extracted entry somehow already there?!!?!") if File.exist?(extracted_entry_path) + file = nil + case type + when ExtractionType::ZIP + entry.extract(extracted_entry_path) + else + file = File.open(extracted_entry_path, 'wb') end + raise("extracting #{type} entry not working!") unless File.exist?(extracted_entry_path) + + mime_guess = mime_from_path(extracted_entry_path) || + mime_from_filename(entry_name) || + 'application/octet-stream' + + create_item(entry_path, + name_part(entry_path), + entry.size, + mime_guess, + false) + file.close if file + File.delete(extracted_entry_path) if File.exist?(extracted_entry_path) end + end + entry_paths + end - if entry_paths.length > 0 - self.peek_type = PeekType::LISTING - self.peek_text = entry_paths_arr_to_html(entry_paths) - return true - else - self.peek_type = PeekType::NONE - report_problem("no items found for archive listing for task #{self.id}") - return false - end - - rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE - - report_problem("problem extracting extract listing for task #{self.id}: #{ex.message}") + def handle_entry_paths(entry_paths) + if entry_paths.length > 0 + @peek_type = PeekType::LISTING + @peek_text = entry_paths_arr_to_html(entry_paths) + puts @peek_text + return true + else + @peek_type = PeekType::NONE + report_problem("no items found for archive listing for task #{@id}") return false - - tar_extract.close end - end def extract_default - puts "Default extraction for #{binary_name}" + LOGGER.info("Default extraction for #{@binary_name}") begin - self.peek_type = PeekType::NONE + @peek_type = PeekType::NONE return true rescue StandardError => ex - self.status = ExtractionStatus::ERROR - self.peek_type = PeekType::NONE - report_problem("problem creating default peek for task #{self.id}") + @status = ExtractionStatus::ERROR + @peek_type = PeekType::NONE + report_problem("problem creating default peek for task #{@id}: #{ex}") return false end end @@ -389,7 +290,7 @@ def name_part(path) def entry_paths_arr_to_html(entry_paths) return_string = ' ' - return_string << self.binary_name + return_string << @binary_name entry_paths.each do |entry_path| diff --git a/lib/extractor/extraction_type.rb b/lib/extractor/extraction_type.rb new file mode 100644 index 0000000..cbe1283 --- /dev/null +++ b/lib/extractor/extraction_type.rb @@ -0,0 +1,6 @@ +# frozen_string_literal: true +class ExtractionType + ZIP = 'zip' + GZIP = 'gzip' + ARCHIVE = 'archive' +end diff --git a/test/archive_extractor_test.rb b/test/archive_extractor_test.rb new file mode 100644 index 0000000..1ed4313 --- /dev/null +++ b/test/archive_extractor_test.rb @@ -0,0 +1,158 @@ +# frozen_string_literal: true +require_relative 'test_helper' + +class TestArchiveExtractor < Minitest::Test + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test')) + def setup + bucket_name = 'test-bucket' + object_key = 'test-key' + binary_name = 'test' + web_id = 'test-id' + mime_type = 'application/zip' + @sqs = Minitest::Mock.new + @s3 = Minitest::Mock.new + @archive_extractor = ArchiveExtractor.new(bucket_name, object_key, binary_name, web_id, mime_type, @sqs, @s3) + end + + def test_extract + # setup + @archive_extractor.binary_name = 'test.zip' + @archive_extractor.web_id = 'test-zip' + @archive_extractor.mime_type = 'application/zip' + @archive_extractor.object_key = 'test.zip' + del_path = "#{Settings.aws.efs.mount_point}#{@archive_extractor.bucket_name}_#{@archive_extractor.web_id}" + local_path = "#{del_path}/#{@archive_extractor.object_key}" + file_path = "#{ENV['RUBY_HOME']}/test/test.zip" + dirname = File.dirname(local_path) + unless File.directory?(dirname) + FileUtils.mkdir_p(dirname) + end + FileUtils.cp(file_path, local_path) + @s3.expect(:get_object, nil, [{response_target: local_path, bucket: @archive_extractor.bucket_name, + key: @archive_extractor.object_key}]) + peek_text = " test.zip
test.txt
" + items = [{'item_name' => 'test.txt', 'item_path' => 'test.txt', 'item_size' => 12, 'media_type' => 'text/plain', 'is_directory' => false}] + return_value = {'web_id' => 'test-zip', 'status' => ExtractionStatus::SUCCESS, 'error' => [], 'peek_type' => PeekType::LISTING, 'peek_text' => peek_text, 'nested_items' => items} + s3_path = 'messages/test-zip.json' + @s3.expect(:put_object, [], [{body: return_value.to_json, bucket: Settings.aws.s3.json_bucket, key: s3_path}]) + return_value = {'bucket_name' => 'test-bucket', 'object_key' => s3_path, 's3_status' => ExtractionStatus::SUCCESS, 'error' => []} + @sqs.expect(:send_message, nil, [{queue_url: Settings.aws.sqs.queue_url, + message_body: return_value.to_json, + message_attributes:{}}]) + + # test + @archive_extractor.extract + + # verify + assert_mock(@s3) + assert_mock(@sqs) + end + + def test_get_object + # setup + local_path = 'test/path' + @s3.expect(:get_object, nil, [{response_target: local_path, bucket: @archive_extractor.bucket_name, + key: @archive_extractor.object_key}]) + # test + error = @archive_extractor.get_object(local_path, []) + + # verify + assert_mock(@s3) + assert_empty(error) + end + + def test_get_object_error + # setup + stub_s3 = Aws::S3::Client.new(region: Settings.aws.region) + @archive_extractor.s3 = stub_s3 + local_path = "test/path" + raises_exception = -> { raise StandardError.new } + + # test and verify + stub_s3.stub :get_object, raises_exception do + error = @archive_extractor.get_object(local_path, []) + assert(error.first.value?(ErrorType::S3_GET)) + end + end + + def test_perform_extraction + # setup + binary_name = 'test.zip' + web_id = 'test-zip' + mime_type = 'application/zip' + local_path = "#{ENV['RUBY_HOME']}/test/test.zip" + extraction = Extraction.new(binary_name, local_path, web_id, mime_type) + + #test + return_value = @archive_extractor.perform_extraction(extraction, []) + + # verify + assert(return_value.value?(PeekType::LISTING)) + exp_peek_text = " test.zip
test.txt
" + assert(return_value.value?(exp_peek_text)) + + end + + def test_perform_extraction_error + # setup + binary_name = 'test.zip' + web_id = 'test-zip' + mime_type = 'application/zip' + local_path = "#{ENV['RUBY_HOME']}/test/test.zip" + stub_extraction = Extraction.new(binary_name, local_path, web_id, mime_type) + raises_exception = -> { raise StandardError.new } + + # test and verify + stub_extraction.stub :process, raises_exception do + return_value = @archive_extractor.perform_extraction(stub_extraction, []) + assert(return_value.value?(PeekType::NONE)) + assert(return_value.value?(ExtractionStatus::ERROR)) + end + end + + def test_send_sqs_message + # setup + return_value = {'test' => 'retVal'} + @sqs.expect(:send_message, nil, [{queue_url: Settings.aws.sqs.queue_url, + message_body: return_value.to_json, + message_attributes:{}}]) + + # test + @archive_extractor.send_sqs_message(return_value) + + # verify + assert_mock(@sqs) + end + + def test_put_json_response + # setup + return_value = {'test' => 'retVal'} + s3_path = 'test/s3/key' + @s3.expect(:put_object, nil, [{body: return_value.to_json, bucket: Settings.aws.s3.json_bucket, key: s3_path}]) + + # test + s3_put_status, s3_put_error = @archive_extractor.put_json_response(return_value, s3_path) + + # verify + assert_mock(@s3) + assert_equal(ExtractionStatus::SUCCESS, s3_put_status) + assert_empty(s3_put_error) + end + + def test_put_json_response_error + # setup + return_value = {'test' => 'error'} + s3_path = 'test/s3/error' + stub_s3 = Aws::S3::Client.new(region: Settings.aws.region) + @archive_extractor.s3 = stub_s3 + raises_exception = -> { raise StandardError.new } + + # test and verify + stub_s3.stub :put_object, raises_exception do + s3_put_status, s3_put_error = @archive_extractor.put_json_response(return_value, s3_path) + assert_equal(ExtractionStatus::ERROR, s3_put_status) + assert(!s3_put_error.empty?) + end + end +end + diff --git a/test/extraction_test.rb b/test/extraction_test.rb new file mode 100644 index 0000000..7af66de --- /dev/null +++ b/test/extraction_test.rb @@ -0,0 +1,356 @@ +# frozen_string_literal: true +require_relative 'test_helper' + +class TestExtraction < Minitest::Test + Config.load_and_set_settings(Config.setting_files("#{ENV['RUBY_HOME']}/config", 'test')) + def setup + binary_name = 'test-binary' + web_id = 'test-id' + storage_path = "#{Settings.aws.efs.mount_point}test-bucket_#{web_id}/test-key" + mime_type = 'application/zip' + @extraction = Extraction.new(binary_name, storage_path, web_id, mime_type) + end + + def test_process + # setup + @extraction.binary_name = 'test.txt.gz' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz" + @extraction.id = 'test-gzip' + @extraction.mime_type = 'application/gzip' + + # test + @extraction.process + + # verify + assert_equal(ExtractionStatus::SUCCESS, @extraction.status) + assert_equal(PeekType::LISTING, @extraction.peek_type) + end + + def test_report_problem + # setup + report = 'Test report' + + # test + @extraction.report_problem(report) + + # verify + error = @extraction.error + assert_equal(true, error.include?({'error_type' => ErrorType::EXTRACTION, 'report' => report})) + end + + def test_extract_features_gzip + # setup + @extraction.binary_name = 'test.txt.gz' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz" + @extraction.id = 'test-gzip' + @extraction.mime_type = 'application/gzip' + + # test + @extraction.extract_features + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.txt.gz
testing\n
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_features_zip + # setup + @extraction.binary_name = 'test.zip' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip" + @extraction.id = 'test-zip' + @extraction.mime_type = 'application/zip' + + # test + @extraction.extract_features + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.zip
test.txt
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_features_default + # setup + @extraction.binary_name = 'test' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test" + @extraction.id = 'test-default' + @extraction.mime_type = 'application/directory' + + # test + @extraction.extract_features + + # verify + assert_equal(PeekType::NONE, @extraction.peek_type) + end + + def test_mime_from_path + # setup + ruby_path = "#{ENV['RUBY_HOME']}/bin/set-test-vars.rb" + + # test + ruby_mime = @extraction.mime_from_path(ruby_path) + + # verify + assert_equal('application/x-ruby', ruby_mime) + end + + def test_mime_from_filename + # setup + zip_filename = 'test.zip' + + # test + zip_mime = @extraction.mime_from_filename(zip_filename) + + # verify + assert_equal('application/zip', zip_mime) + end + + def test_create_item + # setup + item_path = 'test/item/path/thing' + item_name = 'thing' + item_size = 123 + media_type = 'directory' + is_directory = true + + # test + @extraction.create_item(item_path, item_name, item_size, media_type, is_directory) + + # verify + nested_items = @extraction.nested_items + assert(nested_items.include?({'item_name' => item_name, 'item_path' => item_path, 'item_size' => item_size, + 'media_type' => media_type, 'is_directory' => is_directory})) + end + + def test_extract_zip + # setup + @extraction.binary_name = 'test.zip' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.zip" + @extraction.id = 'test-zip' + @extraction.mime_type = 'application/zip' + + # test + @extraction.extract_zip + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.zip
test.txt
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_archive + # setup + @extraction.binary_name = 'test.tar' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.tar" + @extraction.id = 'test-tar' + @extraction.mime_type = 'application/x-tar' + @extraction.peek_type = nil + + # test + @extraction.extract_archive + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.tar
test.txt
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_gzip + # setup + @extraction.binary_name = 'test.txt.gz' + @extraction.storage_path = "#{ENV['RUBY_HOME']}/test/test.txt.gz" + @extraction.id = 'test-gzip' + @extraction.mime_type = 'application/gzip' + + # test + @extraction.extract_gzip + + # verify + assert_equal(PeekType::LISTING, @extraction.peek_type) + exp_peek_text = " test.txt.gz
testing\n
" + assert_equal(exp_peek_text, @extraction.peek_text) + end + + def test_extract_entry + # setup + mock_entry = Minitest::Mock.new + entry_name = "#{ENV['RUBY_HOME']}/bin/set-test-vars.rb" + type = ExtractionType::GZIP + mock_entry.expect(:directory?, false) + mock_entry.expect(:size, 123) + + # test + entry_paths = @extraction.extract_entry(mock_entry, entry_name, [], type) + + # verify + assert_mock(mock_entry) + assert(entry_paths.include?(entry_name)) + expect_item = {'item_name' => 'set-test-vars.rb', 'item_path' => entry_name, 'item_size' => 123, + 'media_type' => 'application/x-ruby', 'is_directory' => false} + assert(@extraction.nested_items.include?(expect_item)) + + end + + def test_handle_entry_paths + # setup + entry_paths = ['test/path'] + + # test + resp = @extraction.handle_entry_paths(entry_paths) + + # verify + assert(resp) + exp_peek_text = " test-binary
path
" + assert_equal(exp_peek_text, @extraction.peek_text) + assert_equal(PeekType::LISTING, @extraction.peek_type) + end + + def test_handle_entry_paths_empty + # setup + entry_paths = [] + + # test + resp = @extraction.handle_entry_paths(entry_paths) + + # verify + assert_equal(false, resp) + assert_equal(PeekType::NONE, @extraction.peek_type) + assert(@extraction.error.include?({'error_type' => ErrorType::EXTRACTION, + 'report' => "no items found for archive listing for task #{@extraction.id}"})) + end + + def test_extract_default + # test + @extraction.extract_default + # verify + peek_type = @extraction.peek_type + assert_equal(PeekType::NONE, peek_type) + end + + def test_valid_entry_path + # setup + valid_path = 'test/path' + invalid_path = "" + + # test + path = @extraction.valid_entry_path(valid_path) + path_slash = @extraction.valid_entry_path("#{valid_path}/") + path_nil = @extraction.valid_entry_path(invalid_path) + + # verify + assert_equal(valid_path, path) + assert_equal(valid_path, path_slash) + assert_nil(path_nil) + end + + def test_is_directory + # setup + ruby_home = ENV['RUBY_HOME'] + object_path = 'test/path' + slash_path = 'test/path/' + mac_path = 'this/is/a/mac/._path' + ds_store_path = 'test/path/.DS_Store' + + # test + ruby_home_dir = @extraction.is_directory(ruby_home) + object_path_dir = @extraction.is_directory(object_path) + slash_path_dir = @extraction.is_directory(slash_path) + mac_path_dir = @extraction.is_directory(mac_path) + ds_store_path_dir = @extraction.is_directory(ds_store_path) + + # verify + assert_equal(true, ruby_home_dir) + assert_equal(true, slash_path_dir) + assert_equal(false, object_path_dir) + assert_equal(false, mac_path_dir) + assert_equal(false, ds_store_path_dir) + end + + def test_is_mac_thing + # setup + mac_path = 'this/is/a/mac/path/__MACOSX' + path = 'this/is/not/a/mac/path' + # test + mac = @extraction.is_mac_thing(mac_path) + not_mac = @extraction.is_mac_thing(path) + # verify + assert_equal(true, mac) + assert_equal(false, not_mac) + end + + def test_is_mac_tar_thing + # setup + mac_path = 'this/is/a/mac/._path' + paxheader_mac_path = 'PaxHeader/this/is/a/mac/path' + longlink_mac_path = 'this/is/a/mac/path/@LongLink' + path = 'this/is/not/a/mac/path' + # test + mac_underscore = @extraction.is_mac_tar_thing(mac_path) + mac_paxheader = @extraction.is_mac_tar_thing(paxheader_mac_path) + mac_longlink = @extraction.is_mac_tar_thing(longlink_mac_path) + not_mac = @extraction.is_mac_tar_thing(path) + # verify + assert_equal(true, mac_underscore) + assert_equal(true, mac_paxheader) + assert_equal(true, mac_longlink) + assert_equal(false, not_mac) + end + + def test_ends_in_slash + # setup + path_ends_in_slash = 'test/path/' + path_does_not_end_in_slash = 'test/path' + + # test + ends_in_slash = @extraction.ends_in_slash(path_ends_in_slash) + does_not_end_in_slash = @extraction.ends_in_slash(path_does_not_end_in_slash) + + # verify + assert_equal(true, ends_in_slash) + assert_equal(false, does_not_end_in_slash) + end + + def test_is_ds_store + # setup + ds_store_path = 'test/path/.DS_Store' + path = 'test/path' + + # test + ds_store = @extraction.is_ds_store(ds_store_path) + not_ds_store = @extraction.is_ds_store(path) + + # verify + assert_equal(true, ds_store) + assert_equal(false, not_ds_store) + end + + def test_name_part + # setup + path = 'test/path' + name = 'test' + invalid_path = "" + + # test + path_name = @extraction.name_part(path) + test_name = @extraction.name_part(name) + invalid_name = @extraction.name_part(invalid_path) + + # verify + assert_equal('path', path_name) + assert_equal('test', test_name) + assert_nil(invalid_name) + end + + def test_entry_paths_arr_to_html + # setup + entry_paths = ['test/path'] + + # test + return_string = @extraction.entry_paths_arr_to_html(entry_paths) + + # verify + exp_peek_text = " test-binary
path
" + assert_equal(exp_peek_text, return_string) + end +end diff --git a/test/test.tar b/test/test.tar new file mode 100644 index 0000000..95dc7f6 Binary files /dev/null and b/test/test.tar differ diff --git a/test/test.txt.gz b/test/test.txt.gz new file mode 100644 index 0000000..4f821bc Binary files /dev/null and b/test/test.txt.gz differ diff --git a/test/test.zip b/test/test.zip new file mode 100644 index 0000000..d048dc0 Binary files /dev/null and b/test/test.zip differ diff --git a/test/test_helper.rb b/test/test_helper.rb new file mode 100644 index 0000000..ec88121 --- /dev/null +++ b/test/test_helper.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +require 'simplecov' +SimpleCov.start + +require 'minitest/autorun' +require 'config' +require 'csv' +require 'json' +require_relative '../lib/archive_extractor' +require_relative '../lib/extractor' +require_relative '../lib/extractor/error_type' +require_relative '../lib/extractor/extraction' +require_relative '../lib/extractor/extraction_status' +require_relative '../lib/extractor/extraction_type' +require_relative '../lib/extractor/mime_type' +require_relative '../lib/extractor/peek_type' \ No newline at end of file