Skip to content

Commit

Permalink
WIP: storage abstraction
Browse files Browse the repository at this point in the history
  • Loading branch information
elohanlon committed Sep 11, 2024
1 parent 1dc9502 commit 77781bf
Show file tree
Hide file tree
Showing 28 changed files with 528 additions and 197 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ term_additional_fields.yml
resque.yml
redis.yml
ezid.yml
aws.yml
gcp.yml

*_user_accounts.yml

Expand Down
9 changes: 8 additions & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ gem 'best_type', git: 'https://github.com/cul/best_type.git', branch: 'LDPD-415-
gem 'active_fedora_relsint', git: 'https://github.com/cul/active_fedora_relsint', ref: '91114c78c9af344673f1e899624031da79b72693'

# URI Escaping
gem 'addressable', '~> 2.7.0'
gem 'addressable', '~> 2.8.0'

# Use resque for background jobs
# We're pinning resque to 1.26.x because 1.27 does an eager load operation
Expand Down Expand Up @@ -149,6 +149,13 @@ gem 'rack-protection', '>= 1.5.5'
gem 'loofah', '~> 2.20.0'
gem 'rails-html-sanitizer', '>= 1.2'

# Amazon S3 SDK
gem 'aws-sdk-s3', '~> 1'
# Additional gem enabling the AWS SDK to calculate CRC32C checksums
gem 'aws-crt', '~> 0.2.0'
# Google Cloud Storage SDK
gem 'google-cloud-storage', '~> 1.49'

# Development and testing!
group :development, :test do
gem 'byebug'
Expand Down
81 changes: 77 additions & 4 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,33 @@ GEM
minitest (>= 5.1)
tzinfo (~> 2.0)
zeitwerk (~> 2.3)
addressable (2.7.0)
public_suffix (>= 2.0.2, < 5.0)
addressable (2.8.7)
public_suffix (>= 2.0.2, < 7.0)
airbrussh (1.5.0)
sshkit (>= 1.6.1, != 1.7.0)
ast (2.4.2)
attr_encrypted (4.0.0)
encryptor (~> 3.0.0)
autoprefixer-rails (10.4.16.0)
execjs (~> 2)
aws-crt (0.2.1)
ffi
aws-eventstream (1.3.0)
aws-partitions (1.961.0)
aws-sdk-core (3.201.3)
aws-eventstream (~> 1, >= 1.3.0)
aws-partitions (~> 1, >= 1.651.0)
aws-sigv4 (~> 1.8)
jmespath (~> 1, >= 1.6.1)
aws-sdk-kms (1.88.0)
aws-sdk-core (~> 3, >= 3.201.0)
aws-sigv4 (~> 1.5)
aws-sdk-s3 (1.157.0)
aws-sdk-core (~> 3, >= 3.201.0)
aws-sdk-kms (~> 1)
aws-sigv4 (~> 1.5)
aws-sigv4 (1.9.1)
aws-eventstream (~> 1, >= 1.0.2)
base64 (0.2.0)
bcrypt (3.1.20)
bcrypt_pbkdf (1.1.0)
Expand Down Expand Up @@ -181,6 +199,7 @@ GEM
crass (1.0.6)
daemons (1.4.1)
date (3.3.4)
declarative (0.0.20)
deprecation (0.99.0)
activesupport
devise (4.9.3)
Expand All @@ -191,6 +210,8 @@ GEM
warden (~> 1.2.3)
diff-lcs (1.5.0)
diffy (3.4.2)
digest-crc (0.6.5)
rake (>= 12.0.0, < 14.0.0)
domain_name (0.6.20231109)
dry-cli (1.0.0)
ebnf (1.0.0)
Expand Down Expand Up @@ -222,6 +243,40 @@ GEM
railties (>= 3.2, < 8.0)
globalid (1.2.1)
activesupport (>= 6.1)
google-apis-core (0.15.1)
addressable (~> 2.5, >= 2.5.1)
googleauth (~> 1.9)
httpclient (>= 2.8.3, < 3.a)
mini_mime (~> 1.0)
mutex_m
representable (~> 3.0)
retriable (>= 2.0, < 4.a)
google-apis-iamcredentials_v1 (0.21.0)
google-apis-core (>= 0.15.0, < 2.a)
google-apis-storage_v1 (0.41.0)
google-apis-core (>= 0.15.0, < 2.a)
google-cloud-core (1.7.0)
google-cloud-env (>= 1.0, < 3.a)
google-cloud-errors (~> 1.0)
google-cloud-env (2.1.1)
faraday (>= 1.0, < 3.a)
google-cloud-errors (1.4.0)
google-cloud-storage (1.52.0)
addressable (~> 2.8)
digest-crc (~> 0.4)
google-apis-core (~> 0.13)
google-apis-iamcredentials_v1 (~> 0.18)
google-apis-storage_v1 (~> 0.38)
google-cloud-core (~> 1.6)
googleauth (~> 1.9)
mini_mime (~> 1.0)
googleauth (1.11.0)
faraday (>= 1.0, < 3.a)
google-cloud-env (~> 2.1)
jwt (>= 1.4, < 3.0)
multi_json (~> 1.11)
os (>= 0.9, < 2.0)
signet (>= 0.16, < 2.a)
haml (5.2.2)
temple (>= 0.8.0)
tilt
Expand All @@ -239,6 +294,7 @@ GEM
jbuilder (2.11.5)
actionview (>= 5.0.0)
activesupport (>= 5.0.0)
jmespath (1.6.2)
jquery-rails (4.6.0)
rails-dom-testing (>= 1, < 3)
railties (>= 4.2.0)
Expand All @@ -249,6 +305,8 @@ GEM
jquery-ui-rails (~> 5.0)
railties (>= 3.1)
json (2.7.1)
jwt (2.8.2)
base64
kaminari (1.2.2)
activesupport (>= 4.1.0)
kaminari-actionview (= 1.2.2)
Expand Down Expand Up @@ -292,6 +350,7 @@ GEM
multi_json (1.15.0)
mustermann (3.0.0)
ruby2_keywords (~> 0.0.1)
mutex_m (0.2.0)
mysql2 (0.5.5)
net-http (0.4.0)
uri
Expand Down Expand Up @@ -323,12 +382,13 @@ GEM
nokogiri (>= 1.4.2)
solrizer (~> 3.3)
orm_adapter (0.5.0)
os (1.1.4)
parallel (1.24.0)
parser (3.2.2.4)
ast (~> 2.4.1)
racc
psych (3.3.4)
public_suffix (4.0.7)
public_suffix (5.1.1)
puma (5.6.7)
nio4r (~> 2.0)
racc (1.7.3)
Expand Down Expand Up @@ -399,6 +459,10 @@ GEM
redis-namespace (1.8.2)
redis (>= 3.0.4)
regexp_parser (2.8.3)
representable (3.2.0)
declarative (< 0.1.0)
trailblazer-option (>= 0.1.1, < 0.2.0)
uber (< 0.2.0)
responders (3.1.1)
actionpack (>= 5.2)
railties (>= 5.2)
Expand Down Expand Up @@ -478,6 +542,11 @@ GEM
websocket (~> 1.0)
sequel (5.75.0)
bigdecimal
signet (0.19.0)
addressable (~> 2.8)
faraday (>= 0.17.5, < 3.a)
jwt (>= 1.5, < 3.0)
multi_json (~> 1.10)
sinatra (3.1.0)
mustermann (~> 3.0)
rack (~> 2.2, >= 2.2.4)
Expand Down Expand Up @@ -523,6 +592,7 @@ GEM
thread (0.2.2)
tilt (2.3.0)
timeout (0.4.1)
trailblazer-option (0.1.2)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
uber (0.0.15)
Expand Down Expand Up @@ -567,9 +637,11 @@ PLATFORMS
DEPENDENCIES
active-fedora (= 8.6.0)
active_fedora_relsint!
addressable (~> 2.7.0)
addressable (~> 2.8.0)
attr_encrypted (>= 1.3.3)
autoprefixer-rails
aws-crt (~> 0.2.0)
aws-sdk-s3 (~> 1)
best_type!
bootsnap
bootstrap-sass (~> 3.4.1)
Expand All @@ -590,6 +662,7 @@ DEPENDENCIES
factory_bot_rails (~> 4.9)
faker
font-awesome-rails (~> 4.7.0)
google-cloud-storage (~> 1.49)
jbuilder
jquery-rails (>= 4.0.4)
jquery-ui-rails
Expand Down
35 changes: 30 additions & 5 deletions app/controllers/concerns/hyacinth/digital_objects/downloads.rb
Original file line number Diff line number Diff line change
@@ -1,16 +1,41 @@
module Hyacinth::DigitalObjects::Downloads
include ActionController::Live

# Streams the file behind an Asset's 'content' datastream to the client.
#
# Digital objects that are not Assets receive a plain-text explanation
# instead of a file. Requests carrying an HTTP Range header are rejected
# rather than served. The response stream is closed in every case,
# including early returns and errors raised while streaming.
def download
  unless @digital_object.is_a?(DigitalObject::Asset)
    render plain: @digital_object.digital_object_type.display_label.pluralize + ' do not have download URLs. Try downloading an Asset instead.'
    return
  end

  # This endpoint should not support range requests. Answer with an error
  # status so that clients do not mistake the message for the file's content.
  if request.headers['Range'].present?
    render plain: 'This endpoint does not allow range requests (using the http Range header).', status: :bad_request
    return
  end

  storage_object = Hyacinth::Storage.storage_object_for(
    Hyacinth::Utils::PathUtils.ds_location_to_decoded_location_uri(
      @digital_object.fedora_object.datastreams['content'].dsLocation
    )
  )

  response.status = 200
  # Header values are written as strings; the size is converted explicitly.
  response.headers['Content-Length'] = storage_object.size.to_s
  response.headers['Content-Type'] = storage_object.content_type
  response.headers['Content-Disposition'] = label_to_content_disposition(storage_object.filename, true)
  # Setting the Last-Modified header works around a streaming bug reported
  # against Rails < 7.1 with the rack 2.2.x gem:
  # https://github.com/rack/rack/issues/1619#issuecomment-1510031078
  response.headers['Last-Modified'] = Time.now.httpdate

  storage_object.read do |chunk|
    response.stream.write(chunk)
    # Prevent server instance from sleeping forever if client disconnects during download.
    # See: https://gist.github.com/njakobsen/6257887
    # A value of 0.1 seems to be more reliable than smaller values.
    sleep 0.1
  end
ensure
  # Always close the stream, even if the client disconnects early.
  response.stream.close
end

def download_service_copy
Expand Down
2 changes: 1 addition & 1 deletion app/controllers/concerns/hyacinth/projects_behavior.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def set_project
# Never trust parameters from the scary internet, only allow the white list through.
def project_params
params.require(:project).permit(
:id, :uri, :display_label, :short_label, :string_key, :pid_generator_id, :full_path_to_custom_asset_directory, :primary_publish_target_pid,
:id, :uri, :display_label, :short_label, :string_key, :pid_generator_id, :full_path_to_custom_asset_directory, :primary_publish_target_pid, :default_storage_type,
enabled_publish_target_pids: [],
enabled_dynamic_fields_attributes: [:id, :digital_object_type_id, :dynamic_field_id, :default_value, :required, :hidden, :locked, :_destroy, fieldset_ids: []],
project_permissions_attributes: [:id, :_destroy, :user_id, :can_create, :can_read, :can_update, :can_delete, :can_publish, :is_project_admin],
Expand Down
81 changes: 23 additions & 58 deletions app/models/concerns/digital_object/assets/file_import.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module DigitalObject::Assets::FileImport

# Returns true if file import was successful, false otherwise
def do_file_import
path_to_final_save_location = nil
final_save_location_uri = nil
import_file_sha256_hexdigest = nil
import_file_size = nil

Expand All @@ -15,14 +15,14 @@ def do_file_import
import_file_size = import_file.size
raise Hyacinth::Exceptions::ZeroByteFileError, 'Original file file size is 0 bytes. File must contain data.' if import_file_size == 0

copy_results = copy_and_verify_file(import_file)
copy_results = verify_and_optionally_copy_file(import_file)

path_to_final_save_location = copy_results[0]
final_save_location_uri = copy_results[0]
import_file_sha256_hexdigest = copy_results[1]
end
# At this point, there is a file at path_to_final_save_location and
# import_file_sha256_hexdigest has been calculated, and
# import_file_size has been set, regardless of import type.
# At this point, there is a file at final_save_location_uri,
# import_file_sha256_hexdigest has been calculated,
# and import_file_size has been set, regardless of import type.

original_filename = File.basename(@import_file_original_file_path || @import_file_import_path)

Expand All @@ -38,9 +38,9 @@ def do_file_import
# Line below will create paths like "file:/this%23_and_%26_also_something%20great/here.txt"
# We DO NOT want a double slash at the beginnings of these paths.
# We need to manually escape ampersands (%26) and pound signs (%23) because these are not always handled by Addressable::URI.encode()
ds_location = Hyacinth::Utils::PathUtils.filesystem_path_to_ds_location(path_to_final_save_location)
puts "final_save_location_uri is: #{final_save_location_uri}"
content_ds = @fedora_object.create_datastream(ActiveFedora::Datastream, 'content', controlGroup: 'E', mimeType: BestType.mime_type.for_file_name(original_filename), dsLabel: original_filename, versionable: true)
content_ds.dsLocation = ds_location
content_ds.dsLocation = Hyacinth::Utils::PathUtils.location_uri_to_encoded_ds_location(final_save_location_uri)
@fedora_object.datastreams["DC"].dc_source = path_to_final_save_location
@fedora_object.add_datastream(content_ds)

Expand Down Expand Up @@ -168,67 +168,32 @@ def do_poster_import
@fedora_object.rels_int.add_relationship(poster_ds, :extent, File.size(@poster_import_path).to_s, true) # last param *true* means that this is a literal value rather than a relationship
end

# Dispatches an import file to the handler that matches @import_file_import_type.
#
# Internal and POST-data imports are handed to copy_and_verify_internal_file;
# external imports are handed to verify_external_file. Both handlers receive
# the import file's path rather than the file object itself.
#
# @param import_file [#path] the opened file being imported
# @return [Array] whatever the selected handler returns
#   (a two-element array of location URI and SHA256 hexdigest)
# @raise [RuntimeError] when @import_file_import_type is not a known import type
def verify_and_optionally_copy_file(import_file)
  copied_import_types = [
    DigitalObject::Asset::IMPORT_TYPE_INTERNAL,
    DigitalObject::Asset::IMPORT_TYPE_POST_DATA
  ]

  return copy_and_verify_internal_file(import_file.path) if copied_import_types.include?(@import_file_import_type)
  return verify_external_file(import_file.path) if @import_file_import_type == DigitalObject::Asset::IMPORT_TYPE_EXTERNAL

  raise "Did not expect @import_file_import_type: #{@import_file_import_type.inspect}"
end

# Writes an internally-managed import file to its generated storage location.
#
# The target location URI is generated from this object's pid and project
# plus the basename of the import file. Nothing is written when an object
# already exists at that location.
#
# @param import_file_path [String] path of the file to store
# @return [Array] two elements: the storage object's location URI and the
#   value returned by the storage object's write call
# @raise [RuntimeError] when something already exists at the target location
def copy_and_verify_internal_file(import_file_path)
  final_save_location_uri = Hyacinth::Storage.generate_location_uri_for(pid, project, File.basename(import_file_path))
  storage_object = Hyacinth::Storage.storage_object_for(final_save_location_uri)

  if storage_object.exist?
    raise "Could not upload new internally-stored file because existing file was already found at target location: #{final_save_location_uri}"
  end

  # NOTE(review): write is presumed to return the SHA256 hexdigest of the
  # stored data and to verify the copy itself -- confirm against the storage
  # implementation.
  sha256_hexdigest = storage_object.write(import_file_path)

  [storage_object.location_uri, sha256_hexdigest]
end

# Checks that an externally-stored file exists and computes its checksum.
# The file is left where it is; nothing is copied.
#
# @param external_file_path [String] filesystem path of the external file
# @return [Array] two elements: the storage object's location URI and the
#   SHA256 hexdigest of the file's content
# @raise [RuntimeError] when no file exists at the given path
def verify_external_file(external_file_path)
  # Uses the same lookup method as the other Hyacinth::Storage call sites.
  storage_object = Hyacinth::Storage.storage_object_for("file://#{external_file_path}")
  raise "External file not found at: #{storage_object.path}" unless storage_object.exist?

  sha256_hexdigest = Digest::SHA256.file(storage_object.path).hexdigest
  [storage_object.location_uri, sha256_hexdigest]
end
end
Loading

0 comments on commit 77781bf

Please sign in to comment.