Skip to content

Commit

Permalink
convert scripts into reusable rake tasks
Browse files Browse the repository at this point in the history
  • Loading branch information
mnyrop committed Oct 9, 2024
1 parent 53ec486 commit 64b0734
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 100 deletions.
17 changes: 17 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
require 'fileutils'
require 'yaml'


# Project configuration, parsed once when the Rakefile is loaded.
CONFIG = YAML.load_file('config.yml')

# Parent directory of the configured `source_dir`; the pdfs/, jpgs/ and
# anumbers.txt paths below all live under it.
# `fetch` makes a missing `source_dir` key fail with a KeyError that names the
# key, instead of a TypeError raised from File.dirname(nil).
RW_DIR = File.dirname(CONFIG.fetch('source_dir')).freeze

# Input PDFs and the per-PDF folders of generated JPGs.
PDF_DIR = File.join(RW_DIR, 'pdfs').freeze
JPG_DIR = File.join(RW_DIR, 'jpgs').freeze

# Directory that receives the generated CSVs.
AP_SRC_DIR = './src'.freeze

# Output files written by the pdfs:* tasks.
ANUM_TXT_FILE   = File.join(RW_DIR, 'anumbers.txt').freeze
AFILES_CSV_FILE = File.join(AP_SRC_DIR, 'afiles.csv').freeze
PAGES_CSV_FILE  = File.join(AP_SRC_DIR, 'pages.csv').freeze

# Load every task file. Sorted so the load order is the same on every Ruby
# version and filesystem (Dir.glob only sorts by default on Ruby >= 3.0).
Dir.glob('lib/tasks/*.rake').sort.each { |rake_file| load rake_file }



58 changes: 0 additions & 58 deletions lib/split-pdfs-populate-csv.rb

This file was deleted.

21 changes: 0 additions & 21 deletions lib/sync-aws-local-json.rb

This file was deleted.

21 changes: 0 additions & 21 deletions lib/sync-aws-local-tifs.rb

This file was deleted.

61 changes: 61 additions & 0 deletions lib/tasks/pdfs.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
require 'pdf-reader'
require 'vips'

# Returns the paths of all `*.pdf` files directly inside PDF_DIR.
#
# The directory is scanned on the first call only; the result is memoized in
# @pdfs, so files added later in the same run are not picked up.
# The list is sorted explicitly because Dir.glob only guarantees an order on
# Ruby >= 3.0, and callers iterate it to produce CSV rows and progress output.
#
# @return [Array<String>] sorted PDF paths (empty when nothing matches)
def pdf_paths
  @pdfs ||= Dir.glob("#{PDF_DIR}/*.pdf").sort
end

# Derives the anum identifier from a PDF path: takes the file name without
# its `.pdf` extension, then removes the first `_redacted` marker followed by
# the first `_withdrawal` marker, if present.
#
# @param pdf_path [String] path (or bare file name) of a PDF
# @return [String] the file's base name with those markers removed
def infer_anum(pdf_path)
  %w[_redacted _withdrawal].reduce(File.basename(pdf_path, '.pdf')) do |name, marker|
    name.sub(marker, '')
  end
end

namespace :pdfs do
  desc 'spit out txt list of anums inferred from pdfs'
  task :anum_txt do
    # Overwrites ANUM_TXT_FILE with one inferred anum per line, one line per PDF.
    File.open(ANUM_TXT_FILE, 'w') do |file|
      # `each` rather than `map`: only the write side effect is wanted.
      pdf_paths.each { |path| file.puts infer_anum(path) }
    end
    puts "Done ✓"
  end

  desc 'split pdfs to jpgs, capture results in csvs'
  task :jpg_csv do
    # Start both CSVs over with just their header row; data rows are appended
    # one at a time below, so rows already written survive a mid-run failure.
    File.open(AFILES_CSV_FILE, 'w') { |file| file.puts('id,label,og_pdf_id,page_count') }
    File.open(PAGES_CSV_FILE, 'w') { |file| file.puts('id,label,a_number,page_number,extracted_text') }
    FileUtils.mkdir_p JPG_DIR

    pdf_total = pdf_paths.length

    pdf_paths.each_with_index do |path, i|
      # Force a GC pass between PDFs (presumably to release the previous
      # PDF's page images before loading the next one).
      GC.start
      reader = PDF::Reader.new path
      page_count = reader.page_count
      anum = infer_anum path
      dir = File.join JPG_DIR, anum
      # afiles.csv row: id and label are both the anum; og_pdf_id keeps the
      # original base name, including any _redacted/_withdrawal marker.
      pdf_data = [anum, anum, File.basename(path, '.pdf'), page_count]

      File.open(AFILES_CSV_FILE, 'a') { |f| f.puts pdf_data.join(',') }
      FileUtils.mkdir_p dir

      # Fetch the page list once per PDF instead of calling reader.pages
      # again for every single page.
      reader.pages.each_with_index do |page, index|
        # Zero-based, zero-padded page number; used for the id and file name.
        page_num = index.to_s.rjust(4, '0')
        page_id = "#{anum}_#{page_num}"
        target = File.join dir, "#{page_num}.jpg"
        # Flatten the text onto one CSV line: runs of line breaks become "|",
        # and double quotes become single quotes so the field can be wrapped
        # in double quotes below.
        text = page.text.to_s.gsub(/\R+/, '|').gsub('"', "'")
        page_data = [page_id, page_id, anum, page_num, "\"#{text}\""]

        File.open(PAGES_CSV_FILE, 'a') { |f| f.puts page_data.join(',') }

        # Render just this page at 300 dpi and cap its width at 2500px; the
        # very large height presumably leaves width as the only constraint.
        img = Vips::Image.pdfload path, page: index, n: 1, dpi: 300
        img = img.thumbnail_image(2500, height: 10000000) if img.width > 2500
        img.jpegsave target

        # "\r" rewrites the same terminal line for each page.
        print "writing #{anum} page #{index} / #{page_count}\r"
        $stdout.flush
      end

      # Use i + 1 (PDFs finished so far) so the last PDF reports 100%.
      percent = ((i + 1).to_f / pdf_total * 100.0).round(1)
      puts "finished pdf #{i + 1}/#{pdf_total} — process is #{percent}% complete \n"
    end
    puts "Done ✓"
  end
end
62 changes: 62 additions & 0 deletions lib/tasks/s3.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
require 'aws-sdk-s3'
require 'dotenv'

# Local build output directories scanned by the s3:push tasks.
# Both end with a trailing slash; the json task strips JSON_DIR from each path
# to form the S3 key. Frozen so the shared constants cannot be mutated.
TIF_DIR = './build/image/'.freeze
JSON_DIR = './build/presentation/'.freeze

# Populate ENV from the project's .env file (if one exists) before any task
# reads the AWS settings.
Dotenv.load

# Returns the AWS credentials object used by the S3 client, built once from
# the ACCESS_KEY_ID and SECRET_ACCESS_KEY environment variables and memoized.
#
# `ENV.fetch` is used so a missing variable fails here with a KeyError naming
# it, instead of handing nil keys to the SDK.
#
# @return [Aws::Credentials]
# @raise [KeyError] if either environment variable is unset
def credentials
  @credentials ||= Aws::Credentials.new(
    ENV.fetch('ACCESS_KEY_ID'),
    ENV.fetch('SECRET_ACCESS_KEY')
  )
end

# Returns the shared S3 client, creating it on first use with the REGION
# environment variable and the memoized credentials.
#
# @return [Aws::S3::Client]
def s3
  return @s3 if @s3

  client_options = { region: ENV['REGION'], credentials: credentials }
  @s3 = Aws::S3::Client.new(client_options)
end

# Uploads one local file to S3 as a publicly readable, inline object and
# prints a confirmation line.
#
# The file is opened in binary mode and the handle is passed as the body, so
# large files are not read into memory in one piece and binary content is not
# subject to text-mode newline translation.
#
# @param path [String] local file to upload
# @param bucket [String] destination bucket name
# @param key [String] destination object key
# @param content_type [String] value for the object's Content-Type
def upload_to_s3(path, bucket:, key:, content_type:)
  File.open(path, 'rb') do |file|
    s3.put_object(
      bucket: bucket,
      key: key,
      content_type: content_type,
      content_disposition: 'inline',
      acl: 'public-read',
      body: file
    )
  end
  puts "uploaded #{key}"
end

namespace :s3 do
  namespace :push do
    desc 'sync local tifs to s3'
    task :tifs do
      # Fail before the first upload if the bucket name is not configured.
      bucket = ENV.fetch('IMAGE_BUCKET_NAME')

      # File.join avoids the "//" that interpolating the slash-terminated
      # TIF_DIR into the pattern would produce.
      Dir.glob(File.join(TIF_DIR, '*.tif')).each do |path|
        # Keys are flat: just the file name.
        upload_to_s3(path, bucket: bucket, key: File.basename(path), content_type: 'image/tiff')
      end
    end

    desc 'sync local json to s3'
    task :json do
      bucket = ENV.fetch('PRESENTATION_BUCKET_NAME')

      Dir.glob(File.join(JSON_DIR, '**', '*.json')).each do |path|
        # Key is the path relative to JSON_DIR; any leftover leading slash is
        # stripped so keys never start with "/".
        key = path.sub(JSON_DIR, '').sub(%r{\A/+}, '')
        upload_to_s3(path, bucket: bucket, key: key, content_type: 'application/json')
      end
    end
  end

  namespace :clobber do
    # Both clobber tasks are placeholders: they only print a reminder.
    desc 'clears out og tifs in s3 bucket'
    task :tifs do
      puts 'TO DO'
    end

    desc 'clears out og json in s3 bucket'
    task :json do
      puts 'TO DO'
    end
  end
end

0 comments on commit 64b0734

Please sign in to comment.