-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
convert scripts into reusable rake tasks
- Loading branch information
Showing
6 changed files
with
140 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
require 'fileutils' | ||
require 'yaml' | ||
|
||
|
||
# Project-wide settings and paths shared by the rake tasks loaded at the bottom.
# 'config.yml' is resolved relative to the directory rake is run from.
CONFIG          = YAML.load_file('config.yml')

# `fetch` instead of `[]`: a config without `source_dir` now fails with a KeyError
# naming the key, rather than a TypeError from File.dirname(nil).
RW_DIR          = File.dirname(CONFIG.fetch('source_dir'))
PDF_DIR         = File.join(RW_DIR, 'pdfs')
JPG_DIR         = File.join(RW_DIR, 'jpgs')
AP_SRC_DIR      = './src'
ANUM_TXT_FILE   = File.join(RW_DIR, 'anumbers.txt')
AFILES_CSV_FILE = File.join(AP_SRC_DIR, 'afiles.csv')
PAGES_CSV_FILE  = File.join(AP_SRC_DIR, 'pages.csv')

# Load every task file. Sorted explicitly so the load order is the same on every
# platform (Dir.glob only sorts its results by default on Ruby 3.0+).
Dir.glob('lib/tasks/*.rake').sort.each { |rake_file| load rake_file }
|
||
|
||
|
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
require 'pdf-reader' | ||
require 'vips' | ||
|
||
# Lists the PDFs sitting directly inside PDF_DIR (no recursion).
#
# The result is memoized, so the directory is scanned once per process.
#
# @return [Array<String>] matching paths in sorted order
def pdf_paths
  # Sorted explicitly so processing order (and therefore the generated txt/csv
  # row order) is reproducible; Dir.glob only sorts by default on Ruby 3.0+.
  @pdfs ||= Dir.glob(File.join(PDF_DIR, '*.pdf')).sort
end
|
||
# Derives the anum identifier from a PDF path.
#
# Takes the file name without its '.pdf' extension and removes the first
# occurrence of '_redacted' and the first occurrence of '_withdrawal'.
#
# @param pdf_path [String] path (or bare file name) of a PDF
# @return [String] the file's base name with those markers stripped
def infer_anum(pdf_path)
  File.basename(pdf_path, '.pdf')
      .sub('_redacted', '')
      .sub('_withdrawal', '')
end
|
||
namespace :pdfs do
  desc 'spit out txt list of anums inferred from pdfs'
  task :anum_txt do
    # One inferred anum per line, in pdf_paths order; the file is overwritten on every run.
    File.open(ANUM_TXT_FILE, 'w') do |file|
      pdf_paths.each { |path| file.puts infer_anum(path) }
    end
    puts "Done ✓"
  end

  desc 'split pdfs to jpgs, capture results in csvs'
  task :jpg_csv do
    # Start both CSVs from scratch with only their header row; data rows are appended below.
    File.open(AFILES_CSV_FILE, 'w') { |file| file.puts("id,label,og_pdf_id,page_count") }
    File.open(PAGES_CSV_FILE, 'w') { |file| file.puts("id,label,a_number,page_number,extracted_text") }
    FileUtils.mkdir_p JPG_DIR

    total = pdf_paths.length

    pdf_paths.each_with_index do |path, i|
      GC.start # kept from the original; presumably frees memory held from the previous PDF — TODO confirm
      reader     = PDF::Reader.new path
      page_count = reader.page_count
      # Fetch the page list once per PDF rather than calling reader.pages for every single page.
      pages      = reader.pages
      anum       = infer_anum path
      dir        = File.join JPG_DIR, anum
      pdf_data   = [anum, anum, File.basename(path, '.pdf'), page_count]

      File.open(AFILES_CSV_FILE, 'a') { |f| f.puts pdf_data.join(',') }
      FileUtils.mkdir_p dir

      page_count.times do |index|
        # Zero-based, zero-padded page number; used for both the row id and the jpg file name.
        page_num  = index.to_s.rjust(4, "0")
        page_id   = "#{anum}_#{page_num}"
        target    = File.join dir, "#{page_num}.jpg"
        # Keep each CSV row on one line: line-break runs become "|" and double quotes
        # become single quotes so the text can be wrapped in double quotes as-is.
        text      = pages[index].text.to_s.gsub(/\R+/, "|").gsub('"', "'")
        page_data = [page_id, page_id, anum, page_num, "\"#{text}\""]

        File.open(PAGES_CSV_FILE, "a") { |f| f.puts page_data.join(',') }

        # Render just this page at 300 dpi; anything wider than 2500px is scaled down to
        # 2500px wide (the very large height bound leaves the width as the only constraint).
        img = Vips::Image.pdfload path, page: index, n: 1, dpi: 300
        img = img.thumbnail_image(2500, height: 10000000) if img.width > 2500
        img.jpegsave target

        print "writing #{anum} page #{index} / #{page_count}\r"
        $stdout.flush
      end

      # Percentage is based on the number of PDFs finished so far (i + 1), so the last one reports 100%.
      finished = i + 1
      puts "finished pdf #{finished}/#{total} — process is #{(finished.to_f / total.to_f * 100.0).round(1)}% complete \n"
    end
    puts "Done ✓"
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
require 'aws-sdk-s3' | ||
require 'dotenv' | ||
|
||
# Pull variables from a .env file into ENV before any task reads its settings from ENV.
Dotenv.load

# Local directories whose contents the s3:push tasks upload.
TIF_DIR  = './build/image/'
JSON_DIR = './build/presentation/'
|
||
# Static AWS credentials built from the ACCESS_KEY_ID and SECRET_ACCESS_KEY
# environment variables, memoized after the first successful call.
#
# @return [Aws::Credentials]
# @raise [KeyError] if either environment variable is not set
def credentials
  # ENV.fetch instead of ENV[]: a missing variable fails here, naming the variable,
  # rather than building nil credentials that only fail once a request is made.
  @credentials ||= Aws::Credentials.new(
    ENV.fetch('ACCESS_KEY_ID'),
    ENV.fetch('SECRET_ACCESS_KEY')
  )
end
|
||
# Shared S3 client for the tasks in this file.
#
# Built on first use from ENV['REGION'] and the `credentials` helper, then reused.
#
# @return [Aws::S3::Client]
def s3
  return @s3 if @s3

  client_options = { region: ENV['REGION'], credentials: credentials }
  @s3 = Aws::S3::Client.new(**client_options)
end
|
||
# Uploads one local file to S3 as a publicly readable object served inline,
# then prints the uploaded key.
#
# @param path [String] local file to upload
# @param bucket [String] destination bucket name
# @param key [String] object key inside the bucket
# @param content_type [String] MIME type stored with the object
def upload_public_file(path, bucket:, key:, content_type:)
  # Open in binary mode and hand the SDK the file handle so it can stream the
  # upload instead of holding the whole file in memory as a String.
  File.open(path, 'rb') do |file|
    s3.put_object({
      bucket: bucket,
      key: key,
      content_type: content_type,
      content_disposition: 'inline',
      acl: 'public-read',
      body: file
    })
  end
  puts "uploaded #{key}"
end

namespace :s3 do
  namespace :push do
    desc 'sync local tifs to s3'
    task :tifs do
      # File.join avoids the doubled slash that "#{TIF_DIR}/*.tif" produces when the
      # directory constant already ends in '/'.
      Dir.glob(File.join(TIF_DIR, '*.tif')).each do |path|
        upload_public_file(
          path,
          bucket: ENV['IMAGE_BUCKET_NAME'],
          key: File.basename(path),
          content_type: 'image/tiff'
        )
      end
    end

    desc 'sync local json to s3'
    task :json do
      Dir.glob(File.join(JSON_DIR, '**', '*.json')).each do |path|
        # Key is the path relative to JSON_DIR; any leftover leading slash is stripped
        # so the object key never starts with '/'.
        key = path.sub(JSON_DIR, '').sub(%r{\A/+}, '')
        upload_public_file(
          path,
          bucket: ENV['PRESENTATION_BUCKET_NAME'],
          key: key,
          content_type: 'application/json'
        )
      end
    end
  end

  namespace :clobber do
    desc 'clears out og tifs in s3 bucket'
    task :tifs do
      # Placeholder: not implemented yet.
      puts 'TO DO'
    end

    desc 'clears out og json in s3 bucket'
    task :json do
      # Placeholder: not implemented yet.
      puts 'TO DO'
    end
  end
end