Skip to content

Commit

Permalink
use & add to ingestion metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
mnyrop committed Oct 9, 2024
1 parent dbc35fd commit 1af9bab
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 1,269 deletions.
4 changes: 1 addition & 3 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@ CONFIG = YAML.load_file 'config.yml'
RW_DIR = File.dirname CONFIG['source_dir']
PDF_DIR = File.join RW_DIR, 'pdfs'
JPG_DIR = File.join RW_DIR, 'jpgs'
AP_SRC_DIR = './src'
ANUM_TXT_FILE = File.join RW_DIR, 'anumbers.txt'
AFILES_CSV_FILE = File.join AP_SRC_DIR, 'afiles.csv'
PAGES_CSV_FILE = File.join AP_SRC_DIR, 'pages.csv'
AFILES_CSV_FILE = CONFIG.dig 'records', 'file'

Dir.glob("lib/tasks/*.rake").each { |r| load r }

Expand Down
2 changes: 1 addition & 1 deletion config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ presentation_api_url: 'https://mats-aperitiiif-presenation-api-store-v1.s3.us-ea
image_api_url: 'https://dctn4zjpwgdwdiiy5odjv7o2se0bqgjb.lambda-url.us-east-1.on.aws/iiif/3'

records:
file: 'src/afiles.csv'
file: 'src/records.csv'


78 changes: 63 additions & 15 deletions lib/tasks/pdfs.rake
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
require 'csv'
require 'pdf-reader'
require 'vips'

# Rows of the records CSV (AFILES_CSV_FILE) as an array of plain hashes keyed
# by the header names from the file's first row. The result is memoized in
# @records, so the file is parsed at most once per process.
#
# The block form of CSV.open is used so the file handle is closed as soon as
# the rows have been read, instead of being left open until garbage collection.
def records
  @records ||= CSV.open(AFILES_CSV_FILE, headers: :first_row) { |csv| csv.map(&:to_h) }
end

# Overrides the memoized row list that `records` returns.
# NOTE(review): this does not reset @records_hash, so a previously built
# records_hash keeps reflecting the old rows — confirm that is intended.
def records=(new_records)
  @records = new_records
end

# Lookup table of the records, built by passing `records` through `pickle`.
# The table is cached in @records_hash after the first call.
def records_hash
  return @records_hash if @records_hash

  @records_hash = pickle(records)
end

# Overrides the cached lookup table that `records_hash` returns.
# Only @records_hash is touched; @records is left as it was.
def records_hash=(new_records_hash)
  @records_hash = new_records_hash
end

# Paths of every *.pdf file directly inside PDF_DIR.
# The directory is globbed once; later calls reuse the list cached in @pdfs.
def pdf_paths
  return @pdfs if @pdfs

  @pdfs = Dir["#{PDF_DIR}/*.pdf"]
end
Expand All @@ -11,6 +28,29 @@ def infer_anum(pdf_path)
anum
end

# Indexes an array of row hashes by their whitespace-stripped 'id' value.
#
# array - Array of Hashes, each with a String under the 'id' key.
#
# Returns a Hash of stripped id => row. An empty array yields an empty Hash
# (not nil), so callers can always call dig/key? on the result. When two rows
# share an id, the later row wins.
# Raises NoMethodError if a row has no 'id' value (nil has no #strip).
def pickle(array)
  # Single pass; avoids allocating a throwaway hash per row and re-merging.
  array.to_h { |row| [row['id'].strip, row] }
end

# Inverse of the id-keyed lookup: drops the keys and returns the row hashes
# as an array ordered by each row's 'id' value.
def unpickle(hash)
  rows = hash.values
  rows.sort_by { |row| row['id'] }
end

# Writes an array of row hashes to a CSV file, replacing its contents.
#
# data - Array of Hashes; keys become column headers.
# file - path of the CSV file to (over)write.
#
# The header row is the union of all rows' keys in first-seen order, and each
# row is written by header name, so a key present on only some rows (or in a
# different position) still lands in the right column; missing values are
# written as empty fields.
#
# When data is empty nothing is written and the existing file is left
# untouched, rather than truncating it and then failing on a missing header.
#
# Returns data.
def write_to_csv(data, file)
  return data if data.empty?

  headers = data.flat_map(&:keys).uniq
  CSV.open(file, "wb") do |csv|
    csv << headers
    data.each { |row| csv << row.values_at(*headers) }
  end
end

# Number of pages in the PDF at pdf_path, as reported by PDF::Reader.
# A full garbage-collection pass is forced before the PDF is opened —
# presumably to limit memory growth when many PDFs are processed in a row
# (TODO confirm).
def deduce_page_count(pdf_path)
  GC.start
  reader = PDF::Reader.new(pdf_path)
  reader.page_count
end


namespace :pdfs do
desc 'spit out txt list of anums inferred from pdfs'
task :anum_txt do
Expand All @@ -19,33 +59,41 @@ namespace :pdfs do
end
puts "Done ✓"
end

desc 'add page count to csv'
# For every PDF in pdf_paths, looks up its record by inferred A-number and,
# when that record has no 'page_count' yet, computes the page count and stores
# it. Records that already have a page count are skipped.
# Raises RuntimeError when a PDF's A-number has no matching record.
task :page_count_csv do
  pdf_paths.each do |path|
    anum = infer_anum path

    # Fail fast: check that the record exists before paying for a PDF parse.
    raise "no anum #{anum} found in hash!!!" unless records_hash.key? anum

    next puts "skipping #{anum}" unless records_hash.dig(anum, 'page_count').nil?

    page_count = deduce_page_count path
    puts "#{anum}: #{page_count} pages"

    records_hash[anum]['page_count'] = page_count
    # The CSV is rewritten after every PDF — presumably so progress survives
    # an interrupted run and already-counted files are skipped next time.
    write_to_csv(unpickle(records_hash), AFILES_CSV_FILE)
  end
end

desc 'split pdfs to jpgs, capture results in csvs'
task :jpg_csv do
File.open(AFILES_CSV_FILE, 'w') { |file| file.puts("id,label,og_pdf_id,page_count") }
File.open(PAGES_CSV_FILE, 'w') { |file| file.puts("id,label,a_number,page_number,extracted_text") }
desc 'split pdfs to jpgs'
task :split_jpgs do
FileUtils.mkdir_p JPG_DIR

pdf_paths.each_with_index do |path, i|
GC.start
reader = PDF::Reader.new path
page_count = reader.page_count
anum = infer_anum path
page_count = Integer(records_hash.dig(anum, 'page_count') || deduce_page_count(path))
dir = File.join JPG_DIR, anum
pdf_data = [anum,anum,File.basename(path, '.pdf'),page_count]

File.open(AFILES_CSV_FILE, 'a') { |f| f.puts pdf_data.join(',') }

FileUtils.mkdir_p dir

(0..page_count - 1).each do |index|
page_num = index.to_s.rjust(4, "0")
page_id = "#{anum}_#{page_num}"
target = File.join dir, "#{page_num}.jpg"
text = reader.pages[index].text.to_s.gsub(/\R+/, "|").gsub('"', "'")
page_data = [page_id,page_id,anum,page_num,"\"#{text}\""]

File.open(PAGES_CSV_FILE, "a") { |f| f.puts page_data.join(',') }


next if File.file? target

img = Vips::Image.pdfload path, page: index, n: 1, dpi: 300
img = img.thumbnail_image(2500, height: 10000000) if (img.width > 2500)
img.jpegsave target
Expand Down
41 changes: 0 additions & 41 deletions src/afiles.csv

This file was deleted.

Loading

0 comments on commit 1af9bab

Please sign in to comment.