From e9690b2738d0067a2af17c7268360d3108082b24 Mon Sep 17 00:00:00 2001
From: cragwolfe
Date: Tue, 10 Sep 2024 11:40:35 -0700
Subject: [PATCH] feat: utility script to process large PDFs through the API by script (#3591)

Adds the bash script `process-pdf-parallel-through-api.sh` that allows
splitting up a PDF into smaller parts (splits) to be processed through
the API concurrently, and is re-entrant. If any of the splits fail to
process, one can attempt reprocessing those split(s) by rerunning the
script.

Note: requires the `qpdf` command line utility.

The below command line output shows the scenario where just one split
had to be reprocessed through the API to create the final
`layout-parser-paper_combined.json` output.

```
$ BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res \
  ./scripts/user/process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf
> % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-pars\
er-paper_pages_1_to_6.json as it already exists.
Skipping processing for /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_7_to_12.json as it already exists.
Valid JSON output created: /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_pages_13_to_16.json
Processing complete. Combined JSON saved to /Users/cragwolfe/tmp/pdf-splits/layout-parser-paper-output-8a76cb6228e109450992bc097dbd1a51_split-6_strat-hi_res/layout-parser-paper_combined.json
```

Bonus change to `unstructured-get-json.sh` to point to the standard
hosted Serverless API, but allow using the Free API with --freemium.
---
 .../user/process-pdf-parallel-through-api.sh  | 208 ++++++++++++++++++
 scripts/user/split-pdf.sh                     |  53 +++++
 scripts/user/unstructured-get-json.sh         |  17 +-
 3 files changed, 277 insertions(+), 1 deletion(-)
 create mode 100755 scripts/user/process-pdf-parallel-through-api.sh
 create mode 100755 scripts/user/split-pdf.sh

diff --git a/scripts/user/process-pdf-parallel-through-api.sh b/scripts/user/process-pdf-parallel-through-api.sh
new file mode 100755
index 0000000000..09a633a850
--- /dev/null
+++ b/scripts/user/process-pdf-parallel-through-api.sh
@@ -0,0 +1,208 @@
+#!/usr/bin/env bash
+
+# Usage: ./process-pdf-parallel-through-api.sh filename.pdf
+#
+# Splits a PDF into page-range parts (via split-pdf.sh, which requires qpdf),
+# posts each part to the Unstructured API concurrently, validates every
+# response with jq and concatenates the per-part JSON arrays into one file.
+# Re-entrant: parts whose JSON output already exists are skipped, so rerunning
+# the script only retries the parts that failed.
+#
+# Required env var: UNST_API_KEY
+# Optional env vars: STRATEGY, BATCH_SIZE, PDF_SPLIT_PAGE_SIZE, PDF_SPLITS_DIR,
+# UNST_API_ENDPOINT
+
+set -eu -o pipefail
+
+if [ $# -ne 1 ]; then
+  echo "Processes a single PDF through the Unstructured API by breaking it into smaller splits that are processed concurrently."
+  echo
+  echo "Usage: $0 <pdf_filename>"
+  echo "Please provide a PDF filename as the first argument."
+  echo
+  echo "Optionally, set the following env vars: "
+  echo
+  echo "* STRATEGY (default hi_res)"
+  echo "* BATCH_SIZE (default 30) as the number of parts (AKA splits) to process in parallel"
+  echo "* PDF_SPLIT_PAGE_SIZE (default 10) as the number of pages per split"
+  echo "* UNST_API_ENDPOINT (default https://api.unstructuredapp.io/general/v0/general)"
+  echo
+  echo "BATCH_SIZE=20 PDF_SPLIT_PAGE_SIZE=6 STRATEGY=hi_res ./process-pdf-parallel-through-api.sh example-docs/pdf/layout-parser-paper.pdf"
+  exit 1
+fi
+
+ALLOWED_STRATEGIES=("hi_res" "fast" "auto")
+
+# Resolve the default once: the value is used both in the request and in the
+# output directory name, and a bare $STRATEGY would abort under `set -u`.
+STRATEGY="${STRATEGY:-hi_res}"
+
+# Validate STRATEGY by exact comparison with each allowed value; a regex or
+# substring match would also accept fragments such as "res".
+strategy_ok=false
+for allowed in "${ALLOWED_STRATEGIES[@]}"; do
+  if [ "$STRATEGY" = "$allowed" ]; then
+    strategy_ok=true
+    break
+  fi
+done
+if ! $strategy_ok; then
+  echo "Error: STRATEGY must be one of ${ALLOWED_STRATEGIES[*]}" >&2
+  exit 1
+fi
+
+# Check if UNST_API_KEY is set (":-" keeps `set -u` from aborting first)
+if [ -z "${UNST_API_KEY:-}" ]; then
+  echo "Error: UNST_API_KEY is not set or is empty" >&2
+  exit 1
+fi
+
+# Process PDF parts in batches of 30, by default. Zero is rejected because
+# the batching loop below would then never advance.
+batch_size=${BATCH_SIZE:-30}
+if ! [[ "$batch_size" =~ ^[1-9][0-9]*$ ]]; then
+  echo "Error: BATCH_SIZE must be a positive integer" >&2
+  exit 1
+fi
+
+PDF_FILE="$1"
+if [ ! -f "$PDF_FILE" ]; then
+  echo "Error: $PDF_FILE does not exist or is not a regular file" >&2
+  exit 1
+fi
+
+# Same override variable that unstructured-get-json.sh honours
+API_ENDPOINT="${UNST_API_ENDPOINT:-https://api.unstructuredapp.io/general/v0/general}"
+DEFAULT_SPLIT_SIZE=10
+SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
+PDF_NAME=$(basename "$PDF_FILE" .pdf)
+DEFAULT_DIR="$HOME/tmp/pdf-splits"
+PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
+MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
+PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
+PDF_OUTPUT_DIR="$PDF_SPLITS_DIR/${PDF_NAME}-output-${MD5_SUM}_split-${SPLIT_SIZE}_strat-${STRATEGY}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# Split the PDF unless its parts directory already exists. The split size and
+# base directory are passed explicitly: split-pdf.sh has its own default split
+# size (5) and must write to exactly the $PDF_DIR computed above.
+if [ ! -d "$PDF_DIR" ]; then
+  PDF_SPLIT_PAGE_SIZE="$SPLIT_SIZE" PDF_SPLITS_DIR="$PDF_SPLITS_DIR" \
+    "$SCRIPT_DIR/split-pdf.sh" "$PDF_FILE"
+fi
+
+# Create output directory if it does not exist
+mkdir -p "$PDF_OUTPUT_DIR"
+
+incomplete=0 # Flag to track incomplete processing
+
+# Process a single PDF part through the API.
+# Arguments: $1 - part file, $2 - number of the part's first page,
+#            $3 - path of the JSON file to produce
+# Returns:   0 if the output already exists or valid JSON was written,
+#            1 otherwise (no output file is left behind)
+process_file_part() {
+  local file="$1"
+  local STARTING_PAGE_NUMBER="$2"
+  local OUTPUT_JSON="$3"
+  # The response is written under a temporary name so that an interrupted or
+  # invalid download is never mistaken for a finished part on a rerun.
+  local tmp_json="$OUTPUT_JSON.part"
+
+  if [ -f "$OUTPUT_JSON" ]; then
+    echo "Skipping processing for $OUTPUT_JSON as it already exists."
+    return 0
+  fi
+
+  if ! curl -q -sS -X POST "$API_ENDPOINT" \
+    -H "unstructured-api-key: $UNST_API_KEY" \
+    -H 'accept: application/json' \
+    -H 'Content-Type: multipart/form-data' \
+    -F strategy="$STRATEGY" \
+    -F 'skip_infer_table_types="[]"' \
+    -F starting_page_number="$STARTING_PAGE_NUMBER" \
+    -F files=@"$file;filename=$PDF_FILE" \
+    -o "$tmp_json"; then
+    echo "Request failed for $file" >&2
+    rm -f "$tmp_json"
+    return 1
+  fi
+
+  # Verify JSON content
+  if ! jq -e 'if type=="array" then all(.[]; type=="object" or length==0) else empty end' "$tmp_json" >/dev/null; then
+    echo "Invalid JSON structure in $OUTPUT_JSON (contents below), deleting file."
+    cat "$tmp_json" || true # best-effort diagnostics only
+    rm -f "$tmp_json"
+    return 1
+  fi
+
+  mv "$tmp_json" "$OUTPUT_JSON"
+  echo "Valid JSON output created: $OUTPUT_JSON"
+}
+
+# Process a batch of part files concurrently and wait for all of them.
+# Sets the global `incomplete` flag if any part fails. The flag is set here,
+# in the parent shell: each part runs in a background subshell, so an
+# assignment made inside process_file_part would be lost.
+process_batch() {
+  local part_re='_pages_([0-9]+)_to_([0-9]+)\.pdf$'
+  local file START_PAGE END_PAGE OUTPUT_JSON pid
+  local pids=()
+
+  for file in "$@"; do
+    if ! [[ "$file" =~ $part_re ]]; then
+      echo "Skipping $file: name does not match *_pages_N_to_M.pdf" >&2
+      continue
+    fi
+    START_PAGE="${BASH_REMATCH[1]}"
+    END_PAGE="${BASH_REMATCH[2]}"
+    OUTPUT_JSON="$PDF_OUTPUT_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.json"
+    process_file_part "$file" "$START_PAGE" "$OUTPUT_JSON" &
+    pids+=("$!")
+  done
+
+  # The ${pids[@]+...} form expands to nothing for an empty array, which a
+  # plain "${pids[@]}" does not survive under `set -u` on older bash.
+  for pid in ${pids[@]+"${pids[@]}"}; do
+    if ! wait "$pid"; then
+      incomplete=1
+    fi
+  done
+}
+
+# Read PDF parts into an array
+mapfile -t pdf_parts < <(find "$PDF_DIR" -name '*.pdf' -print)
+if [ "${#pdf_parts[@]}" -eq 0 ]; then
+  echo "Error: no PDF parts found in $PDF_DIR" >&2
+  exit 1
+fi
+
+for ((i = 0; i < ${#pdf_parts[@]}; i += batch_size)); do
+  process_batch "${pdf_parts[@]:i:batch_size}"
+done
+
+# Determine the output filename based on whether processing was incomplete
+if [ "$incomplete" -eq 1 ]; then
+  combined_output_filename="${PDF_NAME}_incomplete_combined.json"
+  echo "WARNING! not all json parts were successfully processed. you may rerun this script"
+  echo "to attempt reprocessing those (failed to process) parts."
+else
+  combined_output_filename="${PDF_NAME}_combined.json"
+  # A complete run supersedes any partial result left by an earlier attempt
+  rm -f "$PDF_OUTPUT_DIR/${PDF_NAME}_incomplete_combined.json"
+fi
+
+# Combine the per-part JSON outputs in numerical (version-sort) page order.
+# Only per-part files are selected, so that a combined file written by an
+# earlier run is never merged into the new one.
+mapfile -t json_parts < <(
+  find "$PDF_OUTPUT_DIR" -name '*_pages_*_to_*.json' ! -name '*_combined.json' -print | sort -V
+)
+if [ "${#json_parts[@]}" -eq 0 ]; then
+  echo "Error: no parts were processed successfully; nothing to combine." >&2
+  exit 1
+fi
+jq -s 'add' "${json_parts[@]}" >"$PDF_OUTPUT_DIR/$combined_output_filename"
+
+echo "Processing complete. Combined JSON saved to $PDF_OUTPUT_DIR/$combined_output_filename"
diff --git a/scripts/user/split-pdf.sh b/scripts/user/split-pdf.sh
new file mode 100755
index 0000000000..f116a92e24
--- /dev/null
+++ b/scripts/user/split-pdf.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+
+# Usage: ./split-pdf.sh filename.pdf
+#
+# Splits a PDF into consecutive parts of PDF_SPLIT_PAGE_SIZE pages (default 5)
+# using qpdf. Parts are written to
+# PDF_SPLITS_DIR/NAME-MD5_split-SIZE/NAME_pages_FIRST_to_LAST.pdf, where
+# PDF_SPLITS_DIR defaults to ~/tmp/pdf-splits.
+
+set -eu -o pipefail
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 <pdf_filename>" >&2
+  exit 1
+fi
+
+PDF_FILE="$1"
+DEFAULT_SPLIT_SIZE=5
+SPLIT_SIZE=${PDF_SPLIT_PAGE_SIZE:-$DEFAULT_SPLIT_SIZE}
+
+# Validate that SPLIT_SIZE is a positive integer. Zero would never advance the
+# loop below, and a leading zero would be read as octal by shell arithmetic.
+if ! [[ "$SPLIT_SIZE" =~ ^[1-9][0-9]*$ ]]; then
+  echo "Error: PDF_SPLIT_PAGE_SIZE must be a positive integer." >&2
+  exit 1
+fi
+
+DEFAULT_DIR="$HOME/tmp/pdf-splits"
+PDF_SPLITS_DIR="${PDF_SPLITS_DIR:-$DEFAULT_DIR}"
+PDF_NAME=$(basename "$PDF_FILE" .pdf)
+MD5_SUM=$(md5sum "$PDF_FILE" | awk '{ print $1 }')
+PDF_DIR="$PDF_SPLITS_DIR/$PDF_NAME-${MD5_SUM}_split-${SPLIT_SIZE}"
+
+# Create directory if it does not exist
+mkdir -p "$PDF_DIR"
+
+# Total number of pages
+TOTAL_PAGES=$(qpdf --show-npages "$PDF_FILE")
+
+# Split PDF into $SPLIT_SIZE-page chunks
+START_PAGE=1
+while [ "$START_PAGE" -le "$TOTAL_PAGES" ]; do
+  END_PAGE=$((START_PAGE + SPLIT_SIZE - 1))
+  if [ "$END_PAGE" -gt "$TOTAL_PAGES" ]; then
+    END_PAGE=$TOTAL_PAGES
+  fi
+  OUTPUT_FILE="$PDF_DIR/${PDF_NAME}_pages_${START_PAGE}_to_${END_PAGE}.pdf"
+  qpdf "$PDF_FILE" --pages . "$START_PAGE"-"$END_PAGE" -- "$OUTPUT_FILE"
+  echo "Created $OUTPUT_FILE"
+  START_PAGE=$((END_PAGE + 1))
+done
+
+echo "All parts have been saved to $PDF_DIR"
diff --git a/scripts/user/unstructured-get-json.sh b/scripts/user/unstructured-get-json.sh
index aa8d00f189..bd2221d517 100755
--- a/scripts/user/unstructured-get-json.sh
+++ b/scripts/user/unstructured-get-json.sh
@@ -12,6 +12,7 @@ USAGE_MESSAGE="Usage: $0 [options] <file>"'
 
 Options:
   --api-key KEY   Specify the API key for authentication. Set the env var $UNST_API_KEY to skip providing this option.
+  --freemium      Use the free API rather than the paid API
   --hi-res        hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
   --fast          fast strategy: No OCR, just extract embedded text
   --ocr-only      ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
@@ -22,9 +23,13 @@ Options:
   --s3            Write the resulting output to s3 (like a pastebin)
   --help          Display this help and exit.
 
+
 Arguments:
   <file>          File to send to the API.
 
+If running against an API instance other than hosted Unstructured paid API (or --freemium),
+set the environment variable UNST_API_ENDPOINT.
+
 The script requires a <file>, the document to post to the Unstructured API.
 The .json result is written to ~/tmp/unst-outputs/ -- this path is echoed and copied to your clipboard.
 '
@@ -35,7 +40,6 @@ if [ "$#" -eq 0 ]; then
 fi
 
 API_KEY=${UNST_API_KEY:-""}
-API_ENDPOINT=${UNST_API_ENDPOINT:-"https://api.unstructured.io/general/v0/general"}
 TMP_DOWNLOADS_DIR="$HOME/tmp/unst-downloads"
 TMP_OUTPUTS_DIR="$HOME/tmp/unst-outputs"
 # only applicable if writing .json output files to S3 when using --s3, e.g. s3://bucket-name/path/
@@ -62,6 +66,7 @@ STRATEGY=""
 VERBOSE=false
 TRACE=false
 COORDINATES=false
+FREEMIUM=false
 TABLES=true
 S3=""
 
@@ -99,6 +104,10 @@ while [[ "$#" -gt 0 ]]; do
     COORDINATES=true
     shift
     ;;
+  --freemium)
+    FREEMIUM=true
+    shift
+    ;;
   --api-key)
     if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
       API_KEY=$2
@@ -139,6 +148,12 @@ else
   INPUT_FILEPATH=${INPUT}
 fi
 
+if $FREEMIUM; then
+  API_ENDPOINT="https://api.unstructured.io/general/v0/general"
+else
+  API_ENDPOINT=${UNST_API_ENDPOINT:-"https://api.unstructuredapp.io/general/v0/general"}
+fi
+
 if $HI_RES; then
   if $VERBOSE; then echo "Sending API request with hi_res strategy"; fi
   STRATEGY="-hi-res"