diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43fe255..9428224 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,5 +4,5 @@ repos: rev: v0.3.4 hooks: - id: ruff - args: [ --fix, --exit-non-zero-on-fix ] + args: [ --select, I, --fix, --exit-non-zero-on-fix ] - id: ruff-format diff --git a/.python-version b/.python-version index c8cfe39..e4fba21 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.10 +3.12 diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b55af1..8db6e98 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,26 @@ # CHANGELOG +## 0.2.0 +- Now requires Python 3.12 +### Corppa Utilities +- Basic readme documentation for filter script +- New script for OCR with google vision +- Updated filter script: + - Uses PPA work ids instead of source ids + - Additional filtering by volume and page + - Additional filtering by include or exclude key-pair values +- New utilities function for working with PPA corpus file paths +- New script for generating PPA page subset to be used in conjunction with the filter script +- New script for adding image relative paths to a PPA text corpus +### Poetry Detection +- New Prodigy recipes and custom CSS for image and text annotation +- Script to add PPA work-level metadata for display in Prodigy +### Misc +- Ruff precommit hook now configured to autofix import order + + ## 0.1.0 - Utility to filter the full text corpus by source ID - Experimental Scripts - OCR evaluation - Character-level statistics - diff --git a/README.md b/README.md index eb18dad..a7827e0 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,59 @@ -# `corppa` PPA full-text corpus utilities +# corppa + +This repository is research software developed as part of the [Ends of Prosody](https://cdh.princeton.edu/projects/the-ends-of-prosody/), which is associated with the [Princeton Prosody Archive](https://prosody.princeton.edu/) (PPA). This software is particularly focused on research and work related to PPA full-text and page image corpora. + +> [!WARNING] +> This code is primarily for internal team use. Some portions of it may eventually be useful for participants of the [Ends of Prosody conference](https://cdh.princeton.edu/events/the-ends-of-prosody/) or be adapted or used elsewhere. + +## Basic Usage + +### Installation + +Use pip to install as a python package directly from GitHub. Use a branch or tag name, e.g. `@develop` or `@0.1` if you need to install a specific version. + +```sh +pip install git+https://github.com/Princeton-CDH/ppa-nlp.git#egg=corppa +``` +or +```sh +pip install git+https://github.com/Princeton-CDH/ppa-nlp.git@v0.1#egg=corppa +``` + +### Scripts + +Installing `corppa` currently provides access to two command line scripts, for filtering a PPA page-level corpus or for generating OCR text for images using Google Vision API. These can be run as `corppa-filter` and `corppa-ocr` respectively. + +#### Filtering PPA page-text corpus + +The PPA page-level text corpus is shared as a json lines (`.jsonl`) file, which may or may not be compressed (e.g., `.jsonl.gz`). It's often useful to filter the full corpus to a subset of pages for a specific task, e.g. to analyze content from specific volumes or select particular pages for annotation. + +To create a subset corpus with _all pages_ for a set of specific volumes, create a text file with a list of **PPA work identifiers**, one id per line, and then run the filter script with the input file, desired output file, and path to id file. 
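+For reference, the id file is plain text with one PPA work id per line. A hypothetical `my_ids.txt` might look like this (the ids shown are illustrative; the last is an excerpt id, i.e. a source id plus starting page):
+
+```
+CB0127060085
+mdp.39015003633594
+CB0127060085-p7
+```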
+ +```sh +corppa-filter ppa_pages.jsonl my_subset.jsonl --idfile my_ids.txt +``` + +> [!NOTE] +> **PPA work identifiers** are based on source identifiers, i.e., the identifier from the original source (HathiTrust, Gale/ECCO, EEBO-TCP). In most cases the work identifier and the source identifier are the same, but _if you are working with any excerpted content the work id is NOT the same as the source identifier_. Excerpt ids are based on the combination of source identifier and the first original page included in the excerpt. In some cases PPA contains multiple excerpts from the same source, so this provides guaranteed unique work ids. + +To create a subset of _specific pages_ from specific volumes, create a CSV file that includes fields `work_id` and `page_num`, and pass that to the filter script with the `--pg-file` option: + +```sh +corppa-filter ppa_pages.jsonl my_subset.jsonl --pg_file my_work_pages.csv +``` + +You can filter a page corpus to exclude or include pages based on exact-matches for attributes included in the jsonl data. For example, to get all pages with the original page number roman numeral 'i': + +```sh +corppa-filter ppa_pages.jsonl i_pages.jsonl --include label=i +``` + +Filters can also be combined; for example, to get the original page 10 for every volume from a list, you could specify a list of ids and the `--include` filter: + +```sh +corppa-filter ppa_pages.jsonl my_subset_page10.jsonl --idfile my_ids.txt --include label=10 +``` -This repository provides code and other resources associated with the [Princeton Prosody Archive](https://prosody.princeton.edu/) (PPA), with a particular focus on working with the PPA full-text corpus. ## Development instructions @@ -8,18 +61,22 @@ This repo uses [git-flow](https://github.com/nvie/gitflow) branching conventions ### Developer setup and installation -- **Recommended:** create a python virtual environment with your tool of choice (virtualenv, conda, etc); use python 3.10 or higher +- **Recommended:** create a python virtual environment with your tool of choice (virtualenv, conda, etc); use python 3.12 or higher + +- Install the local checked out version of this package in editable mode (`-e`), including all python dependencies and optional dependencies for development and testing: -- Install the local checked out version of this package in editable mode (`-e`), including all python dependencies and optional dependencies for development and testing: ```sh pip install -e ".[dev]" ``` - This repository uses [pre-commit](https://pre-commit.com/) for python code linting and consistent formatting. Run this command to initialize and install pre-commit hooks: + ```sh pre-commit install ``` ## Experimental Scripts + Experimental scripts associated with `corppa` are located within the `scripts` directory. See this directory's README for more detail. 
+` \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 69062b9..92c97d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,15 +5,13 @@ build-backend = "hatchling.build" [project] name = "corppa" description = "Utilities for working with Princeton Prosody Archive full-text corpus" -requires-python = ">=3.10" +requires-python = ">=3.12" readme = "README.md" # license TBD #license.file = "LICENSE" #license = {text = "Apache-2"} classifiers = [ "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Development Status :: 3 - Alpha", "Intended Audience :: Science/Research", @@ -22,21 +20,27 @@ classifiers = [ "Topic :: Text Processing", "Topic :: Utilities", ] +dynamic = ["version"] dependencies = [ "orjsonl", - "tqdm" + "tqdm", ] -dynamic = ["version"] - -[project.scripts] -corppa-filter-corpus = "corppa.utils.filter:main" - -[tool.hatch.version] -path = "src/corppa/__init__.py" [project.optional-dependencies] test = [ "pytest", "pytest-cov" ] -dev = ["pre-commit", "corppa[test]"] +ocr = ["google-cloud-vision"] +dev = ["pre-commit", "corppa[test]", "corppa[ocr]"] + +[project.scripts] +corppa-filter = "corppa.utils.filter:main" +corppa-ocr = "corppa.ocr.gvision_ocr:main" + +[tool.hatch.version] +path = "src/corppa/__init__.py" + +[tool.ruff] +# configure src path so ruff import fixes can identify local imports +src = ["src"] diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 0000000..fc7be70 --- /dev/null +++ b/requirements.lock @@ -0,0 +1,39 @@ +cachetools==5.5.0 +certifi==2024.8.30 +cfgv==3.4.0 +charset-normalizer==3.3.2 +-e git+ssh://git@github.com/Princeton-CDH/ppa-nlp.git@30734c57bdf3e9ae63d04bc2e2585aede4b6d751#egg=corppa +coverage==7.6.1 +distlib==0.3.8 +filelock==3.15.4 +google-api-core==2.19.2 +google-auth==2.34.0 +google-cloud-vision==3.7.4 +googleapis-common-protos==1.65.0 +grpcio==1.66.1 +grpcio-status==1.66.1 +identify==2.6.0 +idna==3.8 +iniconfig==2.0.0 +nodeenv==1.9.1 +orjson==3.10.7 +orjsonl==1.0.0 +packaging==24.1 +platformdirs==4.2.2 +pluggy==1.5.0 +pre-commit==3.8.0 +proto-plus==1.24.0 +protobuf==5.28.0 +pyasn1==0.6.0 +pyasn1_modules==0.4.0 +pytest==8.3.2 +pytest-cov==5.0.0 +PyYAML==6.0.2 +requests==2.32.3 +rsa==4.9 +setuptools==72.1.0 +tqdm==4.66.5 +urllib3==2.2.2 +virtualenv==20.26.3 +wheel==0.43.0 +xopen==2.0.2 diff --git a/scripts/README.md b/scripts/README.md index 7253ad0..cc1d9fb 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -34,3 +34,6 @@ This module contains general-purpose auxiliary methods. #### `ocr_helper.py` This module contains OCR-related auxiliary methods. + +### `transform-images.sh` +This bash script will copy and transform images from a PPA (sub)corpus (jsonl). 
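+
+Currently `copy` is the only supported mode. A sketch of a typical invocation (paths are illustrative); the script reads the `image_path` field of each JSONL record with `jq`, so `jq` must be available:
+
+```sh
+./transform-images.sh copy my_subset.jsonl /path/to/ppa-images /path/to/transformed-images
+```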
diff --git a/scripts/evaluate_ocr.py b/scripts/evaluate_ocr.py index 5773bbe..f76938b 100644 --- a/scripts/evaluate_ocr.py +++ b/scripts/evaluate_ocr.py @@ -1,13 +1,13 @@ +import csv import os import sys -import spacy -import csv -import orjsonl -from xopen import xopen -from tqdm import tqdm +import orjsonl +import spacy from lingua import LanguageDetectorBuilder from ocr_helper import clean_chars +from tqdm import tqdm +from xopen import xopen class OCREvaluator: diff --git a/scripts/get_character_stats.py b/scripts/get_character_stats.py index 7ff5b0f..1f7e1b6 100644 --- a/scripts/get_character_stats.py +++ b/scripts/get_character_stats.py @@ -4,17 +4,16 @@ env: ppa-ocr """ -import sys -import os.path import csv +import os.path +import sys import unicodedata +from collections import Counter import orjsonl -from collections import Counter -from xopen import xopen -from tqdm import tqdm from ocr_helper import clean_chars - +from tqdm import tqdm +from xopen import xopen __cc_names = { "\n": "Cc: LINE FEED", diff --git a/scripts/helper.py b/scripts/helper.py index 0757fc3..3f4fe1f 100644 --- a/scripts/helper.py +++ b/scripts/helper.py @@ -8,6 +8,26 @@ _htid_decode_table = str.maketrans(_htid_decode_map) +def get_stub_dir(source, vol_id): + """ + Returns the stub directory for the specified volume (vol_id) and + source type (source) + + For Gale, every third number (excluding the leading 0) of the volume + identifier is used. + Ex. CB0127060085 --> 100 + + For HathiTrust, the library portion of the volume identifier is used. + Ex. mdp.39015003633594 --> mdp + """ + if source == "Gale": + return vol_id[::3][1:] + elif source == "HathiTrust": + return vol_id.split(".", maxsplit=1)[0] + else: + raise ValueError(f"Unknown source '{source}'") + + def encode_htid(htid): """ Returns the "clean" version of a HathiTrust volume identifier with the form: diff --git a/scripts/ocr_helper.py b/scripts/ocr_helper.py index 0ea405e..b50985f 100644 --- a/scripts/ocr_helper.py +++ b/scripts/ocr_helper.py @@ -4,7 +4,6 @@ import ftfy - _char_conversion_map = {"ſ": "s"} _char_translation_table = str.maketrans(_char_conversion_map) diff --git a/scripts/transform-images.sh b/scripts/transform-images.sh new file mode 100755 index 0000000..74893e3 --- /dev/null +++ b/scripts/transform-images.sh @@ -0,0 +1,64 @@ +#! /bin/sh + +# For the images specified in the input jsonl, copy and transform images +# in from the input directory to the output directory, according to the +# mode specified. + +mode=$1 +in_jsonl=$2 +in_dir=$3 +out_dir=$4 + + +# Arg validation +if [ $# -ne 4 ]; then + echo "Usage: [mode] [jsonl] [in dir] [out dir]" + exit 1 +fi +# Check that jsonl file exists +if [ ! -f "$in_jsonl" ]; then + echo "ERROR: File $in_jsonl does not exist!" + exit 1 +fi +# Check input dir exists +if [ ! -d "$in_dir" ]; then + echo "ERROR: Directory $in_dir does not exist!" + exit 1 +fi +# Check output dir exists +if [ ! -d "$out_dir" ]; then + echo "ERROR: Directory $out_dir does not exist!" + exit 1 +fi +# Check that the mode is valid +if [ "$mode" != "copy" ]; then + echo "ERROR: Invalid mode '$mode'" + exit 1 +fi + +for path_str in `jq ".image_path" $in_jsonl`; do + # strip double quotes + img_path=${path_str#'"'} + img_path=${img_path%'"'} + echo "$img_path" + + # Check image exists + in_path="$in_dir/$img_path" + if [ ! -f "$in_path" ]; then + "WARNING: Image $in_path does not exist!" + fi + + out_path="$out_dir/$img_path" + out_subdir=`dirname "$out_path"` + if [ ! 
-d "$out_subdir" ]; then + mkdir -p "$out_subdir" + fi + + if [ $mode == "copy" ]; then + # For now just make copies + cp "$in_path" "$out_path" + else + echo "ERROR: Unkown mode '$mode'" + exit 1 + fi +done diff --git a/src/corppa/__init__.py b/src/corppa/__init__.py index 3dc1f76..d3ec452 100644 --- a/src/corppa/__init__.py +++ b/src/corppa/__init__.py @@ -1 +1 @@ -__version__ = "0.1.0" +__version__ = "0.2.0" diff --git a/src/corppa/ocr/gvision_ocr.py b/src/corppa/ocr/gvision_ocr.py new file mode 100755 index 0000000..98bcec4 --- /dev/null +++ b/src/corppa/ocr/gvision_ocr.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python + +""" +This script OCRs images using the Google Vision API. +""" + +import argparse +import io +import os +import pathlib +import sys + +from tqdm import tqdm + +from corppa.utils.path_utils import get_ppa_source, get_vol_dir + +# Attempt to import Google Cloud Vision Python Client +try: + from google.cloud import vision as google_vision +except ImportError: + google_vision = None + +# Workaround (hopefully temporary) to surpress some logging printed to stderr +os.environ["GRPC_VERBOSITY"] = "NONE" + + +def image_relpath_generator(image_dir, exts, follow_symlinks=True): + """ + This generator method finds all images in image_dir with file extensions + in exts (case insensitive). For each of these images, the method yields + the relative path with respect to image_dir. + + For example, if image_dir = "a/b/c/images" and there are image files at the + following paths: "a/b/c/images/alpha.jpg", "a/b/c/images/d/beta.jpg" + The generate will produce these two items: "alpha.jpg" and "d/beta.jpg" + """ + # Create lowercase extension set from passed in exts + ext_set = {ext.lower() for ext in exts} + + # Using pathlib.walk over glob because (1) it allows us to find files with + # multiple extensions in a single walk of the directory and (2) lets us + # leverage additional functionality of pathlib. + for dirpath, dirs, files in image_dir.walk(follow_symlinks=follow_symlinks): + # Check the files in walked directory + for file in files: + ext = os.path.splitext(file)[1] + if ext.lower() in ext_set: + filepath = dirpath.joinpath(file) + yield filepath.relative_to(image_dir) + # For future walking, remove hidden directories + dirs[:] = [d for d in dirs if d[0] != "."] + + +def ocr_image_via_gvision(gvision_client, input_image, out_txt, out_json): + """ + Perform OCR for input image using the Google Cloud Vision API via the provided client. + The plaintext output and json response of the OCR call are written to out_txt and + out_json paths respectively. + """ + # TODO: Clean up code duplication. This check is needed, since this method relies on + # both an existing client as well as API calls directly. + # Check that Google Cloud Vision Python Client library was successfully imported + if google_vision is None: + print( + "Error: Python environment does not contain google-cloud-vision " + "package. 
Switch environments or install package and try again.", + file=sys.stderr, + ) + sys.exit(1) + + # Load the image into memory + with io.open(input_image, "rb") as image_reader: + content = image_reader.read() + image = google_vision.Image(content=content) + + # Performs OCR and handwriting detection on the image file + response = gvision_client.document_text_detection(image=image) + + # Save plain text output to local file; + # even if text is empty, create text file so we don't request again + with open(out_txt, "w") as textfilehandle: + textfilehandle.write(response.full_text_annotation.text) + + # Save json response + json_response = google_vision.AnnotateImageResponse.to_json(response) + with open(out_json, "w") as jsonfilehandle: + jsonfilehandle.write(json_response) + + if response.error.message: + raise Exception( + f"{response.error.message}\n for more info on error messages, " + "check: https://cloud.google.com/apis/design/errors" + ) + + +def ocr_images(in_dir, out_dir, exts, ocr_limit=0, show_progress=True): + """ + OCR images in in_dir with extension exts to out_dir. If ocr_limit > 0, + stop after OCRing ocr_limit images. + + Returns a map structure reporting the number of images OCR'd and skipped. + """ + # Check that Google Cloud Vision Python Client was successfully imported + if google_vision is None: + print( + "Error: Python environment does not contain google-cloud-vision " + "package. Switch environments or install package and try again.", + file=sys.stderr, + ) + sys.exit(1) + + # Instantiate google vision client + client = google_vision.ImageAnnotatorClient() + + # Setup up progress bar if progress will be shown + if show_progress: + desc = "OCRing images" + maxinterval = 1 + if ocr_limit: + progress_bar = tqdm(desc=desc, total=ocr_limit, maxinterval=maxinterval) + else: + bar_format = "{desc}: {n:,} images OCR'd | elapsed: {elapsed}, {rate_fmt}" + progress_bar = tqdm( + desc=desc, bar_format=bar_format, maxinterval=maxinterval + ) + + ocr_count = 0 + skip_count = 0 + for image_relpath in image_relpath_generator(in_dir, exts): + # Refresh progress bar + if show_progress: + progress_bar.refresh() + # Get image and ocr output paths + image_file = in_dir.joinpath(image_relpath) + text_file = out_dir.joinpath(image_relpath).with_suffix(".txt") + json_file = text_file.with_suffix(".json") + # Ensure that all subdirectories exist + ocr_dir = text_file.parent + ocr_dir.mkdir(parents=True, exist_ok=True) + + # Request OCR if file does not exist + if text_file.is_file(): + skip_count += 1 + else: + try: + ocr_image_via_gvision(client, image_file, text_file, json_file) + + # Update counter + ocr_count += 1 + if show_progress: + # Update progress bar since only OCR'd images are tracked + progress_bar.update() + + # Check if we should stop + if ocr_limit and ocr_count == ocr_limit: + # TODO: Is there a better structuring to avoid this break + break + except (Exception, KeyboardInterrupt): + # Close progress bar before raising error + progress_bar.close() + print( + f"Error: An error encountered while OCRing {imagefile.stem}", + file=sys.stderr, + ) + raise + + if show_progress: + # Close progress bar + progress_bar.close() + if ocr_limit and ocr_count == ocr_limit: + print("Stopping early, OCR limit reached.", file=sys.stderr) + print( + f"{ocr_count:,} images OCR'd & {skip_count:,} images skipped.", + file=sys.stderr, + ) + + return {"ocr_count": ocr_count, "skip_count": skip_count} + + +def ocr_volumes(vol_ids, in_dir, out_dir, exts, ocr_limit=0, show_progress=True): + """ + 
OCR images for volumes vol_ids with extension exts to out_dir. Assumes in_dir + follows the PPA directory conventions (see corppa.utils.path_utils for more + details). If ocr_limit > 0, stop after OCRing ocr_limit images. + """ + n_vols = len(vol_ids) + current_ocr_limit = ocr_limit + total_ocr_count = 0 + total_skip_count = 0 + for i, vol_id in enumerate(vol_ids): + try: + sub_dir = get_vol_dir(vol_id) + except NotImplementedError: + # Skip unsupported source types (i.e. HathiTrust) + vol_source = get_ppa_source(vol_id) + print( + f"Warning: Skipping {vol_id} since its source ({vol_source}) is " + "not yet unsupported.", + file=sys.stderr, + ) + continue + + # Get vol dir info + in_vol_dir = in_dir.joinpath(sub_dir) + out_vol_dir = out_dir.joinpath(sub_dir) + + # Check that input vol dir exists + if not in_vol_dir.is_dir(): + print(f"Warning: Volume '{vol_id}' is not in {in_dir}", file=sys.stderr) + print(f"Directory {in_vol_dir} does not exist.", file=sys.stderr) + continue + # Ensure that output vol dir exists + out_vol_dir.mkdir(parents=True, exist_ok=True) + if show_progress: + # Add space between volume-level reporting + if i: + print("", file=sys.stderr) + print(f"OCRing {vol_id} ({i+1}/{n_vols})...", file=sys.stderr) + + # OCR images + report = ocr_images( + in_vol_dir, + out_vol_dir, + exts, + ocr_limit=current_ocr_limit, + show_progress=show_progress, + ) + + # Upkeep + total_ocr_count += report["ocr_count"] + total_skip_count += report["skip_count"] + if ocr_limit: + current_ocr_limit -= report["ocr_count"] + # Stop if limit is reached + if current_ocr_limit == 0: + if show_progress: + print("Hit OCR limit.", file=sys.stderr) + break + + print( + f"---\nIn total, {total_ocr_count:,} images OCR'd & {total_skip_count:,} " + "images skipped.", + file=sys.stderr, + ) + + +def main(): + parser = argparse.ArgumentParser( + description="Uses Google Vision API to OCR images." + ) + + # Required arguments + parser.add_argument( + "input", + help="Top-level input directory containing images to be OCR'd", + type=pathlib.Path, + ) + parser.add_argument( + "output", + help="Top-level output directory for OCR output; " + + "maintains input subdirectory structure.", + type=pathlib.Path, + ) + + # Optional arguments + parser.add_argument( + "--progress", + help="Show progress", + action=argparse.BooleanOptionalAction, + default=True, + ) + parser.add_argument( + "--ocr-limit", + help="Set a limit for the number of images to be OCR'd", + type=int, + default=0, + ) + parser.add_argument( + "--ext", + help="Accepted file extension(s), case insensitive. Can be repeated. Defaults: .tif, .jpg", + nargs="*", + type=str, + action="extend", + ) + parser.add_argument( + "--vol", + help="Only OCR images from the specified PPA volume(s) represented as " + "volume ids. Can be repeated.", + nargs="*", + action="extend", + ) + + args = parser.parse_args() + # Workaround: Set default extensions if none are provided. + if args.ext is None: + args.ext = [".tif", ".jpg"] + + # Validate arguments + if not args.input.is_dir(): + print(f"Error: input directory {args.input} does not exist", file=sys.stderr) + sys.exit(1) + # TODO: Is this too restrictive / unnecessary? 
+ if not args.output.is_dir(): + print(f"Error: output directory {args.output} does not exist", file=sys.stderr) + sys.exit(1) + if args.ocr_limit < 0: + print("Error: ocr limit cannot be negative", file=sys.stderr) + sys.exit(1) + + if args.vol is None: + ocr_images( + args.input, + args.output, + args.ext, + ocr_limit=args.ocr_limit, + show_progress=args.progress, + ) + else: + ocr_volumes( + args.vol, + args.input, + args.output, + args.ext, + ocr_limit=args.ocr_limit, + show_progress=args.progress, + ) + + +if __name__ == "__main__": + main() diff --git a/src/corppa/poetry_detection/annotation/add_metadata.py b/src/corppa/poetry_detection/annotation/add_metadata.py new file mode 100644 index 0000000..a38c4ba --- /dev/null +++ b/src/corppa/poetry_detection/annotation/add_metadata.py @@ -0,0 +1,111 @@ +""" +This script is used to prep ppa corpus page jsonl data for use in +Prodigy annotation. It adds work metadata (title, author, year) in the +location that Prodigy requires for display, and allows adjusting image +paths for display from the Prodigy interface. It assumes the input page +corpus has already been annotated with image paths with an `image_path` +attribute, and adds an optional url prefix and converts any .TIF extensions +to .jpg. +""" + +# NOTE: this script is provisional and should likely be refactored +# or combined with other page data handling scripts + +import argparse +import csv +import pathlib +import sys +from typing import Iterator + +import orjsonl +from tqdm import tqdm + + +def combine_data( + jsonl_path: pathlib.Path, csv_path: pathlib.Path, disable_progress: bool = False +) -> Iterator[dict]: + # add work-level metadata to jsonl page data + + # load metadata from csv file + # - preserve title, author, and publication year + with csv_path.open() as csvfile: + csvreader = csv.DictReader(csvfile) + # create a lookup keyed on work id + metadata = { + row["work_id"]: { + "title": row["title"], + "author": row["author"], + "year": row["pub_year"], + } + for row in csvreader + } + + # use orjsonl to open and stream the json lines; + # wrap in tqdm for optional progress bar + progress_pages = tqdm( + orjsonl.stream(jsonl_path), + disable=disable_progress, + ) + + for page in progress_pages: + # add metadata dictionary for Prodigy + page["meta"] = metadata[page["work_id"]] + + yield page + + +def main(): + """Add work metadata to pages for display in Prodigy""" + # NOTE: if we decide to keep this script, we should add a named entry + # for it in the pyproject so it can be run when the package is installed + + parser = argparse.ArgumentParser( + description="Add PPA work-level metadata to pages for context in Prodigy", + ) + parser.add_argument( + "input", + help="Path to a PPA page-level corpus JSONL file (compressed or not)", + type=pathlib.Path, + ) + parser.add_argument( + "metadata", help="Path to PPA work-level metatada CSV file", type=pathlib.Path + ) + parser.add_argument( + "output", + help="Filename where the updated corpus should be saved", + type=pathlib.Path, + ) + parser.add_argument( + "--progress", + help="Show progress", + action=argparse.BooleanOptionalAction, + default=True, + ) + + args = parser.parse_args() + # progress bar is enabled by default; disable if requested + disable_progress = not args.progress + + # input file and metadata files should exist and not be empty + for input_file in [args.input, args.metadata]: + if not input_file.exists(): + print(f"Error: {input_file} does not exist") + sys.exit(-1) + elif args.input.stat().st_size == 0: + 
print(f"Error: {input_file} is zero size") + sys.exit(-1) + + # output file should not exist + if args.output.exists(): + print(f"Error: output file {args.output} already exists, not overwriting") + sys.exit(-1) + + # use orjsonl to stream updated pages to specified output file + orjsonl.save( + args.output, + combine_data(args.input, args.metadata, disable_progress=disable_progress), + ) + + +if __name__ == "__main__": + main() diff --git a/src/corppa/poetry_detection/annotation/create_pageset.py b/src/corppa/poetry_detection/annotation/create_pageset.py new file mode 100644 index 0000000..cdb1561 --- /dev/null +++ b/src/corppa/poetry_detection/annotation/create_pageset.py @@ -0,0 +1,143 @@ +""" +Create poetry page set. +Note that this is hard-coded for the poetry test-set + +env: ppa-data +""" + +import csv +import json +import os.path +import re +import sys + +import orjsonl +from helper import encode_htid, get_stub_dir +from tqdm import tqdm +from xopen import xopen + + +def extract_page_numbers(page_url_list): + pg_urls = page_url_list.split("\n") + pg_nums = {int(url.rsplit("=", 1)[1]) for url in pg_urls} + return pg_nums + + +def get_page_image_path(page_record): + source = page_record["source"] + vol_id = page_record["source_id"] + stub_dir = get_stub_dir(source, vol_id) + page_num = page_record["order"] + if source == "Gale": + vol_dir = f"Gale/{stub_dir}/{vol_id}" + image_name = f"{vol_id}_{page_num:04d}0.TIF" + return f"{vol_dir}/{image_name}" + elif source == "HathiTrust": + vol_id = encode_htid(vol_id) + ver_date = page_record["ver_date"] + vol_dir = f"HathiTrust/{stub_dir}/{vol_id}_{ver_date}" + image_name = f"{page_num:08d}.jpg" + return f"{vol_dir}/{image_name}" + else: + print(f"ERROR: Unknown source '{source}'") + raise ValueError + + +def get_ver_date(possible_timestamp): + pattern = re.compile(r"\d\d\d\d-\d\d-\d\d") + + result = pattern.match(possible_timestamp.strip()) + if result: + # Valid date identified + return result.group() + else: + # No date found + return "N/A" + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: [ppa-text corpus dir] [pageset csv] [out jsonl]") + sys.exit(1) + + ppa_dir = sys.argv[1] + pageset_csv = sys.argv[2] + out_jsonl = sys.argv[3] + + ppa_meta_json = f"{ppa_dir}/ppa_metadata.json" + ppa_jsonl = f"{ppa_dir}/ppa_pages.jsonl.gz" + + # Validate inputs + if not os.path.isfile(pageset_csv): + print(f"ERROR: {pageset_csv} does not exist") + sys.exit(1) + if not os.path.isdir(ppa_dir): + print(f"ERROR: {ppa_dir} does not exist") + sys.exit(1) + if not os.path.isfile(ppa_meta_json): + print("ERROR: PPA metadata file (ppa_metadata.json) does not exist") + sys.exit(1) + if not os.path.isfile(ppa_jsonl): + print("ERROR: PPA pages file (ppa_pages.json.gz) does not exist") + sys.exit(1) + if os.path.isfile(out_jsonl): + print(f"ERROR: {out_jsonl} already exists") + sys.exit(1) + + # Load ppa metadata + works_meta = {} + with open(ppa_meta_json) as file_handler: + for work in json.load(file_handler): + works_meta[work["work_id"]] = work + + # Load testset data + working_set = {} + with open(pageset_csv, newline="") as file_handler: + reader = csv.DictReader(file_handler) + pg_rng_id = "digital page span of main text as determined by Mary" + has_poetry_id = "links to pages with poetry (non-comprehensive)" + for row in reader: + work_id = row["ID"] + work_record = works_meta[work_id] + pg_start, pg_end = map(int, row[pg_rng_id].split("-")) + entry = { + "work_id": work_id, + "source": work_record["source"], + "source_id": 
work_record["source_id"], + "source_url": work_record["source_url"], + "pub_year": work_record["pub_year"], + "pg_start": pg_start, + "pg_end": pg_end, + "poetry_pages": extract_page_numbers(row[has_poetry_id]), + "ver_date": get_ver_date(row["version_date"]), + } + working_set[work_id] = entry + + # Gather pages + n_lines = sum(1 for line in xopen(ppa_jsonl, mode="rb")) + for page in tqdm(orjsonl.stream(ppa_jsonl), total=n_lines): + work_id = page["work_id"] + if work_id in working_set: + work = working_set[work_id] + page_num = page["order"] + assert page_num == int(page["id"].rsplit(".", 1)[1]) + # Filter to working range of volume + if page_num >= work["pg_start"] and page_num <= work["pg_end"]: + # Add some additional metdata + page["source"] = work["source"] + page["source_id"] = work["source_id"] + page["pub_year"] = work["pub_year"] + page["ver_date"] = work["ver_date"] + + # Check if this page is known to contain poetry + contains_poetry = "?" + if page_num in work["poetry_pages"]: + contains_poetry = "Yes" + page["contains_poetry"] = contains_poetry + + # Get image path (if possible) + image_path = get_page_image_path(page) + page["image_path"] = image_path + + # Write page data to file + orjsonl.append(out_jsonl, page) diff --git a/src/corppa/poetry_detection/annotation/prodigy.css b/src/corppa/poetry_detection/annotation/prodigy.css new file mode 100644 index 0000000..fb10c34 --- /dev/null +++ b/src/corppa/poetry_detection/annotation/prodigy.css @@ -0,0 +1,43 @@ +.prodigy-container { + display: grid; + grid-template-areas: + "meta meta" + "header right" + "left right"; + /* 30px for metadata, 120px for title header on left */ + grid-template-rows: 30px 120px auto; + grid-template-columns: 1fr 1fr; /* share space evenly */ + column-gap: 5px; + align-items: start; + max-width: none; +} + +/* labels for first block (image) are in their own div */ +.prodigy-title-wrapper { + grid-area: header; +} +/* display metadata at top */ +.prodigy-meta { + grid-area: meta; + margin: 0 auto; +} + +/* image annotation */ +.prodigy-content { + grid-area: left; + align-self: start; + position: sticky; /* keep sticky when text scrolls */ + top: 0; +} + +/* second block (span annotation) doesn't have an obvious class to key on */ +.prodigy-container > div[class^=_a]:not(.prodigy-meta):not(.prodigy-content) { + grid-area: right; +} + +/* default token spacing is huge; override while still leaving enough space to select */ +/* FIXME: this breaks highlighting; need to figure out how to adjust both or set in Prodigy config +span[class^=_Token] { + height: 30px !important; +} */ + diff --git a/src/corppa/poetry_detection/annotation/recipe.py b/src/corppa/poetry_detection/annotation/recipe.py new file mode 100644 index 0000000..abd35a9 --- /dev/null +++ b/src/corppa/poetry_detection/annotation/recipe.py @@ -0,0 +1,169 @@ +""" +This module provides custom recipes for Prodigy annotation. They were +created with page-level text annotation in mind, and support annotating +text with a reference image displayed beside the text (`annotate_page_text`), +or annotating both text and image side by side (`annotate_text_and_image`). + +Referenced images must be served out independently for display; the image url +prefix for images should be specified when initializing the recipe. 
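+Any static file server can be used for local testing; for example, running
+`python -m http.server 8000` from the top-level image directory would make
+images available at the `http://localhost:8000/` prefix used in the examples below.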
+ +Example use: +``` +prodigy annotate_page_text poetry_spans poetry_pages.jsonl --label POETRY,PROSODY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/ +prodigy annotate_text_and_image poetry_text_image poetry_pages.jsonl --label POETRY -F ../corppa/poetry_detection/annotation/recipe.py --image-prefix http://localhost:8000/ +""" + +from pathlib import Path + +import spacy +from prodigy.components.loaders import JSONL +from prodigy.core import Arg, recipe + +#: reference to current directory, for use as Prodigy CSS directory +CURRENT_DIR = Path(__file__).parent.absolute() + +#: common prodigy configurations for both recipes; copy and add blocks and labels +PRODIGY_COMMON_CONFIG = { + "buttons": ["accept", "reject", "undo"], # remove ignore button + "show_flag": True, # show flag button to mark weird/difficult examples + "hide_newlines": False, # ensure newlines are shown \n + "allow_newline_highlight": True, # allow highlighting of newlines \n + "honor_token_whitespace": True, # reflect whitespace accurately (e.g. in case of leading/trailing spaces) + "custom_theme": { + "labels": { + # trying to use options from PPA webapp color scheme, + # but may not be so great in Prodigy UI. + # azure #0788fc seafoam blue #57c4c4 wisteria #9c93c0 pig pink #ed949c + "POETRY": "#57c4c4", # label color for POETRY + }, + "hide_true_newline_tokens": False, + }, + "global_css_dir": CURRENT_DIR, +} + + +def tokenize_stream(stream, image_prefix=None): + """Takes a stream of Prodigy tasks and tokenizes text for span annotation, + and optionally adds an image prefix URL to any image paths present. + Stream is expected to contain `text` and may contain image_path` and a `meta` + dictionary. Returns a generator of the stream. + """ + + nlp = spacy.blank("en") # use blank spaCy model for tokenization + + # ensure image prefix URL does not have a trailing slash + if image_prefix is None: + image_prefix = "" + image_prefix = image_prefix.rstrip("/") + + for task in stream: + if task.get("text"): + doc = nlp(task["text"]) + task["tokens"] = [ + { + "text": token.text, + "start": token.idx, + "end": token.idx + len(token.text), + "id": i, + } + for i, token in enumerate(doc) + ] + # add image prefix URL for serving out images + if "image_path" in task: + task["image"] = f"{image_prefix}/{task['image_path']}" + yield task + + +@recipe( + "annotate_text_and_image", + dataset=Arg(help="path to input dataset"), + labels=Arg("--label", "-l", help="Comma-separated label(s)"), + image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"), +) +def annotate_text_and_image( + dataset: str, source: str, labels: str, image_prefix: str = None +): + """Annotate text and image side by side: allows adding manual spans + to both image and text. Intended for page-level annotation. 
+ """ + + stream = JSONL(source) # load jsonlines into stream + # tokenize for span annotation and add image prefix + tokenized_stream = tokenize_stream(stream, image_prefix) + + # split labels by commas and strip any whitespace + label_list = [label.strip() for label in labels.split(",")] + + blocks = [ + { + "view_id": "image_manual", + "labels": label_list, + }, + {"view_id": "spans_manual", "labels": label_list}, + ] + + # copy the common config options and add blocks and labels + config = PRODIGY_COMMON_CONFIG.copy() + config.update( + { + "blocks": blocks, + "labels": label_list, + "ner_manual_highlight_chars": True, + "image_manual_spans_key": "image_spans", + # limit image selection to rectangle only, no polygon or freehand + "image_manual_modes": ["rect"], + } + ) + + return { + "dataset": dataset, + "stream": tokenized_stream, + "view_id": "blocks", + "config": config, + } + + +@recipe( + "annotate_page_text", + dataset=Arg(help="path to input dataset"), + labels=Arg("--label", "-l", help="Comma-separated label(s)"), + image_prefix=Arg("--image-prefix", "-i", help="Base URL for images"), +) +def annotate_page_text( + dataset: str, source: str, labels: str, image_prefix: str = None +): + """Annotate text with manual spans; displays an image side by side + with text for reference only (image cannot be annotated). + Intended for page-level annotation. + """ + + stream = JSONL(source) # load jsonlines into stream + # tokenize for span annotation and add image prefix + tokenized_stream = tokenize_stream(stream, image_prefix) + + # split labels by commas and strip any whitespace + label_list = [label.strip() for label in labels.split(",")] + + blocks = [ + { + "view_id": "html", + "html_template": "", + }, + {"view_id": "spans_manual", "labels": label_list}, + ] + # copy the common config options and add blocks and labels + config = PRODIGY_COMMON_CONFIG.copy() + config.update( + { + "blocks": blocks, + "labels": label_list, + "ner_manual_highlight_chars": True, + } + ) + + return { + "dataset": dataset, + "stream": tokenized_stream, + "view_id": "blocks", + "config": config, + } diff --git a/src/corppa/utils/add_image_relpaths.py b/src/corppa/utils/add_image_relpaths.py new file mode 100644 index 0000000..5329717 --- /dev/null +++ b/src/corppa/utils/add_image_relpaths.py @@ -0,0 +1,88 @@ +import argparse +import sys +from pathlib import Path + +import orjsonl +from path_utils import get_image_relpath +from tqdm import tqdm + + +def add_image_paths(in_jsonl, ext=None, show_progress=True): + progress_bar = tqdm( + orjsonl.stream(in_jsonl), + desc="Adding page paths", + bar_format="{desc}: processed {n:,} pages{postfix} | elapsed: {elapsed}", + disable=not show_progress, + ) + for page in progress_bar: + work_id = page["work_id"] + page_num = page["order"] + image_relpath = get_image_relpath(work_id, page_num) + if ext is not None: + image_relpath = image_relpath.with_suffix(ext) + # Add relative path to record + page["image_path"] = str(image_relpath) + yield page + + +def save_corpus_with_image_relpaths(in_jsonl, out_jsonl, ext=None, show_progress=True): + orjsonl.save(out_jsonl, add_image_paths(in_jsonl, ext=ext, show_progress=True)) + + +def main(): + parser = argparse.ArgumentParser( + description="Add image (relative) paths to PPA full-text corpus", + ) + # Required arguments + parser.add_argument( + "input", + help="PPA full-text corpus to add page-level image paths to; " + "must be a JSONL file (compresed or not)", + type=Path, + ) + parser.add_argument( + "output", + 
help="Filename where output corpus should be saved", + type=Path, + ) + # Optional argument + parser.add_argument( + "--ext", + help="Extension to use for all image paths instead of the source-level defaults", + ) + parser.add_argument( + "--progress", + help="Show progress", + action=argparse.BooleanOptionalAction, + default=True, + ) + + args = parser.parse_args() + + # If output filename does not have an extension, add jsonl + out_jsonl = args.output + if out_jsonl.suffix == "": + out_jsonl = out_jsonl.with_suffix(".jsonl") + + # Validate arguments + if not args.input.is_file(): + print(f"Input {args.input} does not exist", file=sys.stderr) + sys.exit(1) + if out_jsonl.is_file(): + print(f"Output {args.output} already exist", file=sys.stderr) + sys.exit(1) + if args.ext: + if args.ext[0] != ".": + print(f"Extension must start with '.'", file=sys.stderr) + sys.exit(1) + + save_corpus_with_image_relpaths( + args.input, + out_jsonl, + ext=args.ext, + show_progress=args.progress, + ) + + +if __name__ == "__main__": + main() diff --git a/src/corppa/utils/filter.py b/src/corppa/utils/filter.py index 70c0d03..d204690 100644 --- a/src/corppa/utils/filter.py +++ b/src/corppa/utils/filter.py @@ -1,31 +1,38 @@ """ Utility for filtering PPA full-text corpus to work with a subset of -pages. Currently supports filtering by a list of PPA source ids. +pages. -.. Note:: - Currently, there is no way to filter to a specific excerpt when - there are multiple excerpts from a single source. +Currently supports the following types of filtering: + * List of PPA work ids (as a text file, id-per-line) + * CSV file specifying work pages (by digital page number) (csv, page-per-line) + * Filtering by key-value pair for either inclusion or exclusion -Filter methods can be run via command-line or python code. Takes jsonl file -(compressed or not) as input, a filename for output, and a file with a list of -selected source ids. +These filtering options can be combined, generally as a logical AND. Pages filtered +by work ids or page numbers will be further filtered by the key-value logic. In cases +where both work- and page-level filtering occurs, works not specified in the page +filtering are included in full. Works that are specified in both will be limited to the +pages specified in page-level filtering. -To use as a command-line script, pass corpus as input, desired output filename, -and filename with the list of source ids: +Filter methods can be run via command-line or python code. Filtering takes a jsonl file +(compressed or not) as input, and will produce a jsonl file (compressed or not) as output. +The input and output filenames can use any extension supported by any extension supported +by :mod:`orjsonl`, with or without compression; e.g. `.jsonl`, `.jsonl.gz`, `.jsonl.bz2`, etc. +Example command line usages: ``` -corppa-filter-corpus path/to/ppa_pages.jsonl my_ids.txt output/ppa_subset_pages.jsonl +corppa-filter path/to/ppa_pages.jsonl output/ppa_subset_pages.jsonl --idfile my_ids.txt ``` -Input format and output filename can use any extension supported by :mod:`orjsonl`, -with or without compression; e.g. `.jsonl`, `.jsonl.gz`, `.jsonl.bz2`, etc. 
- +``` +corppa-filter path/to/ppa_pages.jsonl output/ppa_subset_pages.jsonl --pg-file pages.csv --include key=value +``` """ import argparse -import os.path -from typing import Iterator +import csv +import pathlib import sys +from typing import Iterator import orjsonl from orjson import JSONDecodeError @@ -33,21 +40,47 @@ def filter_pages( - input_filename: str, source_ids: list[str], disable_progress: bool = False + input_filename: pathlib.Path, + work_ids: list[str] | None = None, + work_pages: dict | None = None, + include_filter: dict | None = None, + exclude_filter: dict | None = None, + disable_progress: bool = False, ) -> Iterator[dict]: """Takes a filename for a PPA full-text corpus in a format orjsonl supports - and a list of source ids. Returns a generator of filtered pages from the - full corpus corresponding to the list of ids. Displays progress - with :mod:`tqdm` progress bar unless disabled. + and one or more options for filtering that corpus. Returns a generator of + filtered pages from the full corpus corresponding to the list of ids. + At least one filtering option must be specified. + Displays progress with :mod:`tqdm` progress bar unless disabled. - :param input_filename: str, filename for corpus input - :param source_ids: list of str, source ids to include in filtered pages + :param input_filename: pathlib.Path, filename for corpus input + :param work_ids: list of str, work ids to include in filtered pages (optional) + :param work_pages: dict of str-set[int] pairs, specifies the set of digital page + numbers of a work (by work id) to be filtered to be filtered to (optional) + :param include_filter: dict of key-value pairs for pages to include in + the filtered page set; equality check against page data attributes (optional) + :param exclude_filter: dict of key-value pairs for pages to exclude from + the filtered page set; equality check against page data attributes (optional) :param disable_progress: boolean, disable progress bar (optional, default: False) :returns: generator of dict with page data :raises: FileNotFoundError, orjson.JSONDecodeError """ - # convert list of source ids to set for fast hashmap lookup - source_ids = set(source_ids) + # at least one filter is required + if not any([work_ids, work_pages, include_filter, exclude_filter]): + raise ValueError( + "At least one filter must be specified (work_ids, work_pages, include_filter, exclude_filter)" + ) + + if work_ids is not None: + # convert list of work ids to set for fast hashmap lookup + work_ids = set(work_ids) + # if work pages is provided, update work ids set + if work_pages is not None: + if work_ids is None: + work_ids = set(work_pages) + else: + work_ids |= set(work_pages) + selected_pages = 0 progress_pages = tqdm( orjsonl.stream(input_filename), @@ -56,62 +89,152 @@ def filter_pages( disable=disable_progress, ) for page in progress_pages: - # page data does not include source id, but does include work id - # which is either source id (for full works) or - # source id plus first page number (for articles/excerpts) - if page["work_id"].split("-p")[0] in source_ids: - # keep track of how many have been selected for reporting in - # progress bar - selected_pages += 1 - progress_pages.set_postfix_str(f"selected {selected_pages:,}") - yield page + # if work ids is specified and id does not match, skip + if work_ids: + if page["work_id"] not in work_ids: + continue + + # if work pages is specified, filter + if work_pages: + # if work id is in indexed, skip pages not include in its set + # NOTE: works 
specified in the work ids filter but not the work_pages + # filter will be included. + if page["work_id"] in work_pages: + if page["order"] not in work_pages[page["work_id"]]: + continue - # NOTE: other filters could be implemented here later, e.g. - # based on HathiTrust page tags like UNTYPICAL_PAGE or text content + # if key-value pairs for inclusion are specified, filter + if include_filter: + # multiple include filters use OR logic: + # if include filter does not apply, skip this page + if not any(page[key] == val for key, val in include_filter.items()): + continue + + # if key-value pairs for exclusion are specified, filter + if exclude_filter: + # if exclude filter matches, skip this page + if any(page[key] == val for key, val in exclude_filter.items()): + continue + + # keep track of how many have been selected for reporting in + # progress bar + selected_pages += 1 + progress_pages.set_postfix_str(f"selected {selected_pages:,}") + yield page def save_filtered_corpus( - input_filename: str, - idfile: str, - output_filename: str, + input_filename: pathlib.Path, + output_filename: pathlib.Path, + idfile: pathlib.Path | None = None, + pgfile: pathlib.Path | None = None, + include_filter: dict | None = None, + exclude_filter: dict | None = None, disable_progress: bool = False, ) -> None: """Takes a filename for input PPA full-text corpus in a format orjsonl supports, filename where filtered corpus should be saved, - and a filename with a list of source ids, one id per line. + and a filename with a list of work ids, one id per line. + At least one filter must be specified. Calls :meth:`filter_pages`. - :param input_filename: str, filename for corpus input - :param idfile: str, filename for list of source ids - :param output_filename: str, filename for filtered corpus output + :param input_filename: pathlib.Path, filepath for corpus input + :param output_filename: pathlib.Path, filepath for filtered corpus output + :param idfile: pathlib.Path, filepath for list of work ids (optional) + :param pgfile: pathlib.Path, filepath for list of pages (optional) + :param include_filter: dict of key-value pairs for pages to include in + the filtered page set; equality check against page data attributes (optional) + :param exclude_filter: dict of key-value pairs for pages to exclude from + the filtered page set; equality check against page data attributes (optional) :param disable_progress: boolean, disable progress bar (optional, default: False) """ - # read the id file and generate a list of ids - with open(idfile) as idfile_content: - source_ids = [line.strip() for line in idfile_content] + + work_ids = None + work_pages = None + + # at least one filter is required + if not any([idfile, pgfile, include_filter, exclude_filter]): + raise ValueError( + "At least one filter must be specified (idfile, pgfile, include_filter, exclude_filter)" + ) + + # if an id file is specified, read and generate a list of ids to include + if idfile: + with open(idfile) as idfile_content: + work_ids = [line.strip() for line in idfile_content] + + # if a page file is specified, build page index (work id -> page set) from file + if pgfile: + work_pages = {} + with open(pgfile, newline="") as csv_file: + reader = csv.DictReader(csv_file) + # Check header + if ( + "work_id" not in reader.fieldnames + or "page_num" not in reader.fieldnames + ): + raise ValueError( + f'pgfile {pgfile} must include fields "work_id" and "page_num"' + ) + + for row in reader: + if row["work_id"] not in work_pages: + work_pages[row["work_id"]] = 
set() + work_pages[row["work_id"]].add(int(row["page_num"])) # use orjsonl to stream filtered pages to specified output file orjsonl.save( output_filename, - filter_pages(input_filename, source_ids, disable_progress=disable_progress), + filter_pages( + input_filename, + work_ids=work_ids, + work_pages=work_pages, + include_filter=include_filter, + exclude_filter=exclude_filter, + disable_progress=disable_progress, + ), ) +class MergeKeyValuePairs(argparse.Action): + """ + custom argparse action to split a KEY=VALUE argument and append the pairs to a dictionary. + """ + + # adapted from https://stackoverflow.com/a/77148515/9706217 + + # NOTE: in future, we may want an option to store multiple values for + # the same key, perhaps using multidict or dict of key -> set(possible values) + + def __call__(self, parser, args, values, option_string=None): + previous = getattr(args, self.dest, None) or dict() + try: + added = dict(map(lambda x: x.split("="), values)) + except ValueError: + raise argparse.ArgumentError( + self, f'Could not parse argument "{values}" as k1=v1 k2=v2 ... format' + ) + merged = {**previous, **added} + setattr(args, self.dest, merged) + + def main(): """Command-line access to filtering the corpus. Available as - `corppa-filter-corpus` when this package is installed with pip.""" + `corppa-filter` when this package is installed with pip.""" parser = argparse.ArgumentParser( - description="Filters PPA full-text corpus by list of source ids", + description="Filters PPA full-text corpus", ) parser.add_argument( "input", help="PPA full-text corpus to be " + "filtered; must be a JSONL file (compressed or not)", + type=pathlib.Path, ) - parser.add_argument("idfile", help="filename with list of source ids, one per line") parser.add_argument( - "output", help="filename where the filtered corpus should be saved" + "output", + help="filename where the filtered corpus should be saved", + type=pathlib.Path, ) parser.add_argument( "--progress", @@ -119,38 +242,113 @@ def main(): action=argparse.BooleanOptionalAction, default=True, ) + parser.add_argument( + "--cleanup", + help="Remove empty output file if no pages are relected", + action=argparse.BooleanOptionalAction, + default=True, + ) + filter_args = parser.add_argument_group( + "filters", + "Options for filtering pages. MUST include at least one. " + + "When multiple filters are specified, they are all combined (AND). " + + "If multiple include/exclude filters are specified, a page is " + + "included/excluded if ANY key=value pairs match.", + ) + filter_args.add_argument( + "-i", + "--idfile", + help="File containing a list of work ids (one per line) to filter to", + type=pathlib.Path, + required=False, + ) + filter_args.add_argument( + "--pgfile", + help="CSV file containing the list of pages to filter to. File must have a header " + + 'with fields named "work_id" and "page_num".', + type=pathlib.Path, + ) + filter_args.add_argument( + "--include", + nargs="*", + action=MergeKeyValuePairs, + metavar="KEY=VALUE", + help='Include pages by attribute: add key-value pairs as key=value or key="another value". ' + + "(no spaces around =, use quotes for values with spaces)", + ) + filter_args.add_argument( + "--exclude", + nargs="*", + action=MergeKeyValuePairs, + metavar="KEY=VALUE", + help='Exclude pages by attribute: add key-value pairs as key=value or key="another value". 
' + + "(no spaces around =, use quotes for values with spaces)", + ) args = parser.parse_args() # progress bar is enabled by default; disable if requested disable_progress = not args.progress - if not os.path.exists(args.idfile): - print(f"Error: idfile {args.idfile} does not exist") - sys.exit(-1) - elif os.path.getsize(args.idfile) == 0: - print(f"Error: idfile {args.idfile} is zero size") - sys.exit(-1) + # at least one filter must be specified + # check that one of idfile, include, or exclude is specified + if not any([args.idfile, args.pgfile, args.include, args.exclude]): + parser.error("At least one filter option must be specified") + + if args.idfile: + if not args.idfile.is_file(): + print(f"Error: idfile {args.idfile} does not exist", file=sys.stderr) + sys.exit(1) + elif args.idfile.stat().st_size == 0: + print(f"Error: idfile {args.idfile} is zero size", file=sys.stderr) + sys.exit(1) + + if args.pgfile: + if not args.pgfile.is_file(): + print(f"Error: pgfile {args.pgfile} does not exist", file=sys.stderr) + sys.exit(1) + elif args.pgfile.stat().st_size == 0: + print(f"Error: pgfile {args.pgfile} is zero size", file=sys.stderr) + sys.exit(1) # if requested output filename has no extension, add jsonl - output_filename = args.output - if os.path.splitext(output_filename)[1] == "": - output_filename = f"{output_filename}.jsonl" + output_filepath = args.output + if output_filepath.suffix == "": + output_filepath = output_filepath.with_suffix(".jsonl") - if os.path.exists(output_filename): + if output_filepath.is_file(): print( - f"Error: requested output file {args.output} already exists; not overwriting" + f"Error: requested output file {args.output} already exists; not overwriting", + file=sys.stderr, ) - sys.exit(-1) + sys.exit(1) try: save_filtered_corpus( - args.input, args.idfile, output_filename, disable_progress=disable_progress + args.input, + output_filepath, + idfile=args.idfile, + pgfile=args.pgfile, + include_filter=args.include, + exclude_filter=args.exclude, + disable_progress=disable_progress, ) except (FileNotFoundError, JSONDecodeError) as err: # catch known possible errors and display briefly # with the type of error and the brief message - print(f"{err.__class__.__name__}: {err}") - sys.exit(-1) + print(f"{err.__class__.__name__}: {err}", file=sys.stderr) + sys.exit(1) + + # check if output file exists but is zero size (i.e., no pages selected) + if output_filepath.is_file() and output_filepath.stat().st_size == 0: + # if cleanup is disabled, remove and report + if args.cleanup: + output_filepath.unlink() + print( + f"No pages were selected, removing empty output file {output_filepath}" + ) + # otherwise just report + else: + print(f"No pages were selected, output file {output_filepath} is empty") if __name__ == "__main__": diff --git a/src/corppa/utils/generate_page_set.py b/src/corppa/utils/generate_page_set.py new file mode 100644 index 0000000..fdc1147 --- /dev/null +++ b/src/corppa/utils/generate_page_set.py @@ -0,0 +1,130 @@ +""" +Utility for generating a PPA page set. + +This method takes three inputs: (1) an input csv, (2) an output csv, and +(3) the size of the page set. 
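+
+A sketch of the expected invocation, assuming the package is installed
+(filenames and the set size are illustrative):
+    python -m corppa.utils.generate_page_set input.csv output.csv 250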
+ +The input CSV file must have the following fields: + * work_id: PPA work id + * page_start: Starting index for page range being considered for this work + * page_end: Ending index for page range being considered for this work + * poery_pages: Comma separated list of page numbers containing poetry + +The pages are selected as follows: + * First, all pages with poetry are selected + * Then, all remaining pages are chosen randomly (proportionately by work) + +The resulting output CSV file has the following fields: + * work_id: PPA work id + * page_num: Digital page number +""" + +import argparse +import csv +import random +import sys +from pathlib import Path + + +def get_pages(in_csv, k): + """ + Using the input CSV, generate a page set with k pages such that all pages + with poetry are included and the remaining pages are selected randomly from + the remaining pages under consideration. + + Notes: + * In some cases the generated page set may not match k. + * This is not compatible with PPA works with non-sequential page ranges. + """ + # Load page data + page_pool = {} + poetry_pages = {} + page_counter = 0 + with open(in_csv, newline="") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + work_id = row["work_id"] + if work_id not in page_pool: + page_pool[work_id] = {} + start_idx = int(row["page_start"]) + end_idx = int(row["page_end"]) + 1 + # Gather full page range + for i in range(start_idx, end_idx): + page_pool[work_id][i] = {"work_id": work_id, "page_num": i} + # Yield pages with poetry + for pg_id in row["poetry_pages"].split(","): + yield page_pool[work_id].pop(int(pg_id)) + page_counter += 1 + + # Print warning if more poetry pages than k + if page_counter >= k: + print( + f"Warning: Too many pages with poetry (k = {k} <= {page_counter})", + file=sys.stderr, + ) + + # Select remaining pages randomly + # TODO: Revisit to simply page selcection logic + while page_counter < k: + # Select work + work_id = random.choice(list(page_pool.keys())) + # Select page + try: + pg_id = random.choice(list(page_pool[work_id].keys())) + except IndexError: + # Encountered empty list, remove work entry and continue + del page_pool[work_id] + continue + yield page_pool[work_id].pop(pg_id) + page_counter += 1 + + # Print warning if less than k pages found + if page_counter < k: + print(f"Warning: Less than k pages found", file=sys.stderr) + + +def save_page_set(in_csv, out_csv, k): + """ + Save a page set of size k constructed based on the input csv + """ + with open(out_csv, mode="w", newline="") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=["work_id", "page_num"]) + writer.writeheader() + for page in get_pages(in_csv, k): + writer.writerow(page) + + +def main(): + parser = argparse.ArgumentParser( + description="Generate a page set suitable for filtering PPA" + ) + # Required Arguments + parser.add_argument( + "input", + help="Input CSV", + type=Path, + ) + parser.add_argument( + "output", + help="Output CSV", + type=Path, + ) + parser.add_argument("k", help="Number of pages in set", type=int) + + args = parser.parse_args() + + if not args.input.is_file(): + print(f"Error: input {args.input} does not exist", file=sys.stderr) + sys.exit(1) + if args.output.is_file(): + print(f"Error: output {args.output} already exists", file=sys.stderr) + sys.exit(1) + if args.k <= 0: + print(f"Error: k must be positive", file=sys.stderr) + sys.exit(1) + + save_page_set(args.input, args.output, args.k) + + +if __name__ == "__main__": + main() diff --git 
a/src/corppa/utils/path_utils.py b/src/corppa/utils/path_utils.py new file mode 100644 index 0000000..98a4b4e --- /dev/null +++ b/src/corppa/utils/path_utils.py @@ -0,0 +1,116 @@ +""" +Library of general-purpose auxiliary methods for stand-alone scripts +""" + +import pathlib + +_htid_encode_map = {":": "+", "/": "=", ".": ","} +_htid_encode_table = str.maketrans(_htid_encode_map) +_htid_decode_map = {v: k for k, v in _htid_encode_map.items()} +_htid_decode_table = str.maketrans(_htid_decode_map) + + +def encode_htid(htid): + """ + Returns the "clean" version of a HathiTrust volume identifier with the form: + [library id].[volume id] + Specifically, the volume-portion of the id undergoes the following + character replacement: ":" --> "+", "/" --> "=", "." --> "," + """ + if "." not in htid: + raise ValueError(f"Invalid htid '{htid}'") + lib_id, vol_id = htid.split(".", 1) + vol_id = vol_id.translate(_htid_encode_table) + return f"{lib_id}.{vol_id}" + + +def decode_htid(encoded_htid): + """ + Return original HathiTrust volume identifier from encoded version: + [library id].[encoded volume id] + Specifically, the volume-portion of the id undergoes the following + character replacement: "+" --> ":", "=" --> "/", "," --> "." + """ + if "." not in encoded_htid: + raise ValueError(f"Invalid encoded htid '{encoded_htid}'") + lib_id, vol_id = encoded_htid.split(".", 1) + vol_id = vol_id.translate(_htid_decode_table) + return f"{lib_id}.{vol_id}" + + +def get_ppa_source(vol_id): + """ + For a given volume id, return the corresponding source. + Assume: + * Gale volume ids begin with "CW0" or "CBO" + * Hathitrust volume ids contain a "." + """ + # Note that this is fairly brittle. + if vol_id.startswith("CW0") or vol_id.startswith("CB0"): + return "Gale" + elif "." in vol_id: + return "HathiTrust" + else: + raise ValueError(f"Can't identify source for volume '{vol_id}'") + + +def get_stub_dir(source, vol_id): + """ + Returns the stub directory name for the specified volume (vol_id) and + source type (source) + + For Gale, every third number (excluding the leading 0) of the volume + identifier is used. + Ex. CB0127060085 --> 100 + + For HathiTrust, the library portion of the volume identifier is used. + Ex. mdp.39015003633594 --> mdp + """ + if source == "Gale": + return vol_id[::3][1:] + elif source == "HathiTrust": + return vol_id.split(".", maxsplit=1)[0] + else: + raise ValueError(f"Unknown source '{source}'") + + +def get_vol_dir(vol_id): + """ + Returns the volume directory (pathlib.Path) for the specified volume (vol_id) + """ + source = get_ppa_source(vol_id) + if source == "Gale": + return pathlib.Path(source, get_stub_dir(source, vol_id), vol_id) + elif source == "HathiTrust": + # TODO: This does not match tigerdata + # return pathlib.Path(source, get_stub_dir(source, vol_id), encode_htid(vol_id)) + raise NotImplementedError(f"{source} volume directory conventions TBD") + else: + raise ValueError(f"Unknown source '{source}'") + + +def get_volume_id(work_id): + """ + Extract volume id from PPA work id + + * For full works, volume ids and work ids are the same. + * For excerpts, the work id is composed of the prefix followed by "-p" and + the starting page of the excerpt. 
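+
+    Examples (mirroring the unit tests in test_path_utils.py):
+        * get_volume_id("CW0102294490-pxvi") -> "CW0102294490"
+        * get_volume_id("dul1.ark:/13960/t5w67998k") -> "dul1.ark:/13960/t5w67998k"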
+ """ + return work_id.rsplit("-p", 1)[0] + + +def get_image_relpath(work_id, page_num): + """ + Get the (relative) image path for specified PPA work page + """ + vol_id = get_volume_id(work_id) + vol_dir = get_vol_dir(vol_id) + source = get_ppa_source(vol_id) + if source == "Gale": + image_name = f"{vol_id}_{page_num:04d}0.TIF" + return vol_dir.joinpath(image_name) + elif source == "HathiTrust": + raise NotImplementedError + else: + raise ValueError(f"Unsupported source '{source}'") diff --git a/test/test_ocr/test_gvision_ocr.py b/test/test_ocr/test_gvision_ocr.py new file mode 100644 index 0000000..b590d90 --- /dev/null +++ b/test/test_ocr/test_gvision_ocr.py @@ -0,0 +1,168 @@ +from pathlib import Path +from types import GeneratorType +from unittest.mock import call, patch + +import pytest + +from corppa.ocr.gvision_ocr import ( + image_relpath_generator, + ocr_image_via_gvision, + ocr_images, +) + + +def test_image_relpath_generator(tmp_path): + jpg_a = Path("a.jpg") + tmp_path.joinpath(jpg_a).touch() + txt_b = Path("b.txt") + tmp_path.joinpath(txt_b).touch() + + # I. Single ext + paths = image_relpath_generator(tmp_path, [".jpg"]) + assert isinstance(paths, GeneratorType) + assert [jpg_a] == list(paths) + + # II. Multiple exts + tif_c = Path("c.tif") + tmp_path.joinpath(tif_c).touch() + paths = list(image_relpath_generator(tmp_path, [".jpg", ".tif"])) + assert {jpg_a, tif_c} == set(paths) + + # III. Extension handling is case insensitive + jpg_d = Path("d.JPG") + tmp_path.joinpath(jpg_d).touch() + paths_a = list(image_relpath_generator(tmp_path, [".jpg"])) + paths_b = list(image_relpath_generator(tmp_path, [".JPG"])) + assert set(paths_a) == set(paths_b) + assert {jpg_a, jpg_d} == set(paths_a) + + +def test_image_relpath_generator_nested(tmp_path): + img_dir = tmp_path.joinpath(tmp_path, "images") + img_dir.mkdir() + jpg_a = Path("a.jpg") + tmp_path.joinpath(jpg_a).touch() + jpg_b = Path("b.jpg") + img_dir.joinpath(jpg_b).touch() + + paths = image_relpath_generator(img_dir, [".jpg"]) + assert {jpg_b} == set(paths) + + paths = image_relpath_generator(tmp_path, [".jpg"]) + assert {jpg_a, Path("images", "b.jpg")} == set(paths) + + +def test_image_relpath_hidden_dirs(tmp_path): + jpg_a = Path("a.jpg") + tmp_path.joinpath(jpg_a).touch() + hidden_dir = tmp_path.joinpath(".hidden") + hidden_dir.mkdir() + jpg_b = Path("b.jpg") + hidden_dir.joinpath(jpg_b).touch() + + paths = list(image_relpath_generator(tmp_path, [".jpg"])) + assert [jpg_a] == paths + + +def test_image_relpath_generator_symbolic_links(tmp_path): + """ + Test directory sturcture: + dir_a: + a.jpg + b.jpg (symbolic link, file: dir_b/b.jpg) + dir_c (symbolic link, dir: dir_b/dir_c) + dir_b: + b.jpg + dir_c: + c.jpg + """ + # Create directories + dir_a = tmp_path.joinpath("dir_a") + dir_a.mkdir() + dir_b = tmp_path.joinpath("dir_b") + dir_b.mkdir() + dir_c = dir_b.joinpath("dir_c") + dir_c.mkdir() + # Create files + jpg_a = Path("a.jpg") + dir_a.joinpath(jpg_a).touch() + jpg_b = Path("b.jpg") + dir_b.joinpath(jpg_b).touch() + jpg_c = Path("c.jpg") + dir_c.joinpath(jpg_c).touch() + # Create symbolic links + sym_b = dir_a.joinpath("b.jpg") + sym_b.symlink_to(jpg_b) + sym_c = dir_a.joinpath("c") + sym_c.symlink_to(dir_c, target_is_directory=True) + + # Default follows symbolic links + paths = list(image_relpath_generator(dir_a, [".jpg"])) + assert {jpg_a, jpg_b, Path("c", "c.jpg")} == set(paths) + + # Do not follow symbolic links + paths = list(image_relpath_generator(dir_a, [".jpg"], follow_symlinks=False)) + assert {jpg_a, jpg_b} 
== set(paths) + + +@patch("corppa.ocr.gvision_ocr.google_vision", None) +def test_ocr_image_via_gvision_no_gvision(capsys): + with pytest.raises(SystemExit): + ocr_image_via_gvision(None, Path("in.jpg"), Path("out.txt"), Path("out.json")) + captured = capsys.readouterr() + assert "does not contain google-cloud-vision" in captured.err + + +@patch("corppa.ocr.gvision_ocr.google_vision", None) +def test_ocr_images_no_gvision(capsys): + with pytest.raises(SystemExit): + ocr_images(Path("in"), Path("out"), set()) + captured = capsys.readouterr() + assert "does not contain google-cloud-vision" in captured.err + + +@patch("corppa.ocr.gvision_ocr.image_relpath_generator") +@patch("corppa.ocr.gvision_ocr.ocr_image_via_gvision") +@patch("corppa.ocr.gvision_ocr.google_vision") +def test_ocr_images( + mock_gvision, mock_ocr_image, mock_image_relpath_generator, tmp_path +): + # Setup up mock clientp + mock_client = mock_gvision.ImageAnnotatorClient + img_dir = tmp_path.joinpath("images") + img_dir.mkdir() + ocr_dir = tmp_path.joinpath("ocr") + ocr_dir.mkdir() + # Create output ocr for b, so b.jpg will be skipped + ocr_dir.joinpath("b.txt").touch() + + mock_client.return_value = "client_placeholder" + mock_image_relpath_generator.return_value = [ + Path("a.jpg"), + Path("b.jpg"), + Path("subdir", "c.jpg"), + ] + + reporting = ocr_images(img_dir, ocr_dir, [".jpg"]) + assert mock_client.call_count == 1 + # Check that subdirectory for c was created + assert ocr_dir.joinpath("subdir").is_dir() + # Check ocr calls + assert mock_ocr_image.call_count == 2 + calls = [ + call( + "client_placeholder", + img_dir.joinpath("a.jpg"), + ocr_dir.joinpath("a.txt"), + ocr_dir.joinpath("a.json"), + ), + call( + "client_placeholder", + img_dir.joinpath("subdir", "c.jpg"), + ocr_dir.joinpath("subdir", "c.txt"), + ocr_dir.joinpath("subdir", "c.json"), + ), + ] + mock_ocr_image.assert_has_calls(calls) + # Check output + assert {"ocr_count": 2, "skip_count": 1} == reporting diff --git a/test/test_utils/test_filter.py b/test/test_utils/test_filter.py index 77089e7..df8a099 100644 --- a/test/test_utils/test_filter.py +++ b/test/test_utils/test_filter.py @@ -1,36 +1,117 @@ import json import os +import pathlib from unittest.mock import patch import pytest -from corppa.utils.filter import filter_pages, save_filtered_corpus, main +from corppa.utils.filter import filter_pages, main, save_filtered_corpus # minimal/mock page data fixture for testing fixture_page_data = [ - {"work_id": "foo", "label": "i"}, - {"work_id": "bar-p1", "label": "1"}, - {"work_id": "bar-p1", "label": "2"}, - {"work_id": "bar-p1", "label": "3"}, - {"work_id": "baz", "label": "23"}, + {"work_id": "foo", "label": "i", "order": 2}, + {"work_id": "bar-p1", "label": "1", "order": 1}, + {"work_id": "bar-p1", "label": "2", "order": 2}, + {"work_id": "bar-p1", "label": "3", "order": 3}, + {"work_id": "baz", "label": "23", "order": 27}, ] @pytest.fixture -def corpus_file(tmpdir): - """pytest fixture; creates a jsonl file with fixture_page_data in a tmpdir; +def corpus_file(tmp_path): + """pytest fixture; creates a jsonl file with fixture_page_data in a temp dir; returns the path object for the jsonl file.""" - corpusfile = tmpdir.join("ppa_pages.jsonl") - corpusfile.write("\n".join([json.dumps(p) for p in fixture_page_data])) + corpusfile = tmp_path.joinpath("ppa_pages.jsonl") + corpusfile.write_text("\n".join([json.dumps(p) for p in fixture_page_data])) return corpusfile -def test_filter_pages(corpus_file): - source_ids = ["foo", "bar"] - # use list to consume the 
generator - results = list(filter_pages(str(corpus_file), source_ids, disable_progress=True)) +def test_filter_work_ids(corpus_file): + # "bar" corresponds to a source_id not a work_id (since bar-p1 is an excerpt) + work_ids = ["foo", "bar"] + results = list(filter_pages(corpus_file, work_ids=work_ids, disable_progress=True)) + assert len(results) == 1 + assert results[0]["work_id"] == "foo" + + work_ids = ["foo", "bar-p1"] + results = list(filter_pages(corpus_file, work_ids=work_ids, disable_progress=True)) + assert len(results) == 4 + assert set(r["work_id"] for r in results) == set(work_ids) + + +def test_filter_work_pages(corpus_file): + work_pages = {"foo": {2}, "bar": {1}, "bar-p1": {3, 5}, "foo": {2}, "baz": {1}} + results = list( + filter_pages( + corpus_file, + work_pages=work_pages, + disable_progress=True, + ) + ) + assert len(results) == 2 + assert set([r["work_id"] for r in results]) == {"foo", "bar-p1"} + assert set([r["order"] for r in results]) == {2, 3} + + +def test_filter_include(corpus_file): + results = list( + filter_pages( + corpus_file, + include_filter={"work_id": "bar-p1", "label": "23"}, + disable_progress=True, + ) + ) assert len(results) == 4 - assert set([r["work_id"].split("-")[0] for r in results]) == set(source_ids) + assert set([r["work_id"] for r in results]) == {"bar-p1", "baz"} + assert set([r["label"] for r in results]) == {"1", "2", "3", "23"} + + +def test_filter_exclude(corpus_file): + results = list( + filter_pages( + corpus_file, + exclude_filter={"work_id": "bar-p1", "label": "23"}, + disable_progress=True, + ) + ) + assert len(results) == 1 + assert set([r["work_id"] for r in results]) == {"foo"} + assert set([r["label"] for r in results]) == {"i"} + + +def test_filter_id_and_include(corpus_file): + # work ids and include filter used in combination + results = list( + filter_pages( + corpus_file, + work_ids=["bar-p1"], + include_filter={"label": "2", "work_id": "baz"}, + disable_progress=True, + ) + ) + assert len(results) == 1 + assert results[0]["work_id"] == "bar-p1" + assert results[0]["label"] == "2" + + +def test_filter_id_and_work_pages(corpus_file): + # provide work ids as well as work pages + results = list( + filter_pages( + corpus_file, + work_ids=["foo"], + work_pages={"bar-p1": {2}}, + disable_progress=True, + ) + ) + assert len(results) == 2 + assert set([r["work_id"] for r in results]) == {"foo", "bar-p1"} + assert set([r["order"] for r in results]) == {2} + + +def test_filter_required_args(corpus_file): + with pytest.raises(ValueError, match="At least one filter must be specified"): + list(filter_pages(corpus_file)) @patch("corppa.utils.filter.tqdm") @@ -40,8 +121,8 @@ def test_filter_pages_progressbar(mock_orjsonl, mock_tqdm, corpus_file): # configure mock tqdm iterator to return fixture page data mock_tqdm.return_value.__iter__.return_value = fixture_page_data # use list to consume the generator - list(filter_pages(str(corpus_file), ["foo"])) - mock_orjsonl.stream.assert_called_with(str(corpus_file)) + list(filter_pages(corpus_file, ["foo"])) + mock_orjsonl.stream.assert_called_with(corpus_file) mock_tqdm.assert_called_with( mock_orjsonl.stream.return_value, desc="Filtering", @@ -58,8 +139,8 @@ def test_filter_pages_noprogressbar(mock_orjsonl, mock_tqdm, corpus_file): # configure mock tqdm iterator to return fixture page data mock_tqdm.return_value.__iter__.return_value = fixture_page_data # use list to consume the generator - list(filter_pages(str(corpus_file), ["foo"], disable_progress=True)) - 
mock_orjsonl.stream.assert_called_with(str(corpus_file)) + list(filter_pages(corpus_file, ["foo"], disable_progress=True)) + mock_orjsonl.stream.assert_called_with(corpus_file) mock_tqdm.assert_called_with( mock_orjsonl.stream.return_value, desc="Filtering", @@ -70,31 +151,62 @@ def test_filter_pages_noprogressbar(mock_orjsonl, mock_tqdm, corpus_file): @patch("corppa.utils.filter.filter_pages") @patch("corppa.utils.filter.orjsonl") -def test_save_filtered_corpus(mock_orjsonl, mock_filter_pages, tmpdir): - idfile = tmpdir.join("ids.txt") +def test_save_filtered_corpus(mock_orjsonl, mock_filter_pages, tmp_path): + idfile = tmp_path.joinpath("ids.txt") ids = ["one", "two", "three", "four"] - idfile.write("\n".join(ids)) + idfile.write_text("\n".join(ids)) input_filename = "input.jsonl" output_filename = "output.jsonl" - save_filtered_corpus(input_filename, str(idfile), output_filename) + save_filtered_corpus(input_filename, output_filename, idfile) # filter should be called with input file and list of ids from text file - mock_filter_pages.assert_called_with(input_filename, ids, disable_progress=False) + mock_filter_pages.assert_called_with( + input_filename, + work_ids=ids, + work_pages=None, + include_filter=None, + exclude_filter=None, + disable_progress=False, + ) # should save result to specified output filename mock_orjsonl.save.assert_called_with( output_filename, mock_filter_pages.return_value ) +def test_save_filtered_corpus_required_args(): + with pytest.raises(ValueError, match="At least one filter must be specified"): + save_filtered_corpus("pages.jsonl", "filtered.jsonl") + + +def test_save_filtered_corpus_pgfile_fieldnames(tmp_path): + pgfile = tmp_path.joinpath("pages.csv") + pgfile.write_text("work,pg_id\n") + pgfile.write_text("foo,1\n") + pgfile.write_text("bar,2\n") + + with pytest.raises( + ValueError, + match=f'pgfile {pgfile} must include fields "work_id" and "page_num"', + ): + save_filtered_corpus("pages.jsonl", "filtered.jsonl", pgfile=pgfile) + + @pytest.mark.parametrize( "cli_args, call_params", [ # all required params, default progressbar behavior ( - ["filter.py", "pages.json", "id.txt", "subset.jsonl"], + ["filter.py", "pages.json", "subset.jsonl", "--idfile", "id.txt"], ( - ("pages.json", "id.txt", "subset.jsonl"), - {"disable_progress": False}, + (pathlib.Path("pages.json"), pathlib.Path("subset.jsonl")), + { + "idfile": pathlib.Path("id.txt"), + "pgfile": None, + "include_filter": None, + "exclude_filter": None, + "disable_progress": False, + }, ), ), # disable progress bar @@ -102,19 +214,77 @@ def test_save_filtered_corpus(mock_orjsonl, mock_filter_pages, tmpdir): [ "filter.py", "pages.json.bz2", - "id.txt", "subset.jsonl.gz", + "--idfile", + "id.txt", "--no-progress", ], ( - ("pages.json.bz2", "id.txt", "subset.jsonl.gz"), - {"disable_progress": True}, + (pathlib.Path("pages.json.bz2"), pathlib.Path("subset.jsonl.gz")), + { + "idfile": pathlib.Path("id.txt"), + "pgfile": None, + "include_filter": None, + "exclude_filter": None, + "disable_progress": True, + }, ), ), # no extension on output file; should add jsonl ( - ["filter.py", "pages.json", "id.txt", "subset"], - (("pages.json", "id.txt", "subset.jsonl"), {"disable_progress": False}), + ["filter.py", "pages.json", "subset", "--idfile", "id.txt"], + ( + (pathlib.Path("pages.json"), pathlib.Path("subset.jsonl")), + { + "idfile": pathlib.Path("id.txt"), + "pgfile": None, + "include_filter": None, + "exclude_filter": None, + "disable_progress": False, + }, + ), + ), + # include filter + ( + ["filter.py", 
"pages.json", "subset", "--include", "tag=one", "page=2"], + ( + (pathlib.Path("pages.json"), pathlib.Path("subset.jsonl")), + { + "idfile": None, + "pgfile": None, + "include_filter": {"tag": "one", "page": "2"}, + "exclude_filter": None, + "disable_progress": False, + }, + ), + ), + # exclude filter + ( + ["filter.py", "pages.json", "subset", "--exclude", "contains_poetry=Yes"], + ( + (pathlib.Path("pages.json"), pathlib.Path("subset.jsonl")), + { + "idfile": None, + "pgfile": None, + "include_filter": None, + "exclude_filter": {"contains_poetry": "Yes"}, + "disable_progress": False, + }, + ), + ), + # pgfile filter + ( + ["filter.py", "pages.json", "subset", "--pgfile", "pages.csv"], + ( + (pathlib.Path("pages.json"), pathlib.Path("subset.jsonl")), + { + "idfile": None, + "pgfile": pathlib.Path("pages.csv"), + "include_filter": None, + "exclude_filter": None, + "disable_progress": False, + }, + ), ), ], ) @@ -122,8 +292,14 @@ def test_save_filtered_corpus(mock_orjsonl, mock_filter_pages, tmpdir): def test_main(mock_save_filtered_corpus, cli_args, call_params, tmp_path): # change to temp directory, make sure id file exists and is non-zero os.chdir(tmp_path) - idfile = tmp_path / cli_args[2] - idfile.write_text("id1\nid2") + # create an idfile at expected path; arg comes immediately after --idfile + if "--idfile" in cli_args: + idfile = tmp_path / cli_args[cli_args.index("--idfile") + 1] + idfile.write_text("id1\nid2") + # cerate a csvfile at expected path; args comes immediately after --pgfile + if "--pgfile" in cli_args: + pgfile = tmp_path / cli_args[cli_args.index("--pgfile") + 1] + pgfile.write_text("src_id1,1\nsrc_id2,2") # patch in test args for argparse to parse with patch("sys.argv", cli_args): @@ -132,24 +308,87 @@ def test_main(mock_save_filtered_corpus, cli_args, call_params, tmp_path): mock_save_filtered_corpus.assert_called_with(*args, **kwargs) +def test_main_argparse_error(capsys): + # call with required parameters but no filters + with patch("sys.argv", ["filter.py", "pages.json", "subset"]): + # at least one filter is required + with pytest.raises(SystemExit): + main() + captured = capsys.readouterr() + assert "At least one filter option must be specified" in captured.err + + +@patch("corppa.utils.filter.save_filtered_corpus") +def test_main_cleanup(mock_save_filtered_corpus, tmp_path, capsys): + input_file = tmp_path / "pages.json" + idfile = tmp_path / "id.txt" + output_file = tmp_path / "subset.jsonl" + + cli_args = ["filter.py", str(input_file), str(output_file), "--idfile", str(idfile)] + + # change to temp directory, make sure id file exists and is non-zero + os.chdir(tmp_path) + # create an idfile at expected path + idfile = tmp_path / cli_args[cli_args.index("--idfile") + 1] + idfile.write_text("id1\nid2") + + # as a mock side effect, create a zero size file to be cleaned up + def create_output(*args, **kwargs): + output_file.write_text("") + + mock_save_filtered_corpus.side_effect = create_output + + # patch in arguments for arg.parse to load + with patch("sys.argv", cli_args): + main() + assert not output_file.exists() + captured = capsys.readouterr() + assert "No pages were selected, removing empty output file" in captured.out + + # with cleanup disabled, zero-size file should not be removed + cli_args.append("--no-cleanup") + with patch("sys.argv", cli_args): + main() + assert output_file.exists() + captured = capsys.readouterr() + # should still report on the empty file + assert "No pages were selected" in captured.out + + 
@patch("corppa.utils.filter.save_filtered_corpus") def test_main_idfile_nonexistent(mock_save_filtered_corpus, capsys): - with patch("sys.argv", ["f.py", "foo.jsonl", "/not/a/real/id.txt", "out.jsonl"]): - with pytest.raises(SystemExit): + with patch( + "sys.argv", ["f.py", "foo.jsonl", "out.jsonl", "--idfile", "/not/a/real/id.txt"] + ): + with pytest.raises(SystemExit) as execinfo: main() + assert execinfo.value.code == 1 captured = capsys.readouterr() - assert "does not exist" in captured.out + assert "does not exist" in captured.err @patch("corppa.utils.filter.save_filtered_corpus") def test_main_idfile_empty(mock_save_filtered_corpus, capsys, tmp_path): idfile = tmp_path / "id.txt" idfile.touch() - with patch("sys.argv", ["f.py", "foo.jsonl", str(idfile), "out.jsonl"]): - with pytest.raises(SystemExit): + with patch("sys.argv", ["f.py", "foo.jsonl", "out.jsonl", "--idfile", str(idfile)]): + with pytest.raises(SystemExit) as execinfo: main() + assert execinfo.value.code == 1 captured = capsys.readouterr() - assert "is zero size" in captured.out + assert "is zero size" in captured.err + + +@patch("corppa.utils.filter.save_filtered_corpus") +def test_main_pgfile_empty(mock_save_filtered_corpus, capsys, tmp_path): + pgfile = tmp_path / "pages.csv" + pgfile.touch() + with patch("sys.argv", ["f.py", "foo.jsonl", "out.jsonl", "--pgfile", str(pgfile)]): + with pytest.raises(SystemExit) as execinfo: + main() + assert execinfo.value.code == 1 + captured = capsys.readouterr() + assert "is zero size" in captured.err @patch("corppa.utils.filter.save_filtered_corpus") @@ -158,8 +397,11 @@ def test_main_outfile_exists(mock_save_filtered_corpus, capsys, tmp_path): idfile.write_text("id1\nid2") outfile = tmp_path / "subset.jsonl" outfile.touch() - with patch("sys.argv", ["f.py", "foo.jsonl", str(idfile), str(outfile)]): - with pytest.raises(SystemExit): + with patch( + "sys.argv", ["f.py", "foo.jsonl", str(outfile), "--idfile", str(idfile)] + ): + with pytest.raises(SystemExit) as execinfo: main() + assert execinfo.value.code == 1 captured = capsys.readouterr() - assert "already exists" in captured.out + assert "already exists" in captured.err diff --git a/test/test_utils/test_path_utils.py b/test/test_utils/test_path_utils.py new file mode 100644 index 0000000..b41d525 --- /dev/null +++ b/test/test_utils/test_path_utils.py @@ -0,0 +1,127 @@ +import pathlib +from unittest.mock import patch + +import pytest + +from corppa.utils.path_utils import ( + decode_htid, + encode_htid, + get_image_relpath, + get_ppa_source, + get_stub_dir, + get_vol_dir, + get_volume_id, +) + + +def test_encode_htid(): + assert encode_htid("mdp.39015003633594") == "mdp.39015003633594" + assert encode_htid("dul1.ark:/13960/t5w67998k") == "dul1.ark+=13960=t5w67998k" + assert encode_htid("miun.aaa3406.0001.001") == "miun.aaa3406,0001,001" + with pytest.raises(ValueError, match="Invalid htid 'xxx0000'"): + encode_htid("xxx0000") + + +def test_decode_htid(): + assert decode_htid("mdp.39015003633594") == "mdp.39015003633594" + assert decode_htid("dul1.ark+=13960=t5w67998k") == "dul1.ark:/13960/t5w67998k" + assert decode_htid("miun.aaa3406,0001,001") == "miun.aaa3406.0001.001" + with pytest.raises(ValueError, match="Invalid encoded htid 'xxx0000'"): + decode_htid("xxx0000") + + +def test_encode_decode_htid(): + assert decode_htid(encode_htid("mdp.39015003633594")) == "mdp.39015003633594" + assert ( + decode_htid(encode_htid("dul1.ark:/13960/t5w67998k")) + == "dul1.ark:/13960/t5w67998k" + ) + + assert 
decode_htid(encode_htid("miun.aaa3406.0001.001")) == "miun.aaa3406.0001.001" + + +def test_get_ppa_source(): + assert get_ppa_source("CB0127060085") == "Gale" + assert get_ppa_source("CW0116527364") == "Gale" + assert get_ppa_source("mdp.39015010540071") == "HathiTrust" + with pytest.raises(ValueError, match="Can't identify source for volume 'xxx0000'"): + get_ppa_source("xxx0000") + + +def test_get_stub_dir(): + # Gale + assert get_stub_dir("Gale", "CB0127060085") == "100" + # HathiTrust + assert get_stub_dir("HathiTrust", "mdp.39015003633594") == "mdp" + # Other + with pytest.raises(ValueError, match="Unknown source 'invalid src'"): + get_stub_dir("invalid src", "xxx0000") + + +@patch("corppa.utils.path_utils.get_stub_dir", return_value="stub_name") +@patch("corppa.utils.path_utils.get_ppa_source") +def test_get_vol_dir_gale(mock_get_ppa_source, mock_get_stub_dir): + # Set returned source value to Gale + mock_get_ppa_source.return_value = "Gale" + assert get_vol_dir("gale_id") == pathlib.Path("Gale", "stub_name", "gale_id") + mock_get_ppa_source.assert_called_with("gale_id") + mock_get_stub_dir.assert_called_with("Gale", "gale_id") + + +@patch("corppa.utils.path_utils.get_stub_dir", return_value="stub_name") +@patch("corppa.utils.path_utils.get_ppa_source") +def test_get_vol_dir_hathi(mock_get_ppa_source, mock_get_stub_dir): + # Set returned source value to HathiTrust + mock_get_ppa_source.return_value = "HathiTrust" + # TODO: Update once HathiTrust directory conventions are finalized + with pytest.raises( + NotImplementedError, match="HathiTrust volume directory conventions TBD" + ): + get_vol_dir("htid") + mock_get_ppa_source.assert_called_with("htid") + mock_get_stub_dir.assert_not_called() + + +@patch("corppa.utils.path_utils.get_stub_dir", return_value="stub_name") +@patch("corppa.utils.path_utils.get_ppa_source") +def test_get_vol_dir_unk(mock_get_ppa_source, mock_get_stub_dir): + # Set returned source value + mock_get_ppa_source.return_value = "Unknown" + with pytest.raises(ValueError, match="Unknown source 'Unknown'"): + get_vol_dir("vol_id") + mock_get_ppa_source.assert_called_with("vol_id") + mock_get_stub_dir.assert_not_called() + + +def test_get_volume_id(): + # Full works + for work_id in ["CB0131351206", "dul1.ark:/13960/t5w67998k"]: + assert get_volume_id(work_id) == work_id + + # Excerpts + assert get_volume_id("CW0102294490-pxvi") == "CW0102294490" + assert get_volume_id("coo1.ark:/13960/t4bp0n867-p3") == "coo1.ark:/13960/t4bp0n867" + + +@patch("corppa.utils.path_utils.get_volume_id", return_value="vol_id") +@patch("corppa.utils.path_utils.get_vol_dir", return_value=pathlib.Path("vol_dir")) +@patch("corppa.utils.path_utils.get_ppa_source") +def test_get_image_relpath(mock_get_ppa_source, mock_get_vol_dir, mock_get_volume_id): + # Gale + mock_get_ppa_source.return_value = "Gale" + assert get_image_relpath("test_id", 4) == pathlib.Path( + "vol_dir", "vol_id_00040.TIF" + ) + assert get_image_relpath("test_id", 100) == pathlib.Path( + "vol_dir", "vol_id_01000.TIF" + ) + + # HathiTrust + mock_get_ppa_source.return_value = "HathiTrust" + with pytest.raises(NotImplementedError): + get_image_relpath("test_id", 4) + + # Other sources + mock_get_ppa_source.return_value = "EEBO" + with pytest.raises(ValueError, match="Unsupported source 'EEBO'"): + get_image_relpath("test_id", 4)
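
Taken together, the new `path_utils` helpers compose as sketched below. This is an illustrative usage example whose concrete values are taken from the tests in this changeset; it is not additional documentation shipped with the package. Note that HathiTrust directory and image-path conventions are still `NotImplementedError` in this version, so only the Gale path is shown.

```python
from pathlib import Path

from corppa.utils.path_utils import (
    decode_htid,
    encode_htid,
    get_image_relpath,
    get_ppa_source,
    get_stub_dir,
    get_volume_id,
)

# HathiTrust ids are made filesystem-safe and round-trip cleanly
encoded = encode_htid("dul1.ark:/13960/t5w67998k")
assert encoded == "dul1.ark+=13960=t5w67998k"
assert decode_htid(encoded) == "dul1.ark:/13960/t5w67998k"

# Excerpt work ids drop the "-p<page>" suffix to recover the volume id
vol_id = get_volume_id("CW0102294490-pxvi")
assert vol_id == "CW0102294490"
assert get_ppa_source(vol_id) == "Gale"
assert get_stub_dir("Gale", vol_id) == "124"

# Gale page images resolve to <source>/<stub>/<volume>/<volume>_<page num>0.TIF
assert get_image_relpath("CW0102294490-pxvi", 17) == Path(
    "Gale", "124", "CW0102294490", "CW0102294490_00170.TIF"
)
```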