From 10868508d03ee78b1f91ce11dcabd464d498e1c1 Mon Sep 17 00:00:00 2001
From: Ben Silverman
Date: Thu, 18 Jan 2024 11:45:10 -0500
Subject: [PATCH 01/97] Set develop version to 4.17-dev

---
 geniza/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/geniza/__init__.py b/geniza/__init__.py
index 940dae072..23fdeebb2 100644
--- a/geniza/__init__.py
+++ b/geniza/__init__.py
@@ -1,4 +1,4 @@
-__version_info__ = (4, 16, 0, None)
+__version_info__ = (4, 17, 0, "dev")

 # Dot-connect all but the last. Last is dash-connected if not None.

From 53eaa7ea46f0c1d057f60d6cd9d6cc811acd0ef3 Mon Sep 17 00:00:00 2001
From: Ben Silverman
Date: Thu, 18 Jan 2024 12:59:25 -0500
Subject: [PATCH 02/97] Remove one-time-use management command

---
 .../management/commands/add_cat_numbers.py | 44 -------------------
 1 file changed, 44 deletions(-)
 delete mode 100644 geniza/corpus/management/commands/add_cat_numbers.py

diff --git a/geniza/corpus/management/commands/add_cat_numbers.py b/geniza/corpus/management/commands/add_cat_numbers.py
deleted file mode 100644
index 54a6ec243..000000000
--- a/geniza/corpus/management/commands/add_cat_numbers.py
+++ /dev/null
@@ -1,44 +0,0 @@
-"""
-Script to add catalog numbers to historical shelfmarks for some Bodleian
-records. This is a one-time script and should be removed after the import is
-completed in production.
-
-Intended to be run manually from the shell as follows:
-./manage.py add_cat_numbers historical_shelfmarks.csv
-"""
-
-import csv
-import re
-
-from django.core.management.base import BaseCommand
-
-from geniza.corpus.models import Fragment
-
-
-class Command(BaseCommand):
-    """Import catalog numbers into Fragment records in the local database."""
-
-    bodl_regex = r"^Bodl\. MS Heb\. (?P<letter>[A-Za-z]) (?P<number>\d+),"
-
-    def add_arguments(self, parser):
-        parser.add_argument("path", help="Path to a CSV file")
-
-    def handle(self, *args, **kwargs):
-        with open(kwargs.get("path"), newline="") as csvfile:
-            reader = csv.DictReader(csvfile)
-            for row in reader:
-                cat_number = row["catalog no. (Bodl. historical shelfmarks)"]
-                if cat_number:
-                    try:
-                        frag = Fragment.objects.get(pk=int(row["id"]))
-                    except Fragment.DoesNotExist:
-                        print(f"Error: cannot find fragment with id {row['id']}")
-                        continue
-
-                    # Bodl. MS heb. b 12/6
-                    # --> data migration --> Bodl. MS Heb. b 12, f. 6
-                    # --> this script --> Bodl. MS Heb. b 12 (Cat. 2875), f. 6
-                    hist_repl = f"Bodl. MS Heb. \g<letter> \g<number> (Cat. 
{cat_number})," - hist = re.sub(self.bodl_regex, hist_repl, frag.old_shelfmarks) - frag.old_shelfmarks = hist - frag.save() From 3cf41871f0e79a0f3775d6fc404303fb056ee500 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Fri, 19 Jan 2024 11:26:17 -0500 Subject: [PATCH 03/97] Add name field to package.json --- package-lock.json | 1 + package.json | 1 + 2 files changed, 2 insertions(+) diff --git a/package-lock.json b/package-lock.json index 75d3ba0f8..8ac812865 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4,6 +4,7 @@ "requires": true, "packages": { "": { + "name": "geniza", "dependencies": { "@babel/core": "^7.15.0", "@babel/preset-env": "^7.15.0", diff --git a/package.json b/package.json index 713970f01..7906822db 100644 --- a/package.json +++ b/package.json @@ -1,4 +1,5 @@ { + "name": "geniza", "scripts": { "start": "webpack serve --mode=development", "build": "webpack --mode=production", From ffa70b9a777a03bd920ad2c2493ecf00a176c647 Mon Sep 17 00:00:00 2001 From: rlskoeser Date: Thu, 25 Jan 2024 13:52:38 -0500 Subject: [PATCH 04/97] Remove defunct requires.io badge from readme --- README.rst | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.rst b/README.rst index 855d9608b..09375ea68 100644 --- a/README.rst +++ b/README.rst @@ -19,10 +19,6 @@ Python 3.9 / Django 3.2 / Node 16 / Postgresql / Solr 9.2 :target: https://codecov.io/gh/Princeton-CDH/geniza :alt: Code coverage -.. image:: https://requires.io/github/Princeton-CDH/geniza/requirements.svg?branch=main - :target: https://requires.io/github/Princeton-CDH/geniza/requirements/?branch=main - :alt: Requirements Status - .. image:: https://github.com/Princeton-CDH/geniza/workflows/dbdocs/badge.svg :target: https://dbdocs.io/princetoncdh/geniza :alt: dbdocs build From 48fd694fedcd6d967b40b3cac740729539182dd1 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Fri, 2 Feb 2024 14:09:39 -0500 Subject: [PATCH 05/97] Remove TEI migration and interim sync transcription code ref #1179 - post v4.9 cleanup --- .../commands/sync_transcriptions.py | 393 ------------- .../management/commands/tei_to_annotation.py | 529 ------------------ geniza/corpus/tei_transcriptions.py | 266 --------- geniza/corpus/tests/test_tei_transcription.py | 150 ----- 4 files changed, 1338 deletions(-) delete mode 100644 geniza/corpus/management/commands/sync_transcriptions.py delete mode 100644 geniza/corpus/management/commands/tei_to_annotation.py delete mode 100644 geniza/corpus/tei_transcriptions.py delete mode 100644 geniza/corpus/tests/test_tei_transcription.py diff --git a/geniza/corpus/management/commands/sync_transcriptions.py b/geniza/corpus/management/commands/sync_transcriptions.py deleted file mode 100644 index 7a307a166..000000000 --- a/geniza/corpus/management/commands/sync_transcriptions.py +++ /dev/null @@ -1,393 +0,0 @@ -""" -Script to synchronize transcription content from PGP v3 TEI files -to an _interim_ html format in the database. - -The script checks out and updates the transcription files from the -git repository, and then loops through all xml files and -identifies the document and footnote to update, if possible. 
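-
-Intended to be run manually from the shell, optionally limited to specific
-files; the --noact flag previews changes without saving anything to the
-database. The filename below is illustrative:
-
-./manage.py sync_transcriptions --noact 968.xml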
- -""" - -import glob -import os.path -from collections import defaultdict - -from django.conf import settings -from django.contrib.admin.models import ADDITION, CHANGE, LogEntry -from django.contrib.auth.models import User -from django.contrib.contenttypes.models import ContentType -from django.core.management.base import BaseCommand, CommandError -from django.db import models -from django.urls import reverse -from eulxml import xmlmap -from git import Repo - -from geniza.common.utils import absolutize_url -from geniza.corpus.models import Document -from geniza.corpus.tei_transcriptions import GenizaTei -from geniza.footnotes.models import Footnote, Source - - -class Command(BaseCommand): - """Synchronize TEI transcriptions to edition footnote content""" - - v_normal = 1 # default verbosity - - def add_arguments(self, parser): - parser.add_argument( - "-n", - "--noact", - action="store_true", - help="Do not save changes to the database", - ) - parser.add_argument("files", nargs="*", help="Only sync the specified files.") - - # dict of footnotes that have been updated with list of TEI files, to track/prevent - # TEI files resolving incorrectly to the same edition - footnotes_updated = defaultdict(list) - - # keep track of document ids with multiple digitized editions (likely merged records/joins) - multiedition_docs = set() - - def handle(self, *args, **options): - # get settings for remote git repository url and local path - gitrepo_url = settings.TEI_TRANSCRIPTIONS_GITREPO - gitrepo_path = settings.TEI_TRANSCRIPTIONS_LOCAL_PATH - - self.verbosity = options["verbosity"] - self.noact_mode = options["noact"] - - # make sure we have latest tei content from git repository - self.sync_git(gitrepo_url, gitrepo_path) - - if not self.noact_mode: - # get content type and user for log entries, unless in no-act mode - self.footnote_contenttype = ContentType.objects.get_for_model(Footnote) - self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) - - self.stats = defaultdict(int) - # after creating missing goitein unpublished edition notes, these will not be created again - self.stats["footnote_created"] = 0 - # duplicates might not always happen - self.stats["duplicate_footnote"] = 0 - # updates should not happen after initial sync when there are no TEI changes - self.stats["footnote_updated"] = 0 - # empty tei may not happen when running on a subset - self.stats["empty_tei"] = 0 - self.stats["document_not_found"] = 0 - self.stats["joins"] = 0 - self.stats["no_edition"] = 0 - self.stats["one_edition"] = 0 - self.stats["multiple_editions_with_content"] = 0 - # keep track of document ids with multiple digitized editions (likely merged records/joins) - self.multiedition_docs = set() - - # iterate through all tei files in the repository OR specified files - xmlfiles = options["files"] or glob.iglob(os.path.join(gitrepo_path, "*.xml")) - for xmlfile in xmlfiles: - self.stats["xml"] += 1 - xmlfile_basename = os.path.basename(xmlfile) - - tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei) - # some files are stubs with no content - # check if the tei is ok to proceed; (e.g., empty or only translation content) - # if empty, report and skip - if not self.check_tei(tei, xmlfile): - continue - - # get the document for the file based on id / old id - doc = self.get_pgp_document(xmlfile_basename) - # if document was not found, skip - if not doc: - continue - - if doc.fragments.count() > 1: - self.stats["joins"] += 1 - - footnote = self.get_edition_footnote(doc, tei, xmlfile) - # if we 
identified an appropriate footnote, update it - if footnote: - # if this footnote has already been chosen in the current script run, don't update again - if self.footnotes_updated[footnote.pk]: - self.stderr.write( - "Footnote %s (PGPID %s) already updated with %s; not overwriting with %s" - % ( - footnote.pk, - doc.pk, - ";".join(self.footnotes_updated[footnote.pk]), - xmlfile, - ) - ) - self.stats["duplicate_footnote"] += 1 - else: - self.footnotes_updated[footnote.pk].append(xmlfile) - - # convert into html, return in a list of blocks per inferred page/image - html_pages = tei.text_to_html() - text = tei.text_to_plaintext() - - # if no html was generated, stop processing - if not html_pages: - if self.verbosity >= self.v_normal: - self.stderr.write("No html generated for %s" % doc.id) - continue - - html = {} - # assign each page of html to a canvas based on sequence, - # skipping any non-document images - for i, image in enumerate(doc.iiif_images(filter_side=True)): - # stop iterating through images when we run out of pages - if not html_pages: - break - # pop the first page of html off the list - # and assign to the image canvas uri - html[image["canvas"]] = html_pages.pop(0) - - # if there are any html pages left - # (either document does not have any iiif images, or not all images) - # generate local canvas uris and attach transcription content - if html_pages: - # document manifest url is /documents/pgpid/iiif/manifest/ - # create canvas uris parallel to that - canvas_base_uri = "%siiif/canvas/" % doc.permalink - # iterate through any remaining pages and assign to local canvas uris - for i, html_chunk in enumerate(html_pages): - canvas_uri = "%s%d/" % (canvas_base_uri, i) - html[canvas_uri] = html_chunk - - footnote.content = {"html": html, "text": text} - if footnote.has_changed("content"): - # don't actually save in --noact mode - if not self.noact_mode: - footnote.save() - # create a log entry to document the change - self.log_footnote_update( - footnote, os.path.basename(xmlfile) - ) - - # count as a change whether in no-act mode or not - self.stats["footnote_updated"] += 1 - - # NOTE: in *one* case there is a TEI file with translation content and - # no transcription; will get reported as empty, but that's ok — it's out of scope - # for this script and should be handled elsewhere. - - # report on what was done - # include total number of transcription files, - # documents with transcriptions, number of fragments, and how how many joins - self.stats["multi_edition_docs"] = len(self.multiedition_docs) - self.stdout.write( - """Processed {xml:,} TEI/XML files; skipped {empty_tei:,} TEI files with no transcription content. -{document_not_found:,} documents not found in database. -{joins:,} documents with multiple fragments. -{multiple_editions:,} documents with multiple editions; {multiple_editions_with_content} multiple editions with content ({multi_edition_docs} unique documents). -{no_edition:,} documents with no edition. -{one_edition:,} documents with one edition. -Updated {footnote_updated:,} footnotes (created {footnote_created:,}; skipped overwriting {duplicate_footnote}). -""".format( - **self.stats - ) - ) - - for footnote_id, xmlfiles in self.footnotes_updated.items(): - if len(xmlfiles) > 1: - self.stderr.write( - "Footnote pk %s updated more than once: %s" - % (footnote_id, ";".join(xmlfiles)) - ) - - def check_tei(self, tei, xmlfile): - """Check TEI and report if it is empty, labels only, or has no content. 
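-        Each skipped file is reported to stdout and counted in the empty_tei stat.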
-
-        :param tei: xmlmap tei instance to check; :class:`~geniza.corpus.tei_transcriptions.GenizaTei`
-        :param xmlfile: xml filename, for reporting
-        :returns: True if check passes; False if the TEI should be skipped.
-        :rtype: bool
-        """
-        # some files are stubs with no content
-        # check if there is no text content; report and return true or false
-        if tei.no_content():
-            if self.verbosity >= self.v_normal:
-                self.stdout.write("%s has no text content, skipping" % xmlfile)
-            self.stats["empty_tei"] += 1
-            return False
-        elif tei.labels_only():
-            if self.verbosity >= self.v_normal:
-                self.stdout.write(
-                    "%s has labels only, no other text content; skipping" % xmlfile
-                )
-            self.stats["empty_tei"] += 1
-            return False
-        elif not tei.text.lines:
-            self.stdout.write("%s has no lines (translation?), skipping" % xmlfile)
-            self.stats["empty_tei"] += 1
-            return False
-
-        return True
-
-    def get_pgp_document(self, xmlfile_basename):
-        """Find the PGP document for the specified TEI file, based on filename,
-        if possible.
-
-        :returns: instance of :class:`~geniza.corpus.models.Document` or None if not found
-        """
-
-        # get the document id from the filename (####.xml)
-        pgpid = os.path.splitext(xmlfile_basename)[0]
-        # in ONE case there is a duplicate id with b suffix on the second
-        try:
-            pgpid = int(pgpid.strip("b"))
-        except ValueError:
-            if self.verbosity >= self.v_normal:
-                self.stderr.write("Failed to generate integer PGPID from %s" % pgpid)
-            return
-        # can we rely on pgpid from xml?
-        # but in some cases, it looks like a join 12047 + 12351
-
-        # find the document in the database
-        try:
-            return Document.objects.get_by_any_pgpid(pgpid)
-        except Document.DoesNotExist:
-            self.stats["document_not_found"] += 1
-            if self.verbosity >= self.v_normal:
-                self.stdout.write("Document %s not found in database" % pgpid)
-            return
-
-    def get_footnote_editions(self, doc):
-        """Get all edition footnotes of a document; used by :meth:`get_edition_footnote`,
-        extend to include digital editions in tei to annotation script."""
-        return doc.footnotes.editions()
-
-    def get_edition_footnote(self, doc, tei, filename):
-        """identify the edition footnote to be updated"""
-        # get editions for this document
-        editions = self.get_footnote_editions(doc)
-
-        if editions.count() > 1:
-            self.stats["multiple_editions"] += 1
-
-            # when there are multiple, try to identify correct edition by author names
-            footnote = self.choose_edition_by_authors(tei, editions, doc)
-            # if we got a match, use it
-            if footnote:
-                return footnote
-
-            # if not, limit to editions with content and try again
-            editions_with_content = editions.filter(content__isnull=False)
-            footnote = self.choose_edition_by_authors(tei, editions_with_content, doc)
-            if footnote:
-                return footnote
-
-            # if not, fallback to first edition
-            if editions_with_content.count() == 1:
-                self.stats["multiple_editions_with_content"] += 1
-                self.multiedition_docs.add(doc.id)
-
-                # if there was only one, assume it's the one to update
-                # NOTE: this is potentially wrong!
-                return editions_with_content.first()
-
-        if not editions.exists():
-            # no editions found; check if we can create a goitein unpublished edition footnote
-            footnote = self.is_it_goitein(tei, doc)
-            if footnote:
-                return footnote
-
-            self.stats["no_edition"] += 1
-            if self.verbosity > self.v_normal:
-                self.stdout.write("No edition found for %s" % filename)
-                for line in tei.source:
-                    self.stdout.write("\t%s" % line)
-        else:
-            self.stats["one_edition"] += 1
-            # if only one edition, update the transcription content there
-            return editions.first()
-
-    def choose_edition_by_authors(self, tei, editions, doc):
-        """Try to choose correct edition from a list based on author names;
-        based on structured author names in the TEI"""
-        if tei.source_authors:
-            tei_authors = set(tei.source_authors)
-            author_matches = []
-            for ed in editions:
-                ed_authors = set([auth.last_name for auth in ed.source.authors.all()])
-                if ed_authors == tei_authors:
-                    author_matches.append(ed)
-
-            # if we got exactly one match, use that edition
-            if len(author_matches) == 1:
-                return author_matches[0]
-
-            # if there were *no* author matches, see if we can create a goitein unpublished edition note
-            if not author_matches:
-                return self.is_it_goitein(tei, doc)
-
-    def is_it_goitein(self, tei, doc):
-        """Check if a TEI document is a Goitein edition. If no edition exists
-        and we can identify based on the TEI as a Goitein unpublished edition,
-        then create a new footnote."""
-        source_info = str(tei.source[0]).lower()
-        if "goitein" in source_info and (
-            "unpublished editions" in source_info or "typed texts" in source_info
-        ):
-            if not self.noact_mode:
-                footnote = self.create_goitein_footnote(doc)
-                if footnote:
-                    self.stats["footnote_created"] += 1
-                    return footnote
-
-    def create_goitein_footnote(self, doc):
-        """Create a new footnote for a Goitein unpublished edition"""
-        source = Source.objects.filter(
-            authors__last_name="Goitein",
-            title_en="unpublished editions",
-            source_type__type="Unpublished",
-            volume__startswith=Source.get_volume_from_shelfmark(doc.shelfmark),
-        ).first()
-        if not source:
-            self.stderr.write(
-                "Error finding Goitein unpublished editions source for %s"
-                % doc.shelfmark
-            )
-            return
-
-        footnote = Footnote.objects.create(
-            source=source,
-            content_object=doc,
-            doc_relation=Footnote.EDITION,
-        )
-        LogEntry.objects.log_action(
-            user_id=self.script_user.id,
-            content_type_id=self.footnote_contenttype.pk,
-            object_id=footnote.pk,
-            object_repr=str(footnote),
-            change_message="Created Goitein unpublished editions footnote to sync transcription",
-            action_flag=ADDITION,
-        )
-
-        return footnote
-
-    def sync_git(self, gitrepo_url, local_path):
-        """ensure git repository has been cloned and content is up to date"""
-
-        # if directory does not yet exist, clone repository
-        if not os.path.isdir(local_path):
-            if self.verbosity >= self.v_normal:
-                self.stdout.write(
-                    "Cloning TEI transcriptions repository to %s" % local_path
-                )
-            Repo.clone_from(url=gitrepo_url, to_path=local_path)
-        else:
-            # pull any changes since the last run
-            Repo(local_path).remotes.origin.pull()
-
-    def log_footnote_update(self, footnote, xmlfile):
-        """create a log entry for a footnote that has been updated"""
-        LogEntry.objects.log_action(
-            user_id=self.script_user.id,
-            content_type_id=self.footnote_contenttype.pk,
-            object_id=footnote.pk,
-            object_repr=str(footnote),
-            change_message="Updated transcription from TEI file %s" % xmlfile,
-            action_flag=CHANGE,
-        )
diff --git
a/geniza/corpus/management/commands/tei_to_annotation.py b/geniza/corpus/management/commands/tei_to_annotation.py deleted file mode 100644 index 8827da3e4..000000000 --- a/geniza/corpus/management/commands/tei_to_annotation.py +++ /dev/null @@ -1,529 +0,0 @@ -""" -Script to convert transcription content from PGP v3 TEI files -to IIIF annotations in the configured annotation server. - -""" - -import glob -import os.path -import unicodedata -from collections import defaultdict -from datetime import datetime -from functools import cached_property - -import requests -from addict import Dict -from django.conf import settings -from django.contrib.admin.models import ADDITION, CHANGE, DELETION, LogEntry -from django.contrib.auth.models import User -from django.contrib.contenttypes.models import ContentType -from django.core.management.base import BaseCommand, CommandError -from django.db import models -from django.template.defaultfilters import pluralize -from django.urls import reverse -from django.utils import timezone -from eulxml import xmlmap -from git import Repo -from parasolr.django.signals import IndexableSignalHandler -from rich.progress import MofNCompleteColumn, Progress - -from geniza.annotations.models import Annotation -from geniza.common.utils import absolutize_url -from geniza.corpus.annotation_export import AnnotationExporter -from geniza.corpus.management.commands import sync_transcriptions -from geniza.corpus.models import Document -from geniza.corpus.tei_transcriptions import GenizaTei -from geniza.footnotes.models import Footnote - - -class Command(sync_transcriptions.Command): - """Synchronize TEI transcriptions to edition footnote content""" - - v_normal = 1 # default verbosity - - missing_footnotes = [] - - normalized_unicode = set() - - document_not_found = [] - - def add_arguments(self, parser): - parser.add_argument( - "files", nargs="*", help="Only convert the specified files." - ) - - def handle(self, *args, **options): - # get settings for remote git repository url and local path - gitrepo_url = settings.TEI_TRANSCRIPTIONS_GITREPO - gitrepo_path = settings.TEI_TRANSCRIPTIONS_LOCAL_PATH - - self.verbosity = options["verbosity"] - # get content type and script user for log entries - self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) - - # disconnect solr indexing signals - IndexableSignalHandler.disconnect() - - # make sure we have latest tei content from git repository - # (inherited from sync transcriptions command) - self.sync_git(gitrepo_url, gitrepo_path) - # initialize local git repo client - self.tei_gitrepo = Repo(gitrepo_path) - - self.stats = defaultdict(int) - - xmlfiles = options["files"] or glob.glob(os.path.join(gitrepo_path, "*.xml")) - script_run_start = timezone.now() - - self.stdout.write("Migrating %d TEI files" % len(xmlfiles)) - - # when running on all files (i.e., specific files not specified), - # clear all annotations from the database before running the migration - # NOTE: could make this optional behavior, but it probably only - # impacts development and not the real migration? 
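-        # (a full run deletes every annotation up front and recreates them;
-        # runs on specific files clean up just those documents further below)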
-        if not options["files"]:
-            # cheating a little here, but much faster to clear all at once
-            # instead of searching and deleting one at a time
-            all_annos = Annotation.objects.all()
-            self.stdout.write("Clearing %d annotations" % all_annos.count())
-            all_annos.delete()
-
-        # initialize annotation exporter; don't push changes until the end;
-        # commit message will be overridden per export to document TEI file
-        self.anno_exporter = AnnotationExporter(
-            stdout=self.stdout,
-            verbosity=options["verbosity"],
-            push_changes=False,
-            commit_msg="PGP transcription export from TEI migration",
-        )
-        self.anno_exporter.setup_repo()
-
-        # use rich progressbar without context manager
-        progress = Progress(
-            MofNCompleteColumn(), *Progress.get_default_columns(), expand=True
-        )
-        progress.start()
-        task = progress.add_task("Migrating...", total=len(xmlfiles))
-
-        # iterate through tei files to be migrated
-        for xmlfile in xmlfiles:
-            self.stats["xml"] += 1
-            # update progress at the beginning instead of end,
-            # since some records are skipped
-            progress.update(task, advance=1, update=True)
-
-            if self.verbosity >= self.v_normal:
-                self.stdout.write(xmlfile)
-
-            xmlfile_basename = os.path.basename(xmlfile)
-
-            tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
-            # some files are stubs with no content
-            # check if the tei is ok to proceed; (e.g., empty or only translation content)
-            # if empty, report and skip
-            if not self.check_tei(tei, xmlfile):
-                self.stdout.write(
-                    self.style.WARNING(
-                        "No transcription content in %s; skipping" % xmlfile
-                    )
-                )
-                continue
-            # get the document for the file based on id / old id
-            doc = self.get_pgp_document(xmlfile_basename)
-            # if document was not found, skip
-            if not doc:
-                self.stdout.write(
-                    self.style.WARNING("Document not found for %s; skipping" % xmlfile)
-                )
-                self.document_not_found.append(xmlfile)
-                continue
-            # found the document
-            if self.verbosity >= self.v_normal:
-                self.stdout.write(str(doc))
-
-            # get the footnote for this file & doc
-            footnote = self.get_edition_footnote(doc, tei, xmlfile)
-            # if no footnote, skip for now
-            # (some are missing, but will handle with data work)
-            if not footnote:
-                self.stdout.write(
-                    self.style.ERROR(
-                        "footnote not found for %s / %s; skipping" % (xmlfile, doc.pk)
-                    )
-                )
-                self.missing_footnotes.append(xmlfile)
-                continue
-            footnote = self.migrate_footnote(footnote, doc)
-
-            # if there is a single primary language, use the iso code if it is set
-            lang_code = None
-            if doc.languages.count() == 1:
-                lang_code = doc.languages.first().iso_code
-
-            # get html blocks from the tei
-            html_blocks = tei.text_to_html(block_format=True)
-
-            # get canvas objects for the images in order; skip any non-document images
-            iiif_canvases = list(doc.iiif_images(filter_side=True).keys())
-            # determine the number of canvases needed based on labels
-            # that indicate new pages
-            # check and count any after the first; always need at least 1 canvas
-            num_canvases = 1 + len(
-                [
-                    b["label"]
-                    for b in html_blocks[1:]
-                    if tei.label_indicates_new_page(b["label"])
-                ]
-            )
-            # in verbose mode report on available/needed canvases
-            if self.verbosity > self.v_normal:
-                self.stdout.write(
-                    "%d iiif canvases; need %d canvases for %d blocks"
-                    % (len(iiif_canvases), num_canvases, len(html_blocks))
-                )
-            # if we need more canvases than we have available,
-            # generate local canvas ids
-            if num_canvases > len(iiif_canvases):
-                # document manifest url is /documents/pgpid/iiif/manifest/
-                # create canvas uris parallel to that
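-                # e.g. manifest /documents/1234/iiif/manifest/ yields generated
-                # canvas uris like /documents/1234/iiif/canvas/1/ (pgpid illustrative)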
canvas_base_uri = doc.manifest_uri.replace("manifest", "canvas") - for i in range(num_canvases - len(iiif_canvases)): - canvas_uri = "%s%d/" % (canvas_base_uri, i + 1) - iiif_canvases.append(canvas_uri) - - # NOTE: pgpid 1390 folio example; each chunk should be half of the canvas - # (probably should be handled manually) - # if len(html_chunks) > len(iiif_canvases): - # self.stdout.write( - # "%s has more html chunks than canvases; skipping" % xmlfile - # ) - # continue - - # start attaching to first canvas; increment based on chunk label - canvas_index = 0 - - # if specific files were specified, remove annotations - # just for those documents & sources - if options["files"]: - # remove all existing annotations associated with this - # document and source so we can reimport as needed - existing_annos = Annotation.objects.filter( - footnote__source=footnote.source, - footnote__content_object=doc, - created__lt=script_run_start, - ) - # NOTE: this is problematic for transcriptions currently - # split across two TEI files... take care when running - # on individual or groups of files - if existing_annos: - print( - "Removing %s pre-existing annotation%s for %s on %s " - % ( - len(existing_annos), - pluralize(existing_annos), - footnote.source, - doc.manifest_uri, - ) - ) - # not creating log entries for deletion, but - # this should probably only come up in dev runs - existing_annos.delete() - - for i, block in enumerate(html_blocks): - # if this is not the first block and the label suggests new image, - # increment canvas index - if i != 0 and tei.label_indicates_new_page(block["label"]): - canvas_index += 1 - - anno = new_transcription_annotation() - # get the canvas uri for this section of text - annotation_target = iiif_canvases[canvas_index] - anno.target.source.id = annotation_target - - # apply to the full canvas using % notation - # (using nearly full canvas to make it easier to edit zones) - anno.target.selector.value = "xywh=percent:1,1,98,98" - # anno.selector.value = "%s#xywh=pixel:0,0,%s,%s" % (annotation_target, canvas.width, canvas.height) - - # add html and optional label to annotation text body - # NOTE: not specifying language in html here because we - # handle it in wrapping template code based on db language - - html = tei.lines_to_html(block["lines"]) - if not unicodedata.is_normalized("NFC", html): - self.normalized_unicode.add(xmlfile) - html = unicodedata.normalize("NFC", html) - anno.body[0].value = html - - if block["label"]: - # check if label text requires normalization - if not unicodedata.is_normalized("NFC", block["label"]): - self.normalized_unicode.add(xmlfile) - block["label"] = unicodedata.normalize("NFC", block["label"]) - anno.body[0].label = block["label"] - - anno["schema:position"] = i + 1 - # print(anno) # can print for debugging - - # create database annotation - db_anno = Annotation() - db_anno.set_content(dict(anno)) - # link to digital edition footnote - db_anno.footnote = footnote - db_anno.save() - # log entry to document annotation creation - self.log_addition(db_anno, "Migrated from TEI transcription") - self.stats["created"] += 1 - - # export migrated transcription to backup - self.export_transcription(doc, xmlfile_basename) - - progress.refresh() - progress.stop() - - print( - "Processed %(xml)d TEI file(s). \nCreated %(created)d annotation(s)." 
-            % self.stats
-        )
-
-        # push all changes from migration to github
-        self.anno_exporter.sync_github()
-
-        # report on missing footnotes
-        if self.missing_footnotes:
-            print(
-                "Could not find footnotes for %s document%s:"
-                % (len(self.missing_footnotes), pluralize(self.missing_footnotes))
-            )
-            for xmlfile in self.missing_footnotes:
-                print("\t%s" % xmlfile)
-
-        # report on unicode normalization
-        if self.normalized_unicode:
-            print(
-                "Normalized unicode for %s document%s:"
-                % (len(self.normalized_unicode), pluralize(self.normalized_unicode))
-            )
-            for xmlfile in self.normalized_unicode:
-                print("\t%s" % xmlfile)
-
-        if self.document_not_found:
-            print(
-                "Document not found for %s TEI file%s:"
-                % (len(self.document_not_found), pluralize(self.document_not_found))
-            )
-            for xmlfile in self.document_not_found:
-                print("\t%s" % xmlfile)
-
-        # report on edition footnotes that still have content
-        # (skip when running on specified files)
-        if not options["files"]:
-            self.check_unmigrated_footnotes()
-
-    def get_footnote_editions(self, doc):
-        # extend to return digital edition or edition
-        # (digital edition if from previous run of this script)
-        return doc.footnotes.filter(
-            models.Q(doc_relation__contains=Footnote.EDITION)
-            | models.Q(doc_relation__contains=Footnote.DIGITAL_EDITION)
-        )
-
-    # we shouldn't be creating new footnotes at this point...
-    # override method from sync transcriptions to ensure we don't
-    def is_it_goitein(self, tei, doc):
-        return None
-
-    def migrate_footnote(self, footnote, document):
-        # convert existing edition footnote to digital edition
-        # OR make a new one if the existing footnote has other information
-
-        # if footnote is already a digital edition, nothing to be done
-        # (already migrated in a previous run)
-        if footnote.doc_relation == Footnote.DIGITAL_EDITION:
-            return footnote
-
-        # check if a digital edition footnote for this document+source exists,
-        # so we can avoid creating a duplicate
-        diged_footnote = document.footnotes.filter(
-            doc_relation=Footnote.DIGITAL_EDITION, source=footnote.source
-        ).first()
-
-        # if footnote has other types or a url, we should preserve it
-        if (
-            set(footnote.doc_relation).intersection(
-                {Footnote.TRANSLATION, Footnote.DISCUSSION}
-            )
-            or footnote.url
-            or footnote.location
-        ):
-            # remove interim transcription content
-            if footnote.content:
-                footnote.content = None
-                footnote.save()
-
-            # if a digital edition footnote for this document+source exists,
-            # use that instead of creating a duplicate
-            if diged_footnote:
-                return diged_footnote
-
-            # otherwise, make a new one
-            new_footnote = Footnote(
-                doc_relation=Footnote.DIGITAL_EDITION, source=footnote.source
-            )
-            # trying to set from related object footnote.document errors
-            new_footnote.content_object = document
-            new_footnote.save()
-            # log footnote creation and return
-            self.log_addition(
-                new_footnote,
-                "Created new footnote for migrated digital edition",
-            )
-            return new_footnote
-
-        # when there is no additional information on the footnote
-        else:
-            # if a digital edition already exists, remove this one
-            if diged_footnote:
-                footnote.delete()
-                # log deletion and return existing diged
-                self.log_deletion(footnote, "Removing redundant edition footnote")
-                return diged_footnote
-
-            # otherwise, convert edition to digital edition
-            footnote.doc_relation = Footnote.DIGITAL_EDITION
-            footnote.content = None
footnote.save() - # log footnote change and return - self.log_change(footnote, "Migrated footnote to digital edition") - return footnote - - # lookup to map tei git repo usernames to pgp db username for co-author string - teicontributor_to_username = { - "Alan Elbaum": "ae5677", - # multiple Bens should all map to same user - "Ben": "benj", - "Ben Johnston": "benj", - "benj@princeton.edu": "benj", - "benjohnsto": "benj", - # no github account that I can find; just use the name - "Brendan Goldman": "Brendan Goldman", - "Jessica Parker": "jp0630", - "Ksenia Ryzhova": "kryzhova", - "Rachel Richman": "rrichman", - "mrustow": "mrustow", - # multiple RSKs also... - "Rebecca Sutton Koeser": "rkoeser", - "rlskoeser": "rkoeser", - } - - @cached_property - def tei_contrib_users(self): - # retrieve users from database based on known tei contributor usernames, - # and return as a dict for lookup by username - tei_users = User.objects.filter( - username__in=set(self.teicontributor_to_username.values()) - ) - return {u.username: u for u in tei_users} - - def export_transcription(self, document, xmlfile): - # get contributors and export to git backup - - # get the unique list of all contributors to this file - commits = list(self.tei_gitrepo.iter_commits("master", paths=xmlfile)) - contributors = set([c.author.name for c in commits]) - # convert bitbucket users to unique set of pgp users - contrib_usernames = set( - self.teicontributor_to_username[c] for c in contributors - ) - # now get actual users for those usernames... - contrib_users = [self.tei_contrib_users.get(u, u) for u in contrib_usernames] - - # export transcription for the specified document, - # documenting the users who modified the TEI file - self.anno_exporter.export( - pgpids=[document.pk], - modifying_users=contrib_users, - commit_msg="Transcription migrated from TEI %s" % xmlfile, - ) - - def log_addition(self, obj, log_message): - "create a log entry documenting object creation" - return self.log_entry(obj, log_message, ADDITION) - - def log_change(self, obj, log_message): - "create a log entry documenting object change" - return self.log_entry(obj, log_message, CHANGE) - - def log_deletion(self, obj, log_message): - "create a log entry documenting object change" - return self.log_entry(obj, log_message, DELETION) - - def check_unmigrated_footnotes(self): - unmigrated_footnotes = Footnote.objects.filter( - doc_relation__contains=Footnote.EDITION, content__isnull=False - ) - if unmigrated_footnotes.exists(): - self.stdout.write( - "\n%d unmigrated footnote%s" - % (unmigrated_footnotes.count(), pluralize(unmigrated_footnotes)) - ) - for fn in unmigrated_footnotes: - # provide admin link to make it easier to investigate - admin_url = absolutize_url( - reverse("admin:footnotes_footnote_change", args=(fn.id,)) - ) - print("\t%s\t%s" % (fn, admin_url)) - - _content_types = {} - - def get_content_type(self, obj): - # lookup and cache content type for model - model_class = obj.__class__ - if model_class not in self._content_types: - self._content_types[model_class] = ContentType.objects.get_for_model( - model_class - ) - return self._content_types[model_class] - - def log_entry(self, obj, log_message, log_action): - "create a log entry documenting object creation/change/deletion" - # for this migration, we can assume user is always script user - content_type = self.get_content_type(obj) - return LogEntry.objects.log_action( - user_id=self.script_user.id, - content_type_id=content_type.pk, - object_id=obj.pk, - object_repr=str(obj), - 
change_message=log_message, - action_flag=log_action, - ) - - -def new_transcription_annotation(): - # initialize a new annotation dict object with all the defaults set - - anno = Dict() - setattr(anno, "@context", "http://www.w3.org/ns/anno.jsonld") - anno.type = "Annotation" - anno.body = [Dict()] - anno.body[0].type = "TextualBody" - # purpose on body is only needed if more than one body - # (e.g., transcription + tags in the same annotation) - # anno.body[0].purpose = "transcribing" - anno.body[0].format = "text/html" - # explicitly indicate text direction; all transcriptions are RTL - anno.body[0].TextInput = "rtl" - # supplement rather than painting over the image - # multiple motivations are allowed; add transcribing as secondary motivation - # (could use edm:transcribing from Europeana Data Model, but not sure - # how to declare edm namespace) - anno.motivation = ["sc:supplementing", "transcribing"] - - anno.target.source.type = "Canvas" - anno.target.selector.type = "FragmentSelector" - anno.target.selector.conformsTo = "http://www.w3.org/TR/media-frags/" - - return anno diff --git a/geniza/corpus/tei_transcriptions.py b/geniza/corpus/tei_transcriptions.py deleted file mode 100644 index ed3bb5d11..000000000 --- a/geniza/corpus/tei_transcriptions.py +++ /dev/null @@ -1,266 +0,0 @@ -from eulxml import xmlmap -from eulxml.xmlmap import teimap - -from geniza.common.utils import simplify_quotes - - -class GenizaTeiLine(teimap.TeiLine): - name = xmlmap.StringField("local-name(.)") - lang = xmlmap.StringField("@xml:lang|tei:span/@xml:lang") - number = xmlmap.StringField("@n") - - -class MainText(teimap.TeiDiv): - lines = xmlmap.NodeListField("tei:l|tei:label", GenizaTeiLine) - - -class GenizaTei(teimap.Tei): - # extend eulxml TEI to add mappings for the fields we care about - # NOTE: at least one pgpid is in format ### + ### - pgpid = xmlmap.IntegerField('tei:teiHeader//tei:idno[@type="PGP"]') - # normally main text content is under text/body/div; but at least one document has no div - text = xmlmap.NodeField( - "tei:text/tei:body/tei:div|tei:text/tei:body[not(tei:div)]", MainText - ) - lines = xmlmap.NodeListField("tei:text/tei:body/tei:div/tei:l", GenizaTeiLine) - labels = xmlmap.NodeListField( - "tei:text/tei:body/tei:div/tei:label", GenizaTeiLine - ) # not really a line... - # source description sometimes contains reference to scholarship record - source = xmlmap.NodeListField( - "tei:teiHeader//tei:sourceDesc/tei:msDesc/tei:msContents/tei:p", GenizaTeiLine - ) - # for documents with more than one transcription, authors have been - # tagged with last name in n attribute to allow identifying/differentiating - source_authors = xmlmap.StringListField( - "tei:teiHeader//tei:sourceDesc//tei:author/@n" - ) - - def no_content(self): - return str(self.text).strip() == "" - - # text that generally indicates a new page/image, anywhere in the label - new_page_indicators = [ - "recto", - "verso", - "side ii", - "page b", - "page 2", - "page two", - 'ע"ב', # Hebrew label for page 2 - ] - # text that indicates a new page/image at the start of the label - new_page_start_indicators = ["t-s ", "ts ", "ena ", "moss. 
"] - - def label_indicates_new_page(self, label): - label = simplify_quotes(label.lower()) - return any( - [side_label in label for side_label in self.new_page_indicators] - ) or any( - label.startswith(start_label) - for start_label in self.new_page_start_indicators - ) - - def labels_only(self): - text_content = str(self.text).strip() - label_content = " ".join([str(label).strip() for label in self.labels]) - return text_content == label_content - - def text_to_html(self, block_format=False): - # convert the TEI text content to basic HTML - blocks = [] - lines = [] - label = [] - # because blocks are indicated by labels without containing elements, - # iterate over all lines and create blocks based on the labels - for line in self.text.lines: - if line.name == "label": - # append current text block if set, and initialize a new one - if lines: - blocks.append( - { - "label": "\n".join(label), - "lines": lines, - # "languages": list(languages), - } - ) - label = [] - lines = [] - - # store the label; sometimes there are two in a row - label.append(str(line)) - - elif line.name == "l": - # use language codes? unreliable in the xml - # append tuple of line number, text - # return empty string for line number if no line attribute - lines.append((line.number or "", str(line))) - - # append the last block - if lines: - blocks.append( - { - "label": "\n".join(label), - "lines": lines, - } - ) - - # if block format requested, return blocks without further processing - if block_format: - return blocks - - # otherwise, return chunked HTML - return self.chunk_html(blocks) - - def chunk_html(self, blocks): - # combine blocks of text into html, chunked into pages to match sides of images - html = [] - page = [] - for block in blocks: - - # if there is a label and it looks like a new side, - # start a new section - if block["label"]: - if self.label_indicates_new_page(block["label"]): - # if we have any content, close the previous section - if page: - # combine all sections in the page and add to the html - html.append("\n".join(page)) - # then start a new page - page = [] - - # start output for the new block - output = ["
"] - # add label if we have one - if block["label"]: - output.append(f"

{block['label']}

") - - output.append(self.lines_to_html(block["lines"])) - output.append("
") - page.append("\n".join(output)) - - # save the last page - html.append("\n".join(page)) - - return html - - def lines_to_html(self, lines): - """Convert lines and line numbers from TEI to HTML, accounting - for unnumbered lines and lines starting with numbers other than 1. - Converts to ordered lists and paragraphs; ordered lists have - start attribute when needed. - - :params lines: list of tuples of line number, line text - :returns: string of html content - """ - - html_lines = [] - list_num = 1 - in_list = False - for line_number, line in lines: - # convertline number to integer for comparison - if line_number: - try: - line_number = int(line_number) - except ValueError: - # in at least one instance, line number is a range "16-17" - # ignore the problem (??) - if "-" in line_number: - line_number = int(line_number.split("-")[0]) - - # if line is empty, skip it - if not line.strip(): - continue - - # if line is unnumberred, output as a paragraph - if not line_number: - # if we were in a list, close it - if in_list: - html_lines.append("") - in_list = False - list_num = 1 - html_lines.append("

%s

" % line) - - # if line number is 1, start a new list - elif line_number == 1: - # close any preceeding list - if in_list: - html_lines.append("") - - in_list = True - list_num = 1 - html_lines.append("
    ") - html_lines.append("
  1. %s
  2. " % line) - # if the line number matches expected next value, output as line - elif line_number == list_num: - html_lines.append("
  3. %s
  4. " % line) - - # if line number does not match expected list number, - # start a new list with start attribute specified - else: - # close existing list if any - if in_list: - html_lines.append("
") - - # start a new list with the specified number IF numeric - if isinstance(line_number, int): - list_num = line_number - in_list = True - html_lines.append('
    ' % line_number) - html_lines.append("
  1. %s
  2. " % line) - else: - # if not numeric, we can't use as line number or start - html_lines.append("
      ") - # add the n to text to preserve the content - html_lines.append("
    1. %s %s
    2. " % (line_number, line)) - - # increment expected list number if we're inside a list - if in_list: - list_num += 1 - - # close the last list, if active - if in_list: - html_lines.append("
    ") - - return "\n".join(html_lines) - - rtl_mark = "\u200F" - ltr_mark = "\u200E" - - def text_to_plaintext(self): - lines = [] - # because blocks are indicated by labels without containing elements, - # iterate over all lines and create blocks based on the labels - - # errors if there are no lines; sync transcription now checks - # and won't call in that case - if not self.text.lines: - return - - # determine longest line so we can pad the text - longest_line = max(len(str(line)) for line in self.text.lines) - # some files have descriptions that are making lines much too long, - # so set a limit on line length - if longest_line > 100: - longest_line = 100 - for line in self.text.lines: - if line.name == "label": - # blank line to indicate breaks between blocks - lines.append("") - lines.append("%s%s" % (self.ltr_mark, line)) - elif line.name == "l": - line_num = line.number or "" - # combine line text with line number and right justify; - # right justify line number - lines.append( - " ".join( - [ - self.rtl_mark, - str(line).rjust(longest_line), - self.ltr_mark, - line_num.rjust(3), - ] - ) - ) - - return "\n".join(lines) diff --git a/geniza/corpus/tests/test_tei_transcription.py b/geniza/corpus/tests/test_tei_transcription.py deleted file mode 100644 index 8a1b71258..000000000 --- a/geniza/corpus/tests/test_tei_transcription.py +++ /dev/null @@ -1,150 +0,0 @@ -import os.path - -from eulxml import xmlmap - -from geniza.corpus.tei_transcriptions import GenizaTei - -fixture_dir = os.path.join(os.path.dirname(__file__), "fixtures") - -xmlfile = os.path.join(fixture_dir, "968.xml") - - -def test_fields(): - tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei) - assert tei.pgpid == 968 - # should have text, lines, and labels - assert tei.text - assert tei.lines - assert tei.labels - assert len(tei.labels) == 4 - assert tei.source_authors == ["Gil"] - - -def test_no_content(): - tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei) - # this file has text content - assert not tei.no_content() - - # if we delete the lines and labels, it does not - tei.lines = [] - tei.labels = [] - assert tei.no_content() - - -def test_labels_only(): - tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei) - # fixture has both labels and lines - assert not tei.labels_only() - - # delete all the line elemens so only labels are left - while len(tei.lines): - del tei.lines[0] - - # now labels only is true - assert tei.labels_only() - - -def test_block_format(): - tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei) - blocks = tei.text_to_html(block_format=True) - - # should be a list of three items (three sets of lines separated by