diff --git a/.github/workflows/lighthouse.yml b/.github/workflows/lighthouse.yml index 9c0e14e62..7b08d9dc2 100644 --- a/.github/workflows/lighthouse.yml +++ b/.github/workflows/lighthouse.yml @@ -23,7 +23,7 @@ jobs: ports: - 8983:8983 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Decrypt encrypted fonts zip run: | gpg --quiet --batch --yes --decrypt --passphrase="${{ secrets.GPG_PASSPHRASE }}" --output sitemedia/fonts.zip sitemedia/fonts.zip.gpg @@ -37,11 +37,9 @@ jobs: docker exec -d ${{ job.services.solr.id }} cp -r /opt/solr/server/solr/configsets /var/solr/data docker exec ${{ job.services.solr.id }} solr create -c geniza -n geniza - run: sudo apt install gettext - - run: echo "PYTHON_VERSION=$(cat .python-version)" >> $GITHUB_ENV - - uses: actions/setup-python@v2 - with: - python-version: ${{ env.PYTHON_VERSION }} - - uses: actions/cache@v2 + - uses: actions/setup-python@v5 + # uses version set in .python-version + - uses: actions/cache@v4 with: path: ~/.cache/pip key: pip-${{ hashFiles('requirements/*.txt') }} @@ -53,7 +51,7 @@ jobs: - uses: actions/setup-node@v2 with: node-version: 16 - - uses: actions/cache@v2 + - uses: actions/cache@v4 with: path: ~/.npm key: npm-${{ hashFiles('package-lock.json') }} diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 347b4ed9d..9e7283683 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,37 @@ Change Log ========== +4.17 +---- + +- public site + - As a public site user, I would like to see date ranges separated with an en-dash (–) instead of an em-dash (—). + - As a front end user, I only want to see one document number for a source displayed in the scholarship records on the public site. + - As a frontend user, I want to see dating information displayed on document details when available, so that I can find out the time frame of a document when it is known. + - bugfix: Double quotes search returning unexpected results + - bugfix: Issues with shelfmark scoped search + - bugfix: Highlighting context shows entire transcription or translation in search result + - bugfix: Transcription search results not always formatted correctly + - bugfix: Bracket and other character search is functioning unpredictably + - bugfix: Incorrect words are highlighted in complete word quotation search (Hebrew script) + - bugfix: Some partial search results in description not boosted by relevancy + - chore: accessibility issues flagged by DubBot + +- image, transcription, translation viewer/editor + - As a transcription editor, I should see an error if I try to update an annotation with out of date content so that I don't overwrite someone else's changes. + - bugfix: Autofill for source search (when inputting a transcription source) not functioning properly + +- admin + - As a content editor, I want to record places-to-places relationships on the place page and on the document detail page, so that I can track ambiguity. + - As a content admin, I want to drop down a pin on a map and then be able to move the pin around so that I can manually adjust the coordinates of a place before saving the location. + - As a content editor, I want there to be a notes field in the places pages so that I can add more detail about places that are hard-to-find. + - As a content admin, I want a provenance field on the document detail page so that I can note the origin and acquisition history of fragments when available. 
+ - As a content editor, I want clearer help text for the name field of the person page so I know how best to present people's names on their pages + - As a content editor, I would like to see Historic Shelfmark on the Document edit page, to ensure that my work is correct when working with old scholarship. + - bugfix: Full shelfmark search for multiple shelfmarks not working in admin + - bugfix: Invalid lat/long coordinates are allowed for Places, but don't persist + - bugfix: People names are not diacritic neutral when adding them from Document Detail page + 4.16.1 ------ diff --git a/DEPLOYNOTES.md b/DEPLOYNOTES.md index d8920acfc..b007fc9f0 100644 --- a/DEPLOYNOTES.md +++ b/DEPLOYNOTES.md @@ -1,5 +1,12 @@ # Deploy Notes +## 4.17 + +- Solr configuration has changed. Ensure Solr configset has been updated + and then reindex all content: `python manage.py index` +- Configure **MAPTILER_API_TOKEN** in local settings for maps to appear. +- Anywhere that Node versions are being managed manually, NPM should be upgraded to 8.x, at least 8.1.0. + ## 4.16 - Import Bodleian catalog numbers from a spreadsheet using the script diff --git a/README.rst b/README.rst index 855d9608b..09375ea68 100644 --- a/README.rst +++ b/README.rst @@ -19,10 +19,6 @@ Python 3.9 / Django 3.2 / Node 16 / Postgresql / Solr 9.2 :target: https://codecov.io/gh/Princeton-CDH/geniza :alt: Code coverage -.. image:: https://requires.io/github/Princeton-CDH/geniza/requirements.svg?branch=main - :target: https://requires.io/github/Princeton-CDH/geniza/requirements/?branch=main - :alt: Requirements Status - .. image:: https://github.com/Princeton-CDH/geniza/workflows/dbdocs/badge.svg :target: https://dbdocs.io/princetoncdh/geniza :alt: dbdocs build diff --git a/geniza/__init__.py b/geniza/__init__.py index 509d2d231..cf15af50b 100644 --- a/geniza/__init__.py +++ b/geniza/__init__.py @@ -1,4 +1,4 @@ -__version_info__ = (4, 16, 1, None) +__version_info__ = (4, 17, 0, None) # Dot-connect all but the last. Last is dash-connected if not None. 
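For context on the **MAPTILER_API_TOKEN** deploy note above: the admin change that follows reads the token defensively, so maps simply do not render when the setting is absent. A minimal sketch of the expected wiring, assuming a local settings file (the file name and token value are placeholders, not part of this changeset; only the getattr pattern comes from the diff below):

    # local_settings.py (location assumed; value is a placeholder)
    MAPTILER_API_TOKEN = "your-maptiler-api-key"

    # lookup pattern used in geniza/admin.py below: default to an empty string
    # so admin templates can test for the token and degrade gracefully
    from django.conf import settings

    maptiler_token = getattr(settings, "MAPTILER_API_TOKEN", "")
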
diff --git a/geniza/admin.py b/geniza/admin.py index ac943f53a..a2bdc7540 100644 --- a/geniza/admin.py +++ b/geniza/admin.py @@ -1,3 +1,4 @@ +from django.conf import settings from django.contrib import admin from geniza.corpus.models import Document, Fragment @@ -29,4 +30,7 @@ def each_context(self, request): : self.REVIEW_PREVIEW_MAX ] + # add maptiler token if we have one + context["maptiler_token"] = getattr(settings, "MAPTILER_API_TOKEN", "") + return context diff --git a/geniza/annotations/conftest.py b/geniza/annotations/conftest.py index 0945ce078..c18effbdc 100644 --- a/geniza/annotations/conftest.py +++ b/geniza/annotations/conftest.py @@ -62,6 +62,7 @@ def annotation_json(document, source): } } }, + "motivation": ["sc:supplementing", "transcribing"], "dc:source": source.uri, } diff --git a/geniza/annotations/migrations/0006_annotation_block.py b/geniza/annotations/migrations/0006_annotation_block.py new file mode 100644 index 000000000..56d066661 --- /dev/null +++ b/geniza/annotations/migrations/0006_annotation_block.py @@ -0,0 +1,23 @@ +# Generated by Django 3.2.16 on 2024-02-07 18:00 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("annotations", "0005_annotation_cleanup_nbsp"), + ] + + operations = [ + migrations.AddField( + model_name="annotation", + name="block", + field=models.ForeignKey( + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="lines", + to="annotations.annotation", + ), + ), + ] diff --git a/geniza/annotations/models.py b/geniza/annotations/models.py index c6c0ba8b7..e893509fc 100644 --- a/geniza/annotations/models.py +++ b/geniza/annotations/models.py @@ -1,3 +1,5 @@ +import hashlib +import json import re import uuid from collections import defaultdict @@ -59,7 +61,18 @@ def group_by_manifest(self): class Annotation(TrackChangesModel): - """Annotation model for storing annotations in the database.""" + """Annotation model for storing annotations in the database. + + Annotations may be either block-level or line-level. Block-level annotation is the default; + in most cases, a block-level annotation's content is stored as a TextualBody in its `content` + JSON. + + However, block-level annotations may also be used to group line-level annotations, in which case + they have no textual content themselves, except for an optional label. Instead, their content + is serialized by joining TextualBody values from all associated line-level annotations. + + Line-level annotations are associated with blocks via the `block` property, and that relationship + is serialized as `partOf` at the root of the line-level annotation.""" #: annotation id (uuid, autogenerated when created) id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -67,7 +80,10 @@ class Annotation(TrackChangesModel): created = models.DateTimeField(auto_now_add=True) #: date last modified modified = models.DateTimeField(auto_now=True) - #: json content of the annotation + #: json content of the annotation. 
in addition to W3C Web Annotation Data Model properties, + #: may also include: "schema:position", which tracks the order of the annotation with respect + #: to others in the same block or canvas; and "textGranularity", which indicates whether this + #: is a block- or line-level annotation content = models.JSONField() #: optional canonical identifier, for annotations imported from another source canonical = models.CharField(max_length=255, blank=True) @@ -79,6 +95,14 @@ class Annotation(TrackChangesModel): on_delete=models.CASCADE, null=False, ) + #: block-level annotation associated with this (line-level) annotation. if null, this is a + #: block-level annotation. if a block is deleted, all associated lines will be deleted. + block = models.ForeignKey( + to="Annotation", + on_delete=models.CASCADE, + related_name="lines", + null=True, + ) # use custom manager & queryset objects = AnnotationQuerySet.as_manager() @@ -140,6 +164,29 @@ def body_content(self): except IndexError: pass + @cached_property + def block_content_html(self): + """convenience method to get HTML content, including label and any associated lines, + of a block-level annotation, as a list of HTML strings""" + content = [] + if self.label: + content.append(f"
<h3>{self.label}</h3>") + if self.has_lines: + # if this block annotation has separate line annotations, serialize as ordered list + content.append("<ol>
    ") + for l in self.lines.all().order_by("content__schema:position"): + content.append(f"
<li>{l.body_content}</li>") + content.append("</ol>
") + elif self.body_content: + content.append(self.body_content) + return content + + @cached_property + def has_lines(self): + """cached property to indicate whether or not this is a block-level + annotation with line-level children""" + return self.lines.exists() + def set_content(self, data): """Set or update annotation content and model fields. @@ -148,7 +195,7 @@ def set_content(self, data): and the data will not be saved. """ # remove any values tracked on the model; redundant in json field - for val in ["id", "created", "modified", "@context", "type"]: + for val in ["id", "created", "modified", "@context", "type", "etag"]: if val in data: del data[val] @@ -202,6 +249,18 @@ def sanitize_html(cls, html): else: return cleaned_html + @property + def etag(self): + """Compute and return an md5 hash of content to use as an ETag. + + NOTE: Only :attr:`content` can be modified in the editor, so it is the only hashed + attribute. If other attributes become mutable, modify this function to include them in + the ETag computation.""" + # must be a string encoded as utf-8 to compute md5 hash + content_str = json.dumps(self.content, sort_keys=True).encode("utf-8") + # ETag should be wrapped in double quotes, per Django @condition docs + return f'"{hashlib.md5(content_str).hexdigest()}"' + def compile(self, include_context=True): """Combine annotation data and return as a dictionary that can be serialized as JSON. Includes context by default, @@ -213,6 +272,11 @@ def compile(self, include_context=True): anno = {} if include_context: anno = {"@context": "http://www.w3.org/ns/anno.jsonld"} + else: + # NOTE: ETag required here for inclusion in annotation list, which is how + # annotations are fetched during editing; need to associate each ETag with + # an individual annotation, for comparison with response ETag on POST/DELETE + anno = {"etag": self.etag} # define fields in desired order anno.update( @@ -246,4 +310,9 @@ def compile(self, include_context=True): # overwrite with the base annotation data in case of any collisions # between content and model fields anno.update(base_anno) + + # if this is a line-level annotation with block, include in content + if self.block: + anno.update({"partOf": self.block.uri()}) + return anno diff --git a/geniza/annotations/tests/test_annotations_models.py b/geniza/annotations/tests/test_annotations_models.py index 4853be50d..9d1c7eabb 100644 --- a/geniza/annotations/tests/test_annotations_models.py +++ b/geniza/annotations/tests/test_annotations_models.py @@ -18,6 +18,27 @@ def test_get_absolute_url(self): anno = Annotation() assert anno.get_absolute_url() == "/annotations/%s/" % anno.pk + def test_etag(self, annotation): + old_etag = annotation.etag + # should be surrounded by doublequotes + assert old_etag[0] == old_etag[-1] == '"' + # should be length of an md5 hash + two characters + assert len(old_etag) == 34 + # changing content should change etag + annotation.content.update( + { + "foo": "bar", + "id": "bogus", + "created": "yesterday", + "modified": "today", + } + ) + assert annotation.etag != old_etag + new_etag = annotation.etag + # changing other properties on the annotation should not change etag + annotation.footnote = Footnote() + assert annotation.etag == new_etag + @pytest.mark.django_db def test_uri(self): anno = Annotation() @@ -31,12 +52,13 @@ def test_set_content(self, source, document): "id": absolutize_url("/annotations/1"), "type": "Annotation", "foo": "bar", + "etag": "abcd1234", } anno = Annotation(footnote=footnote) 
anno.set_content(content) # check that appropriate fields were removed - for field in ["@context", "id", "type"]: + for field in ["@context", "id", "type", "etag"]: assert field not in anno.content # remaining content was set assert anno.content["foo"] == "bar" @@ -116,6 +138,18 @@ def test_compile(self, annotation): assert compiled["canonical"] == annotation.canonical assert compiled["via"] == annotation.via + line = Annotation.objects.create( + footnote=annotation.footnote, block=annotation, content={} + ) + compiled = line.compile() + assert compiled["partOf"] == annotation.uri() + + # when include_context=False (i.e. part of a list), should include etag, since + # we need a way to associate individual ETag to each item returned in list response + compiled = line.compile(include_context=False) + assert compiled["etag"] == line.etag + assert "@context" not in compiled + def test_sanitize_html(self): html = '
<table><tr><td><div style="color: red;">test</div></td></tr></table><ol><li>line</li></ol>
' # should strip out all unwanted elements and attributes (table, div, style) @@ -134,6 +168,40 @@ def test_sanitize_html(self): html = "
<p>text\xa0and more \xa0 text</p>" assert Annotation.sanitize_html(html) == "<p>text and more text</p>
" + def test_block_content_html(self, annotation): + annotation.content["body"][0]["label"] = "Test label" + # should include label and content + block_html = annotation.block_content_html + assert len(block_html) == 2 + assert block_html[0] == "
<h3>Test label</h3>
" + assert block_html[1] == annotation.body_content + + # with associated lines, should produce ordered list + del annotation.content["body"][0]["value"] + line_1 = Annotation.objects.create( + block=annotation, + content={"body": [{"value": "Line 1"}], "schema:position": 1}, + footnote=annotation.footnote, + ) + line_2 = Annotation.objects.create( + block=annotation, + content={"body": [{"value": "Line 2"}], "schema:position": 2}, + footnote=annotation.footnote, + ) + + # invalidate cached properties + del annotation.has_lines + del annotation.block_content_html + + # should now show that it has lines and produce the ordered list + assert annotation.has_lines == True + block_html = annotation.block_content_html + assert len(block_html) == 5 + assert block_html[0] == "
<h3>Test label</h3>" + assert block_html[1] == "<ol>" + assert block_html[2] == f"
<li>{line_1.body_content}</li>" + assert block_html[3] == f"
<li>{line_2.body_content}</li>" + @pytest.mark.django_db class TestAnnotationQuerySet: diff --git a/geniza/annotations/tests/test_annotations_views.py b/geniza/annotations/tests/test_annotations_views.py index 4b51fca10..88df624d3 100644 --- a/geniza/annotations/tests/test_annotations_views.py +++ b/geniza/annotations/tests/test_annotations_views.py @@ -8,7 +8,7 @@ from pytest_django.asserts import assertContains, assertNotContains from geniza.annotations.models import Annotation -from geniza.annotations.views import AnnotationResponse +from geniza.annotations.views import AnnotationDetail, AnnotationResponse from geniza.corpus.models import Document from geniza.footnotes.models import Footnote, Source, SourceType @@ -362,6 +362,55 @@ def test_corresponding_footnote_location(self, admin_client, document): # should have its location copied from the existing Edition footnote assert created_digital_edition.location == "doc. 123" + def test_get_etag(self, annotation): + # nonexistent annotation should return None + view = AnnotationDetail() + view.kwargs = {"pk": 1234} + assert view.get_etag(request=None) == None + # otherwise should return ETag + view.kwargs = {"pk": annotation.pk} + assert view.get_etag(request=None) == annotation.etag + + def test_get_last_modified(self, annotation): + view = AnnotationDetail() + view.kwargs = {"pk": 1234} + # nonexistent annotation should return None + assert view.get_last_modified(request=None) == None + # otherwise should return last modified + view.kwargs = {"pk": annotation.pk} + assert view.get_last_modified(request=None) == annotation.modified + + def test_dispatch(self, admin_client, annotation): + # integration test for conditional response + # get current etag + old_etag = annotation.etag + + # change content + annotation.content = { + **annotation.content, + "body": [{"value": "changed"}], + } + annotation.save() + + # attempt to update annotation with POST request as admin, including the old + # ETag in If-Match header + response = admin_client.post( + annotation.get_absolute_url(), + json.dumps(annotation.content), + content_type="application/json", + HTTP_IF_MATCH=old_etag, + ) + # should result in 412 Precondition Failed status + assert response.status_code == 412 + + # attempt to delete annotation with DELETE request as admin, including the old + # ETag in If-Match header + response = admin_client.delete( + annotation.get_absolute_url(), HTTP_IF_MATCH=old_etag + ) + # should result in 412 Precondition Failed status + assert response.status_code == 412 + @pytest.mark.django_db class TestAnnotationSearch: diff --git a/geniza/annotations/views.py b/geniza/annotations/views.py index 212eda569..b13ed7dfe 100644 --- a/geniza/annotations/views.py +++ b/geniza/annotations/views.py @@ -6,7 +6,8 @@ from django.contrib.auth.mixins import AccessMixin, PermissionRequiredMixin from django.contrib.contenttypes.models import ContentType from django.core.exceptions import BadRequest -from django.http import HttpResponse, JsonResponse +from django.http import Http404, HttpResponse, JsonResponse +from django.views.decorators.http import condition from django.views.generic.base import View from django.views.generic.detail import SingleObjectMixin from django.views.generic.list import MultipleObjectMixin @@ -269,7 +270,10 @@ def get(self, request, *args, **kwargs): class AnnotationDetail( - PermissionRequiredMixin, ApiAccessMixin, View, SingleObjectMixin + PermissionRequiredMixin, + ApiAccessMixin, + View, + SingleObjectMixin, ): """View to read, update, or delete a single
annotation.""" @@ -293,7 +297,6 @@ def get_permission_required(self): def post(self, request, *args, **kwargs): """update the annotation on POST""" - # NOTE: should use etag / if-match anno = self.get_object() try: anno_data = parse_annotation_data(request=request) @@ -316,7 +319,6 @@ def post(self, request, *args, **kwargs): def delete(self, request, *args, **kwargs): """delete the annotation on DELETE""" - # should use etag / if-match # deleted uuid should not be reused (relying on low likelihood of uuid collision) anno = self.get_object() # create log entry to document deletion *BEFORE* deleting @@ -352,3 +354,33 @@ def delete(self, request, *args, **kwargs): footnote.refresh_from_db() return HttpResponse(status=204) + + def get_etag(self, request, *args, **kwargs): + """Get etag from annotation""" + try: + if not hasattr(self, "object"): + self.object = self.get_object() + anno = self.object + return anno.etag + except Http404: + return None + + def get_last_modified(self, request, *args, **kwargs): + """Return last modified :class:`datetime.datetime`""" + try: + if not hasattr(self, "object"): + self.object = self.get_object() + anno = self.object + return anno.modified + except Http404: + return None + + def dispatch(self, request, *args, **kwargs): + """Wrap the dispatch method to add ETag/last modified headers when + appropriate, then return a conditional response.""" + + @condition(etag_func=self.get_etag, last_modified_func=self.get_last_modified) + def _dispatch(request, *args, **kwargs): + return super(AnnotationDetail, self).dispatch(request, *args, **kwargs) + + return _dispatch(request, *args, **kwargs) diff --git a/geniza/common/migrations/0009_install_unaccent.py b/geniza/common/migrations/0009_install_unaccent.py new file mode 100644 index 000000000..2687596ae --- /dev/null +++ b/geniza/common/migrations/0009_install_unaccent.py @@ -0,0 +1,14 @@ +# Generated by Django 3.2.23 on 2024-02-29 18:02 + +from django.contrib.postgres.operations import UnaccentExtension +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("common", "0008_preload_github_coauthors"), + ] + + operations = [ + UnaccentExtension(), + ] diff --git a/geniza/corpus/admin.py b/geniza/corpus/admin.py index 8d079e35b..17308e437 100644 --- a/geniza/corpus/admin.py +++ b/geniza/corpus/admin.py @@ -18,12 +18,14 @@ from geniza.annotations.models import Annotation from geniza.common.admin import custom_empty_field_list_filter -from geniza.corpus.dates import DocumentDateMixin +from geniza.corpus.dates import DocumentDateMixin, standard_date_display +from geniza.corpus.forms import DocumentPersonForm, DocumentPlaceForm from geniza.corpus.metadata_export import AdminDocumentExporter, AdminFragmentExporter from geniza.corpus.models import ( Collection, Dating, Document, + DocumentEventRelation, DocumentType, Fragment, LanguageScript, @@ -32,7 +34,7 @@ from geniza.corpus.solr_queryset import DocumentSolrQuerySet from geniza.corpus.views import DocumentMerge from geniza.entities.admin import PersonInline, PlaceInline -from geniza.entities.models import DocumentPlaceRelation, PersonDocumentRelation +from geniza.entities.models import DocumentPlaceRelation, Event, PersonDocumentRelation from geniza.footnotes.admin import DocumentFootnoteInline from geniza.footnotes.models import Footnote @@ -131,6 +133,7 @@ class DocumentTextBlockInline(SortableInlineAdminMixin, admin.TabularInline): readonly_fields = ( "thumbnail", "side", + "fragment_provenance", ) fields = ( "fragment", 
@@ -139,6 +142,7 @@ class DocumentTextBlockInline(SortableInlineAdminMixin, admin.TabularInline): "region", "order", "certain", + "fragment_provenance", "thumbnail", "selected_images", ) @@ -149,6 +153,10 @@ class DocumentTextBlockInline(SortableInlineAdminMixin, admin.TabularInline): ArrayField: {"widget": HiddenInput()}, # hidden input for selected_images } + @admin.display(description="Provenance") + def fragment_provenance(self, obj): + return obj.fragment.provenance + class DocumentForm(forms.ModelForm): class Meta: @@ -358,12 +366,30 @@ class DocumentPersonInline(PersonInline): """Inline for people related to a document""" model = PersonDocumentRelation + form = DocumentPersonForm class DocumentPlaceInline(PlaceInline): """Inline for places related to a document""" model = DocumentPlaceRelation + form = DocumentPlaceForm + + +class DocumentEventInline(admin.TabularInline): + """Inline for events related to a document""" + + autocomplete_fields = ("event",) + fields = ("event", "notes") + model = DocumentEventRelation + min_num = 0 + extra = 1 + show_change_link = True + verbose_name = "Related Event" + verbose_name_plural = "Related Events" + formfield_overrides = { + TextField: {"widget": Textarea(attrs={"rows": "4"})}, + } @admin.register(Document) @@ -392,6 +418,7 @@ class DocumentAdmin(TabbedTranslationAdmin, SortableAdminBase, admin.ModelAdmin) "view_old_pgpids", "standard_date", "admin_thumbnails", + "fragment_historical_shelfmarks", ) search_fields = ( "fragments__shelfmark", @@ -410,6 +437,12 @@ class DocumentAdmin(TabbedTranslationAdmin, SortableAdminBase, admin.ModelAdmin) def view_old_pgpids(self, obj): return ",".join([str(pid) for pid in obj.old_pgpids]) if obj.old_pgpids else "-" + @admin.display( + description="Standard date", + ) + def standard_date(self, obj): + return standard_date_display(obj.doc_date_standard) + list_filter = ( "doctype", HasTranscriptionListFilter, @@ -437,7 +470,12 @@ def view_old_pgpids(self, obj): None, { "fields": ( - ("shelfmark", "id", "view_old_pgpids"), + ( + "shelfmark", + "id", + "view_old_pgpids", + "fragment_historical_shelfmarks", + ), "shelfmark_override", "doctype", ("languages", "secondary_languages"), @@ -481,6 +519,7 @@ def view_old_pgpids(self, obj): DocumentFootnoteInline, DocumentPersonInline, DocumentPlaceInline, + DocumentEventInline, ] # mixed fieldsets and inlines: /templates/admin/snippets/mixed_inlines_fieldsets.html fieldsets_and_inlines_order = ( @@ -493,6 +532,7 @@ def view_old_pgpids(self, obj): "i", # DocumentFootnoteInline "i", # DocumentPersonInline "i", # DocumentPlaceInline + "i", # DocumentEventInline ) class Media: @@ -714,7 +754,7 @@ class Media: class FragmentAdmin(admin.ModelAdmin): list_display = ("shelfmark", "collection_display", "url", "is_multifragment") search_fields = ("shelfmark", "old_shelfmarks", "notes", "needs_review") - readonly_fields = ("created", "last_modified") + readonly_fields = ("created", "last_modified", "iiif_provenance") list_filter = ( ("url", custom_empty_field_list_filter("IIIF image", "Has image", "No image")), ( @@ -731,6 +771,8 @@ class FragmentAdmin(admin.ModelAdmin): "collection", ("url", "iiif_url"), "is_multifragment", + "provenance", + "iiif_provenance", "notes", "needs_review", ("created", "last_modified"), diff --git a/geniza/corpus/annotation_export.py b/geniza/corpus/annotation_export.py index a21bac17a..464e179c0 100644 --- a/geniza/corpus/annotation_export.py +++ b/geniza/corpus/annotation_export.py @@ -379,7 +379,9 @@ def filename(document, source, fn_type): # 
filename based on pgpid and source authors; # explicitly label as transcription/translation for context authors = [a.creator.last_name for a in source.authorship_set.all()] or [ - "unknown author" + "machine-generated" + if "model" in source.source_type.type + else "unknown author" ] return "PGPID%(pgpid)s_s%(source_id)d_%(authors)s_%(text_type)s" % { diff --git a/geniza/corpus/dates.py b/geniza/corpus/dates.py index 28a42e1f9..fafade4b6 100644 --- a/geniza/corpus/dates.py +++ b/geniza/corpus/dates.py @@ -126,6 +126,27 @@ def numeric_format(self, mode="min"): See :meth:`isoformat` for more details.""" return self.isoformat(mode, "numeric") + @staticmethod + def get_date_range(old_range, new_range): + """Compute the union (widest possible date range) between two PartialDate ranges.""" + minmax = old_range + [start, end] = new_range + + # use numeric format to compare to current min, replace if smaller + start_numeric = int(start.numeric_format(mode="min")) + min = minmax[0] + if min is None or start_numeric < int(min.numeric_format(mode="min")): + # store as PartialDate, not numeric format + minmax[0] = start + # use numeric format to compare to current max, replace if larger + end_numeric = int(end.numeric_format(mode="max")) + max = minmax[1] + if max is None or end_numeric > int(max.numeric_format(mode="max")): + # store as PartialDate, not numeric format + minmax[1] = end + + return minmax + class DocumentDateMixin(TrackChangesModel): """Mixin for document date fields (original and standardized), @@ -183,31 +204,11 @@ def original_date(self): [self.doc_date_original, self.get_doc_date_calendar_display()] ).strip() - @property - def standard_date(self): - """Display standard date in human readable format, when set.""" - # bail out if there is nothing to display - if not self.doc_date_standard: - return - - # currently storing in isoformat, with slash if a date range - dates = self.doc_date_standard.split("/") - # we should always have at least one date, if date is set - # convert to local partial date object for precision-aware string formatting - # join dates with n-dash if more than one; - # add CE to the end to make calendar system explicit - try: - return "%s CE" % " — ".join(str(PartialDate(d)) for d in dates) - except ValueError: - # dates entered before validation was applied may not parse - # as fallback, display as is - return "%s CE" % self.doc_date_standard - @property def document_date(self): """Generate formatted display of combined original and standardized dates""" if self.doc_date_standard: - standardized_date = self.standard_date + standardized_date = standard_date_display(self.doc_date_standard) # add parentheses to standardized date if original date is also present if self.original_date: # NOTE: we want no-wrap for individual dates when displaying as html @@ -528,3 +529,23 @@ def get_islamic_month(month_name): with or without accents, and supports local month-name overrides.""" month_name = unidecode(month_name) return islamic_months.index(islamic_month_aliases.get(month_name, month_name)) + 1 + + +def standard_date_display(standard_date): + """Display a standardized CE date in human readable format.""" + # bail out if there is nothing to display + if not standard_date: + return + + # currently storing in isoformat, with slash if a date range + dates = standard_date.split("/") + # we should always have at least one date, if date is set + # convert to local partial date object for precision-aware string formatting + # join dates with en-dash if more than one; + # add CE 
to the end to make calendar system explicit + try: + return "%s CE" % " – ".join(str(PartialDate(d)) for d in dates) + except ValueError: + # dates entered before validation was applied may not parse + # as fallback, display as is + return "%s CE" % standard_date diff --git a/geniza/corpus/forms.py b/geniza/corpus/forms.py index ffe421393..400a5c1e9 100644 --- a/geniza/corpus/forms.py +++ b/geniza/corpus/forms.py @@ -1,3 +1,4 @@ +from dal import autocomplete from django import forms from django.db.models import Count from django.template.loader import get_template @@ -8,6 +9,7 @@ from geniza.common.fields import RangeField, RangeForm, RangeWidget from geniza.common.utils import simplify_quotes from geniza.corpus.models import Document, DocumentType +from geniza.entities.models import DocumentPlaceRelation, PersonDocumentRelation class SelectDisabledMixin: @@ -153,7 +155,6 @@ class YearRangeWidget(RangeWidget): class DocumentSearchForm(RangeForm): - q = forms.CharField( label="Keyword or Phrase", required=False, @@ -413,3 +414,33 @@ def __init__(self, *args, **kwargs): ).annotate( item_count=Count("taggit_taggeditem_items", distinct=True), ) + + +class DocumentPersonForm(forms.ModelForm): + class Meta: + model = PersonDocumentRelation + fields = ( + "person", + "type", + "notes", + ) + widgets = { + "notes": forms.Textarea(attrs={"rows": 4}), + "person": autocomplete.ModelSelect2(url="entities:person-autocomplete"), + "type": autocomplete.ModelSelect2(), + } + + +class DocumentPlaceForm(forms.ModelForm): + class Meta: + model = DocumentPlaceRelation + fields = ( + "place", + "type", + "notes", + ) + widgets = { + "notes": forms.Textarea(attrs={"rows": 4}), + "place": autocomplete.ModelSelect2(url="entities:place-autocomplete"), + "type": autocomplete.ModelSelect2(), + } diff --git a/geniza/corpus/management/commands/add_cat_numbers.py b/geniza/corpus/management/commands/add_cat_numbers.py deleted file mode 100644 index 54a6ec243..000000000 --- a/geniza/corpus/management/commands/add_cat_numbers.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Script to add catalog numbers to historical shelfmarks for some Bodleian -records. This is a one-time script and should be removed after the import is -completed in production. - -Intended to be run manually from the shell as follows: -./manage.py add_cat_numbers historical_shelfmarks.csv -""" - -import csv -import re - -from django.core.management.base import BaseCommand - -from geniza.corpus.models import Fragment - - -class Command(BaseCommand): - """Import catalog numbers into Fragment records in the local database.""" - - bodl_regex = r"^Bodl\. MS Heb\. (?P[A-Za-z]) (?P\d+)," - - def add_arguments(self, parser): - parser.add_argument("path", help="Path to a CSV file") - - def handle(self, *args, **kwargs): - with open(kwargs.get("path"), newline="") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - cat_number = row["catalog no. (Bodl. historical shelfmarks)"] - if cat_number: - try: - frag = Fragment.objects.get(pk=int(row["id"])) - except Fragment.DoesNotExist: - print(f"Error: cannot find fragment with id {row['id']}") - continue - - # Bodl. MS heb. b 12/6 - # --> data migration --> Bodl. MS Heb. b 12, f. 6 - # --> this script --> Bodl. MS Heb. b 12 (Cat. 2875), f. 6 - hist_repl = f"Bodl. MS Heb. \g \g (Cat. 
{cat_number})," - hist = re.sub(self.bodl_regex, hist_repl, frag.old_shelfmarks) - frag.old_shelfmarks = hist - frag.save() diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py index a901224df..7e9e9ecd5 100644 --- a/geniza/corpus/management/commands/escr_alto_to_annotation.py +++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py @@ -8,6 +8,7 @@ from django.db.models import Q from djiffy.models import Canvas, Manifest from eulxml import xmlmap +from parasolr.django.signals import IndexableSignalHandler from geniza.annotations.models import Annotation from geniza.corpus.models import Document @@ -27,7 +28,9 @@ class AltoPolygonalObject(AltoObject): class Line(AltoPolygonalObject): + id = xmlmap.StringField("./@ID") content = xmlmap.StringField("alto:String/@CONTENT") + line_type_id = xmlmap.StringField("./@TAGREFS") class TextBlock(AltoPolygonalObject): @@ -59,6 +62,17 @@ class Command(BaseCommand): # regex pattern for image filenames filename_pattern = r"PGPID_(?P\d+)_(?P[\w\-]+)_(?P\d)\..+" + # tags used for rotated blocks and lines + rotation_tags = [ + "Oblique_45", # 45° + "Vertical_Bottom_Up_90", # 90° + "Oblique_135", # 135° + "Upside_Down", # 180° + "Oblique_225", # 225° + "Vertical_Top_Down_270", # 270° + "Oblique_315", # 315° + ] + def add_arguments(self, parser): # needs xml filenames as input parser.add_argument( @@ -68,10 +82,16 @@ def add_arguments(self, parser): def handle(self, *args, **options): self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) + # store content type pk for logentry + self.anno_contenttype = ContentType.objects.get_for_model(Annotation).pk + # lists for reporting self.document_errors = set() self.canvas_errors = set() + # disconnect solr indexing signals; this script will index annotations manually + IndexableSignalHandler.disconnect() + # process all files for xmlfile in options["alto"]: self.stdout.write("Processing %s" % xmlfile) @@ -128,7 +148,7 @@ def ingest_xml(self, xmlfile): ) # create annotations - for tb in alto.printspace.textblocks: + for tb_idx, tb in enumerate(alto.printspace.textblocks, start=1): block_type = None if tb.block_type_id: # find first tag in tag list whose id matches block type id @@ -136,26 +156,57 @@ def ingest_xml(self, xmlfile): tag = next(tag_matches, None) if tag: block_type = tag.label - # TODO: When implementing line-by-line, use block_type to determine rotation # skip arabic; these are Hebrew script transcriptions if not (block_type and "Arabic" in block_type) and len(tb.lines): # get or create footnote footnote = self.get_footnote(doc) # create annotation and log entry - anno = Annotation.objects.create( - content=self.create_block_annotation(tb, canvas_uri, scale_factor), + block = Annotation.objects.create( + content=self.create_block_annotation( + tb, canvas_uri, scale_factor, block_type, tb_idx + ), footnote=footnote, ) LogEntry.objects.log_action( user_id=self.script_user.pk, - content_type_id=ContentType.objects.get_for_model(Annotation).pk, - object_id=anno.pk, - object_repr=str(anno), - change_message="Imported from eScriptorium HTR ALTO", + content_type_id=self.anno_contenttype, + object_id=block.pk, + object_repr=str(block), + change_message="Imported block from eScriptorium HTR ALTO", action_flag=ADDITION, ) + # create line annotations from lines and link to block + for i, line in enumerate(tb.lines, start=1): + line_type = None + if line.line_type_id: + # find first tag in tag list whose id 
matches line type id + tag_matches = filter( + lambda t: t.id == line.line_type_id, alto.tags + ) + tag = next(tag_matches, None) + if tag: + line_type = tag + line_anno = Annotation.objects.create( + content=self.create_line_annotation( + line, block, scale_factor, line_type, order=i + ), + block=block, + footnote=footnote, + ) + LogEntry.objects.log_action( + user_id=self.script_user.pk, + content_type_id=self.anno_contenttype, + object_id=line_anno.pk, + object_repr=str(line_anno), + change_message="Imported line from eScriptorium HTR ALTO", + action_flag=ADDITION, + ) + + # index after all blocks added + doc.index() + def get_manifest(self, document, short_id, filename): """Attempt to get the manifest using the supplied short id; fallback to first manifest, or return None if there are none on the document""" @@ -232,26 +283,16 @@ def scale_polygon(self, polygon, scale): # return as string for use in svg polygon element return " ".join([str(point) for point in scaled_points]) - def create_block_annotation(self, textblock, canvas_uri, scale_factor): + def create_block_annotation( + self, textblock, canvas_uri, scale_factor, block_type, order + ): """Produce a valid IIIF annotation with the block-level content and geometry, linked to the IIIF canvas by URI""" - # lines to HTML list - block_text = "
<ol>\n" - for line in textblock.lines: - block_text += f"
<li>{line.content}</li>\n" - block_text += "</ol>
    " - # create IIIF annotation anno_content = {} - anno_content["body"] = [ - { - "TextInput": "rtl", - "format": "text/html", - "type": "TextualBody", - "value": block_text, - } - ] + anno_content["schema:position"] = order + anno_content["textGranularity"] = "block" anno_content["motivation"] = ["sc:supplementing", "transcribing"] anno_content["target"] = { "source": { @@ -259,6 +300,15 @@ def create_block_annotation(self, textblock, canvas_uri, scale_factor): "type": "Canvas", }, } + if block_type: + anno_content["body"] = [ + { + "label": block_type, + } + ] + if block_type in self.rotation_tags: + # add rotation tag as a CSS class to this block + anno_content["target"]["styleClass"] = block_type # add selector if textblock.polygon: @@ -278,3 +328,40 @@ def create_block_annotation(self, textblock, canvas_uri, scale_factor): } return anno_content + + def create_line_annotation(self, line, block_anno, scale_factor, line_type, order): + # create IIIF annotation + anno_content = {} + anno_content["schema:position"] = order + anno_content["body"] = [ + { + "TextInput": "rtl", + "format": "text/html", + "type": "TextualBody", + "value": line.content, + } + ] + anno_content["textGranularity"] = "line" + anno_content["motivation"] = block_anno.content["motivation"] + anno_content["target"] = {"source": block_anno.content["target"]["source"]} + if line_type and line_type in self.rotation_tags: + # add rotation tag as a CSS class to this line (sometimes differs from block) + anno_content["target"]["styleClass"] = line_type + elif "styleClass" in block_anno.content["target"]: + # if block has rotation but line doesn't, use block's rotation + anno_content["target"]["styleClass"] = block_anno.content["target"][ + "styleClass" + ] + + # add selector + if line.polygon: + # scale polygon points and use SvgSelector + points = self.scale_polygon(line.polygon, scale_factor) + anno_content["target"]["selector"] = { + "type": "SvgSelector", + "value": f'', + } + else: + self.stdout.write(f"No line-level geometry available for {line.id}") + + return anno_content diff --git a/geniza/corpus/management/commands/sync_transcriptions.py b/geniza/corpus/management/commands/sync_transcriptions.py deleted file mode 100644 index 7a307a166..000000000 --- a/geniza/corpus/management/commands/sync_transcriptions.py +++ /dev/null @@ -1,393 +0,0 @@ -""" -Script to synchronize transcription content from PGP v3 TEI files -to an _interim_ html format in the database. - -The script checks out and updates the transcription files from the -git repository, and then loops through all xml files and -identifies the document and footnote to update, if possible. 
- -""" - -import glob -import os.path -from collections import defaultdict - -from django.conf import settings -from django.contrib.admin.models import ADDITION, CHANGE, LogEntry -from django.contrib.auth.models import User -from django.contrib.contenttypes.models import ContentType -from django.core.management.base import BaseCommand, CommandError -from django.db import models -from django.urls import reverse -from eulxml import xmlmap -from git import Repo - -from geniza.common.utils import absolutize_url -from geniza.corpus.models import Document -from geniza.corpus.tei_transcriptions import GenizaTei -from geniza.footnotes.models import Footnote, Source - - -class Command(BaseCommand): - """Synchronize TEI transcriptions to edition footnote content""" - - v_normal = 1 # default verbosity - - def add_arguments(self, parser): - parser.add_argument( - "-n", - "--noact", - action="store_true", - help="Do not save changes to the database", - ) - parser.add_argument("files", nargs="*", help="Only sync the specified files.") - - # dict of footnotes that have been updated with list of TEI files, to track/prevent - # TEI files resolving incorrectly to the same edition - footnotes_updated = defaultdict(list) - - # keep track of document ids with multiple digitized editions (likely merged records/joins) - multiedition_docs = set() - - def handle(self, *args, **options): - # get settings for remote git repository url and local path - gitrepo_url = settings.TEI_TRANSCRIPTIONS_GITREPO - gitrepo_path = settings.TEI_TRANSCRIPTIONS_LOCAL_PATH - - self.verbosity = options["verbosity"] - self.noact_mode = options["noact"] - - # make sure we have latest tei content from git repository - self.sync_git(gitrepo_url, gitrepo_path) - - if not self.noact_mode: - # get content type and user for log entries, unless in no-act mode - self.footnote_contenttype = ContentType.objects.get_for_model(Footnote) - self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) - - self.stats = defaultdict(int) - # after creating missing goitein unpublished edition notes, these will not be created again - self.stats["footnote_created"] = 0 - # duplicates might not always happen - self.stats["duplicate_footnote"] = 0 - # updates should not happen after initial sync when there are no TEI changes - self.stats["footnote_updated"] = 0 - # empty tei may not happen when running on a subset - self.stats["empty_tei"] = 0 - self.stats["document_not_found"] = 0 - self.stats["joins"] = 0 - self.stats["no_edition"] = 0 - self.stats["one_edition"] = 0 - self.stats["multiple_editions_with_content"] = 0 - # keep track of document ids with multiple digitized editions (likely merged records/joins) - self.multiedition_docs = set() - - # iterate through all tei files in the repository OR specified files - xmlfiles = options["files"] or glob.iglob(os.path.join(gitrepo_path, "*.xml")) - for xmlfile in xmlfiles: - self.stats["xml"] += 1 - xmlfile_basename = os.path.basename(xmlfile) - - tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei) - # some files are stubs with no content - # check if the tei is ok to proceed; (e.g., empty or only translation content) - # if empty, report and skip - if not self.check_tei(tei, xmlfile): - continue - - # get the document for the file based on id / old id - doc = self.get_pgp_document(xmlfile_basename) - # if document was not found, skip - if not doc: - continue - - if doc.fragments.count() > 1: - self.stats["joins"] += 1 - - footnote = self.get_edition_footnote(doc, tei, xmlfile) - # if we 
identified an appropriate footnote, update it - if footnote: - # if this footnote has already been chosen in the current script run, don't update again - if self.footnotes_updated[footnote.pk]: - self.stderr.write( - "Footnote %s (PGPID %s) already updated with %s; not overwriting with %s" - % ( - footnote.pk, - doc.pk, - ";".join(self.footnotes_updated[footnote.pk]), - xmlfile, - ) - ) - self.stats["duplicate_footnote"] += 1 - else: - self.footnotes_updated[footnote.pk].append(xmlfile) - - # convert into html, return in a list of blocks per inferred page/image - html_pages = tei.text_to_html() - text = tei.text_to_plaintext() - - # if no html was generated, stop processing - if not html_pages: - if self.verbosity >= self.v_normal: - self.stderr.write("No html generated for %s" % doc.id) - continue - - html = {} - # assign each page of html to a canvas based on sequence, - # skipping any non-document images - for i, image in enumerate(doc.iiif_images(filter_side=True)): - # stop iterating through images when we run out of pages - if not html_pages: - break - # pop the first page of html off the list - # and assign to the image canvas uri - html[image["canvas"]] = html_pages.pop(0) - - # if there are any html pages left - # (either document does not have any iiif images, or not all images) - # generate local canvas uris and attach transcription content - if html_pages: - # document manifest url is /documents/pgpid/iiif/manifest/ - # create canvas uris parallel to that - canvas_base_uri = "%siiif/canvas/" % doc.permalink - # iterate through any remaining pages and assign to local canvas uris - for i, html_chunk in enumerate(html_pages): - canvas_uri = "%s%d/" % (canvas_base_uri, i) - html[canvas_uri] = html_chunk - - footnote.content = {"html": html, "text": text} - if footnote.has_changed("content"): - # don't actually save in --noact mode - if not self.noact_mode: - footnote.save() - # create a log entry to document the change - self.log_footnote_update( - footnote, os.path.basename(xmlfile) - ) - - # count as a change whether in no-act mode or not - self.stats["footnote_updated"] += 1 - - # NOTE: in *one* case there is a TEI file with translation content and - # no transcription; will get reported as empty, but that's ok — it's out of scope - # for this script and should be handled elsewhere. - - # report on what was done - # include total number of transcription files, - # documents with transcriptions, number of fragments, and how how many joins - self.stats["multi_edition_docs"] = len(self.multiedition_docs) - self.stdout.write( - """Processed {xml:,} TEI/XML files; skipped {empty_tei:,} TEI files with no transcription content. -{document_not_found:,} documents not found in database. -{joins:,} documents with multiple fragments. -{multiple_editions:,} documents with multiple editions; {multiple_editions_with_content} multiple editions with content ({multi_edition_docs} unique documents). -{no_edition:,} documents with no edition. -{one_edition:,} documents with one edition. -Updated {footnote_updated:,} footnotes (created {footnote_created:,}; skipped overwriting {duplicate_footnote}). -""".format( - **self.stats - ) - ) - - for footnote_id, xmlfiles in self.footnotes_updated.items(): - if len(xmlfiles) > 1: - self.stderr.write( - "Footnote pk %s updated more than once: %s" - % (footnote_id, ";".join(xmlfiles)) - ) - - def check_tei(self, tei, xmlfile): - """Check TEI and report if it is empty, labels only, or has no content. 
- - :param tei: xmlmap tei instance to check; :class:`~geniza.corpus.tei_transcriptions.GenizaTei` - :param xmlfile: xml filename, for reporting - :returns: True if check passes; False if the TEI should be skipped. - :rtype: bool - """ - # some files are stubs with no content - # check if there is no text content; report and return true or false - if tei.no_content(): - if self.verbosity >= self.v_normal: - self.stdout.write("%s has no text content, skipping" % xmlfile) - self.stats["empty_tei"] += 1 - return False - elif tei.labels_only(): - if self.verbosity >= self.v_normal: - self.stdout.write( - "%s has labels only, no other text content; skipping" % xmlfile - ) - self.stats["empty_tei"] += 1 - return False - elif not tei.text.lines: - self.stdout.write("%s has no lines (translation?), skipping" % xmlfile) - self.stats["empty_tei"] += 1 - return False - - return True - - def get_pgp_document(self, xmlfile_basename): - """Find the PGP document for the specified TEI file, based on filename, - if possible. - - :returns: instance of :class:`~geniza.corpus.models.Document` or None if not found - """ - - # get the document id from the filename (####.xml) - pgpid = os.path.splitext(xmlfile_basename)[0] - # in ONE case there is a duplicate id with b suffix on the second - try: - pgpid = int(pgpid.strip("b")) - except ValueError: - if self.verbosity >= self.v_normal: - self.stderr.write("Failed to generate integer PGPID from %s" % pgpid) - return - # can we rely on pgpid from xml? - # but in some cases, it looks like a join 12047 + 12351 - - # find the document in the database - try: - Document.objects.get_by_any_pgpid(pgpid) - except Document.DoesNotExist: - self.stats["document_not_found"] += 1 - if self.verbosity >= self.v_normal: - self.stdout.write("Document %s not found in database" % pgpid) - return - - def get_footnote_editions(self, doc): - """Get all edition footnotes of a document; used by :meth:`get_edition_footnote`, - extend to include digital editions in tei to annotation script.""" - return doc.footnotes.editions() - - def get_edition_footnote(self, doc, tei, filename): - """identify the edition footnote to be updated""" - # get editions for this document - editions = self.get_footnote_editions(doc) - - if editions.count() > 1: - self.stats["multiple_editions"] += 1 - - # when there are multiple, try to identify correct edition by author names - footnote = self.choose_edition_by_authors(tei, editions, doc) - # if we got a match, use it - if footnote: - return footnote - - # if not, limit to editions with content and try again - editions_with_content = editions.filter(content__isnull=False) - footnote = self.choose_edition_by_authors(tei, editions_with_content, doc) - if footnote: - return footnote - - # if not, fallback to first edition - if editions_with_content.count() == 1: - self.stats["multiple_editions_with_content"] += 1 - self.multiedition_docs.add(doc.id) - - # if there was only one, assume it's the one to update - # NOTE: this is potentially wrong! 
- return editions_with_content.first() - - if not editions.exists(): - # no editions found; check if we can create a goitein unpublished edition footnote - footnote = self.is_it_goitein(tei, doc) - if footnote: - return footnote - - self.stats["no_edition"] += 1 - if self.verbosity > self.v_normal: - self.stdout.write("No edition found for %s" % filename) - for line in tei.source: - self.stdout.write("\t%s" % line) - else: - self.stats["one_edition"] += 1 - # if only one edition, update the transciption content there - return editions.first() - - def choose_edition_by_authors(self, tei, editions, doc): - """Try to choose correct edition from a list based on author names; - based on structured author names in the TEI""" - if tei.source_authors: - tei_authors = set(tei.source_authors) - author_matches = [] - for ed in editions: - ed_authors = set([auth.last_name for auth in ed.source.authors.all()]) - if ed_authors == tei_authors: - author_matches.append(ed) - - # if we got exactly one match, use that edition - if len(author_matches) == 1: - return author_matches[0] - - # if there were *no* author matches, see if we can create a goitein unpublished edition note - if not author_matches: - return self.is_it_goitein(tei, doc) - - def is_it_goitein(self, tei, doc): - """Check if a TEI document is a Goitein edition. If no edition exists - and we can identify based on the TEI as a Goitein unpublished edition, - then create a new footnote.""" - source_info = str(tei.source[0]).lower() - if "goitein" in source_info and ( - "unpublished editions" in source_info or "typed texts" in source_info - ): - if not self.noact_mode: - footnote = self.create_goitein_footnote(doc) - if footnote: - self.stats["footnote_created"] += 1 - return footnote - - def create_goitein_footnote(self, doc): - """Create a new footnote for a Goitein unpublished edition""" - source = Source.objects.filter( - authors__last_name="Goitein", - title_en="unpublished editions", - source_type__type="Unpublished", - volume__startswith=Source.get_volume_from_shelfmark(doc.shelfmark), - ).first() - if not source: - self.stderr.write( - "Error finding Goitein unpublished editions source for %s" - % doc.shelfmark - ) - return - - footnote = Footnote.objects.create( - source=source, - content_object=doc, - doc_relation=Footnote.EDITION, - ) - LogEntry.objects.log_action( - user_id=self.script_user.id, - content_type_id=self.footnote_contenttype.pk, - object_id=footnote.pk, - object_repr=str(footnote), - change_message="Created Goitein unpublished editions footnote to sync transcription", - action_flag=ADDITION, - ) - - return footnote - - def sync_git(self, gitrepo_url, local_path): - """ensure git repository has been cloned and content is up to date""" - - # if directory does not yet exist, clone repository - if not os.path.isdir(local_path): - if self.verbosity >= self.v_normal: - self.stdout.write( - "Cloning TEI transcriptions repository to %s" % local_path - ) - Repo.clone_from(url=gitrepo_url, to_path=local_path) - else: - # pull any changes since the last run - Repo(local_path).remotes.origin.pull() - - def log_footnote_update(self, footnote, xmlfile): - """create a log entry for a footnote that has been updated""" - LogEntry.objects.log_action( - user_id=self.script_user.id, - content_type_id=self.footnote_contenttype.pk, - object_id=footnote.pk, - object_repr=str(footnote), - change_message="Updated transcription from TEI file %s" % xmlfile, - action_flag=CHANGE, - ) diff --git 
a/geniza/corpus/management/commands/tei_to_annotation.py b/geniza/corpus/management/commands/tei_to_annotation.py deleted file mode 100644 index 8827da3e4..000000000 --- a/geniza/corpus/management/commands/tei_to_annotation.py +++ /dev/null @@ -1,529 +0,0 @@ -""" -Script to convert transcription content from PGP v3 TEI files -to IIIF annotations in the configured annotation server. - -""" - -import glob -import os.path -import unicodedata -from collections import defaultdict -from datetime import datetime -from functools import cached_property - -import requests -from addict import Dict -from django.conf import settings -from django.contrib.admin.models import ADDITION, CHANGE, DELETION, LogEntry -from django.contrib.auth.models import User -from django.contrib.contenttypes.models import ContentType -from django.core.management.base import BaseCommand, CommandError -from django.db import models -from django.template.defaultfilters import pluralize -from django.urls import reverse -from django.utils import timezone -from eulxml import xmlmap -from git import Repo -from parasolr.django.signals import IndexableSignalHandler -from rich.progress import MofNCompleteColumn, Progress - -from geniza.annotations.models import Annotation -from geniza.common.utils import absolutize_url -from geniza.corpus.annotation_export import AnnotationExporter -from geniza.corpus.management.commands import sync_transcriptions -from geniza.corpus.models import Document -from geniza.corpus.tei_transcriptions import GenizaTei -from geniza.footnotes.models import Footnote - - -class Command(sync_transcriptions.Command): - """Synchronize TEI transcriptions to edition footnote content""" - - v_normal = 1 # default verbosity - - missing_footnotes = [] - - normalized_unicode = set() - - document_not_found = [] - - def add_arguments(self, parser): - parser.add_argument( - "files", nargs="*", help="Only convert the specified files." - ) - - def handle(self, *args, **options): - # get settings for remote git repository url and local path - gitrepo_url = settings.TEI_TRANSCRIPTIONS_GITREPO - gitrepo_path = settings.TEI_TRANSCRIPTIONS_LOCAL_PATH - - self.verbosity = options["verbosity"] - # get content type and script user for log entries - self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME) - - # disconnect solr indexing signals - IndexableSignalHandler.disconnect() - - # make sure we have latest tei content from git repository - # (inherited from sync transcriptions command) - self.sync_git(gitrepo_url, gitrepo_path) - # initialize local git repo client - self.tei_gitrepo = Repo(gitrepo_path) - - self.stats = defaultdict(int) - - xmlfiles = options["files"] or glob.glob(os.path.join(gitrepo_path, "*.xml")) - script_run_start = timezone.now() - - self.stdout.write("Migrating %d TEI files" % len(xmlfiles)) - - # when running on all files (i.e., specific files not specified), - # clear all annotations from the database before running the migration - # NOTE: could make this optional behavior, but it probably only - # impacts development and not the real migration? 
- if not options["files"]: - # cheating a little here, but much faster to clear all at once - # instead of searching and deleting one at a time - all_annos = Annotation.objects.all() - self.stdout.write("Clearing %d annotations" % all_annos.count()) - all_annos.delete() - - # initialize annotation exporter; don't push changes until the end; - # commit message will be overridden per export to docment TEI file - self.anno_exporter = AnnotationExporter( - stdout=self.stdout, - verbosity=options["verbosity"], - push_changes=False, - commit_msg="PGP transcription export from TEI migration", - ) - self.anno_exporter.setup_repo() - - # use rich progressbar without context manager - progress = Progress( - MofNCompleteColumn(), *Progress.get_default_columns(), expand=True - ) - progress.start() - task = progress.add_task("Migrating...", total=len(xmlfiles)) - - # iterate through tei files to be migrated - for xmlfile in xmlfiles: - self.stats["xml"] += 1 - # update progress at the beginning instead of end, - # since some records are skipped - progress.update(task, advance=1, update=True) - - if self.verbosity >= self.v_normal: - self.stdout.write(xmlfile) - - xmlfile_basename = os.path.basename(xmlfile) - - tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei) - # some files are stubs with no content - # check if the tei is ok to proceed; (e.g., empty or only translation content) - # if empty, report and skip - if not self.check_tei(tei, xmlfile): - self.stdout.write( - self.style.WARNING( - "No transcription content in %s; skipping" % xmlfile - ) - ) - continue - # get the document for the file based on id / old id - doc = self.get_pgp_document(xmlfile_basename) - # if document was not found, skip - if not doc: - self.stdout.write( - self.style.WARNING("Document not found for %s; skipping" % xmlfile) - ) - self.document_not_found.append(xmlfile) - continue - # found the document - if self.verbosity >= self.v_normal: - self.stdout.write(str(doc)) - - # get the footnote for this file & doc - footnote = self.get_edition_footnote(doc, tei, xmlfile) - # if no footnote, skip for now - # (some are missing, but will handle with data work) - if not footnote: - self.stdout.write( - self.style.ERROR( - "footnote not found for %s / %s; skipping" % (xmlfile, doc.pk) - ) - ) - self.missing_footnotes.append(xmlfile) - continue - footnote = self.migrate_footnote(footnote, doc) - - # if there is a single primary language, use the iso code if it is set - lang_code = None - if doc.languages.count() == 1: - lang_code = doc.languages.first().iso_code - - # get html blocks from the tei - html_blocks = tei.text_to_html(block_format=True) - - # get canvas objects for the images in order; skip any non-document images - iiif_canvases = list(doc.iiif_images(filter_side=True).keys()) - # determine the number of canvases needed based on labels - # that indicate new pages - # check and count any after the first; always need at least 1 canvas - num_canvases = 1 + len( - [ - b["label"] - for b in html_blocks[1:] - if tei.label_indicates_new_page(b["label"]) - ] - ) - # in verbose mode report on available/needed canvases - if self.verbosity > self.v_normal: - self.stdout.write( - "%d iiif canvases; need %d canvases for %d blocks" - % (len(iiif_canvases), num_canvases, len(html_blocks)) - ) - # if we need more canvases than we have available, - # generate local canvas ids - if num_canvases > len(iiif_canvases): - # document manifest url is /documents/pgpid/iiif/manifest/ - # create canvas uris parallel to that - 
- # if we need more canvases than we have available, - # generate local canvas ids - if num_canvases > len(iiif_canvases): - # document manifest url is /documents/pgpid/iiif/manifest/ - # create canvas uris parallel to that - canvas_base_uri = doc.manifest_uri.replace("manifest", "canvas") - for i in range(num_canvases - len(iiif_canvases)): - canvas_uri = "%s%d/" % (canvas_base_uri, i + 1) - iiif_canvases.append(canvas_uri) - - # NOTE: pgpid 1390 folio example; each chunk should be half of the canvas - # (probably should be handled manually) - # if len(html_chunks) > len(iiif_canvases): - # self.stdout.write( - # "%s has more html chunks than canvases; skipping" % xmlfile - # ) - # continue - - # start attaching to first canvas; increment based on chunk label - canvas_index = 0 - - # if specific files were specified, remove annotations - # just for those documents & sources - if options["files"]: - # remove all existing annotations associated with this - # document and source so we can reimport as needed - existing_annos = Annotation.objects.filter( - footnote__source=footnote.source, - footnote__content_object=doc, - created__lt=script_run_start, - ) - # NOTE: this is problematic for transcriptions currently - # split across two TEI files... take care when running - # on individual or groups of files - if existing_annos: - print( - "Removing %s pre-existing annotation%s for %s on %s " - % ( - len(existing_annos), - pluralize(existing_annos), - footnote.source, - doc.manifest_uri, - ) - ) - # not creating log entries for deletion, but - # this should probably only come up in dev runs - existing_annos.delete() - - for i, block in enumerate(html_blocks): - # if this is not the first block and the label suggests new image, - # increment canvas index - if i != 0 and tei.label_indicates_new_page(block["label"]): - canvas_index += 1 - - anno = new_transcription_annotation() - # get the canvas uri for this section of text - annotation_target = iiif_canvases[canvas_index] - anno.target.source.id = annotation_target - - # apply to the full canvas using % notation - # (using nearly full canvas to make it easier to edit zones) - anno.target.selector.value = "xywh=percent:1,1,98,98" - # anno.selector.value = "%s#xywh=pixel:0,0,%s,%s" % (annotation_target, canvas.width, canvas.height) - - # add html and optional label to annotation text body - # NOTE: not specifying language in html here because we - # handle it in wrapping template code based on db language - - html = tei.lines_to_html(block["lines"]) - if not unicodedata.is_normalized("NFC", html): - self.normalized_unicode.add(xmlfile) - html = unicodedata.normalize("NFC", html) - anno.body[0].value = html - - if block["label"]: - # check if label text requires normalization - if not unicodedata.is_normalized("NFC", block["label"]): - self.normalized_unicode.add(xmlfile) - block["label"] = unicodedata.normalize("NFC", block["label"]) - anno.body[0].label = block["label"] - - anno["schema:position"] = i + 1 - # print(anno) # can print for debugging - - # create database annotation - db_anno = Annotation() - db_anno.set_content(dict(anno)) - # link to digital edition footnote - db_anno.footnote = footnote - db_anno.save() - # log entry to document annotation creation - self.log_addition(db_anno, "Migrated from TEI transcription") - self.stats["created"] += 1 - - # export migrated transcription to backup - self.export_transcription(doc, xmlfile_basename) - - progress.refresh() - progress.stop() - - print( - "Processed %(xml)d TEI file(s). \nCreated %(created)d annotation(s)."
% self.stats - ) - - # push all changes from migration to github - self.anno_exporter.sync_github() - - # report on missing footnotes - if self.missing_footnotes: - print( - "Could not find footnotes for %s document%s:" - % (len(self.missing_footnotes), pluralize(self.missing_footnotes)) - ) - for xmlfile in self.missing_footnotes: - print("\t%s" % xmlfile) - - # report on unicode normalization - if self.normalized_unicode: - print( - "Normalized unicode for %s document%s:" - % (len(self.normalized_unicode), pluralize(self.normalized_unicode)) - ) - for xmlfile in self.normalized_unicode: - print("\t%s" % xmlfile) - - if self.document_not_found: - print( - "Document not found for %s TEI file%s:" - % (len(self.document_not_found), pluralize(self.document_not_found)) - ) - for xmlfile in self.document_not_found: - print("\t%s" % xmlfile) - - # report on edition footnotes that still have content - # (skip when running on specified files) - if not options["files"]: - self.check_unmigrated_footnotes() - - def get_footnote_editions(self, doc): - # extend to return digital edition or edition - # (digital edition if from previous run of this script) - return doc.footnotes.filter( - models.Q(doc_relation__contains=Footnote.EDITION) - | models.Q(doc_relation__contains=Footnote.DIGITAL_EDITION) - ) - - # we shouldn't be creating new footnotes at this point... - # override method from sync transcriptions to ensure we don't - def is_it_goitein(self, tei, doc): - return None - - def migrate_footnote(self, footnote, document): - # convert existing edition footnote to digital edition - # OR make a new one if the existing footnote has other information - - # if footnote is already a digital edition, nothing to be done - # (already migrated in a previous run) - if footnote.doc_relation == Footnote.DIGITAL_EDITION: - return footnote - - # check if a digital edition footnote for this document+source exists, - # so we can avoid creating a duplicate - diged_footnote = document.footnotes.filter( - doc_relation=Footnote.DIGITAL_EDITION, source=footnote.source - ).first() - - # if footnote has other types or a url, we should preserve it - if ( - set(footnote.doc_relation).intersection( - {Footnote.TRANSLATION, Footnote.DISCUSSION} - ) - or footnote.url - or footnote.location - ): - # remove interim transcription content - if footnote.content: - footnote.content = None - footnote.save() - - # if a digital edition footnote for this document+source exists, - # use that instead of creating a duplicate - if diged_footnote: - return diged_footnote - - # otherwise, make a new one - new_footnote = Footnote( - doc_relation=Footnote.DIGITAL_EDITION, source=footnote.source - ) - # trying to set from related object footnote.document errors - new_footnote.content_object = document - new_footnote.save() - # log footnote creation and return - self.log_addition( - new_footnote, - "Created new footnote for migrated digital edition", - ) - return new_footnote - - # when there is no additional information on the footnote - else: - # if a digital edition already exists, remove this one - if diged_footnote: - # log deletion and return existing diged - self.log_deletion(footnote, "Removing redundant edition footnote") - footnote.delete() - return diged_footnote - - # otherwise, convert edition to digital edition - footnote.doc_relation = Footnote.DIGITAL_EDITION - footnote.content = None -
footnote.save() - # log footnote change and return - self.log_change(footnote, "Migrated footnote to digital edition") - return footnote - - # lookup to map tei git repo usernames to pgp db username for co-author string - teicontributor_to_username = { - "Alan Elbaum": "ae5677", - # multiple Bens should all map to same user - "Ben": "benj", - "Ben Johnston": "benj", - "benj@princeton.edu": "benj", - "benjohnsto": "benj", - # no github account that I can find; just use the name - "Brendan Goldman": "Brendan Goldman", - "Jessica Parker": "jp0630", - "Ksenia Ryzhova": "kryzhova", - "Rachel Richman": "rrichman", - "mrustow": "mrustow", - # multiple RSKs also... - "Rebecca Sutton Koeser": "rkoeser", - "rlskoeser": "rkoeser", - } - - @cached_property - def tei_contrib_users(self): - # retrieve users from database based on known tei contributor usernames, - # and return as a dict for lookup by username - tei_users = User.objects.filter( - username__in=set(self.teicontributor_to_username.values()) - ) - return {u.username: u for u in tei_users} - - def export_transcription(self, document, xmlfile): - # get contributors and export to git backup - - # get the unique list of all contributors to this file - commits = list(self.tei_gitrepo.iter_commits("master", paths=xmlfile)) - contributors = set([c.author.name for c in commits]) - # convert bitbucket users to unique set of pgp users - contrib_usernames = set( - self.teicontributor_to_username[c] for c in contributors - ) - # now get actual users for those usernames... - contrib_users = [self.tei_contrib_users.get(u, u) for u in contrib_usernames] - - # export transcription for the specified document, - # documenting the users who modified the TEI file - self.anno_exporter.export( - pgpids=[document.pk], - modifying_users=contrib_users, - commit_msg="Transcription migrated from TEI %s" % xmlfile, - ) - - def log_addition(self, obj, log_message): - "create a log entry documenting object creation" - return self.log_entry(obj, log_message, ADDITION) - - def log_change(self, obj, log_message): - "create a log entry documenting object change" - return self.log_entry(obj, log_message, CHANGE) - - def log_deletion(self, obj, log_message): - "create a log entry documenting object deletion" - return self.log_entry(obj, log_message, DELETION) - - def check_unmigrated_footnotes(self): - unmigrated_footnotes = Footnote.objects.filter( - doc_relation__contains=Footnote.EDITION, content__isnull=False - ) - if unmigrated_footnotes.exists(): - self.stdout.write( - "\n%d unmigrated footnote%s" - % (unmigrated_footnotes.count(), pluralize(unmigrated_footnotes)) - ) - for fn in unmigrated_footnotes: - # provide admin link to make it easier to investigate - admin_url = absolutize_url( - reverse("admin:footnotes_footnote_change", args=(fn.id,)) - ) - print("\t%s\t%s" % (fn, admin_url)) - - _content_types = {} - - def get_content_type(self, obj): - # lookup and cache content type for model - model_class = obj.__class__ - if model_class not in self._content_types: - self._content_types[model_class] = ContentType.objects.get_for_model( - model_class - ) - return self._content_types[model_class] - - def log_entry(self, obj, log_message, log_action): - "create a log entry documenting object creation/change/deletion" - # for this migration, we can assume user is always script user - content_type = self.get_content_type(obj) - return LogEntry.objects.log_action( - user_id=self.script_user.id, - content_type_id=content_type.pk, - object_id=obj.pk, - object_repr=str(obj), - change_message=log_message, - action_flag=log_action, - ) - - -def new_transcription_annotation(): - # initialize a new annotation dict object with all the defaults set - - anno = Dict() - setattr(anno, "@context", "http://www.w3.org/ns/anno.jsonld") - anno.type = "Annotation" - anno.body = [Dict()] - anno.body[0].type = "TextualBody" - # purpose on body is only needed if more than one body - # (e.g., transcription + tags in the same annotation) - # anno.body[0].purpose = "transcribing" - anno.body[0].format = "text/html" - # explicitly indicate text direction; all transcriptions are RTL - anno.body[0].TextInput = "rtl" - # supplement rather than painting over the image - # multiple motivations are allowed; add transcribing as secondary motivation - # (could use edm:transcribing from Europeana Data Model, but not sure - # how to declare edm namespace) - anno.motivation = ["sc:supplementing", "transcribing"] - - anno.target.source.type = "Canvas" - anno.target.selector.type = "FragmentSelector" - anno.target.selector.conformsTo = "http://www.w3.org/TR/media-frags/" - - return anno
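For orientation, the new_transcription_annotation helper above serializes (via dict(anno)) to roughly the following structure. This is a hand-written sketch based on the defaults the helper sets, with an illustrative canvas URI, label, and body value rather than captured output:

    # Approximate shape of one migrated annotation (illustrative values only):
    example_annotation = {
        "@context": "http://www.w3.org/ns/anno.jsonld",
        "type": "Annotation",
        "motivation": ["sc:supplementing", "transcribing"],
        "body": [
            {
                "type": "TextualBody",
                "format": "text/html",
                "TextInput": "rtl",
                "label": "Recto",  # set per block when a label is present
                "value": "<ol><li>first transcribed line</li></ol>",
            }
        ],
        "target": {
            "source": {
                # hypothetical canvas URI, parallel to the document manifest URI
                "id": "https://example.com/documents/1234/iiif/canvas/1/",
                "type": "Canvas",
            },
            "selector": {
                "type": "FragmentSelector",
                "conformsTo": "http://www.w3.org/TR/media-frags/",
                "value": "xywh=percent:1,1,98,98",
            },
        },
        "schema:position": 1,  # set per block during migration
    }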
diff --git a/geniza/corpus/migrations/0045_fragment_provenance.py b/geniza/corpus/migrations/0045_fragment_provenance.py new file mode 100644 index 000000000..aaf8fbbb5 --- /dev/null +++ b/geniza/corpus/migrations/0045_fragment_provenance.py @@ -0,0 +1,20 @@ +# Generated by Django 3.2.23 on 2024-03-05 21:07 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("corpus", "0044_populate_fragment_old_shelfmark"), + ] + + operations = [ + migrations.AddField( + model_name="fragment", + name="provenance", + field=models.TextField( + blank=True, + help_text="The origin and acquisition history of this fragment.", + ), + ), + ] diff --git a/geniza/corpus/migrations/0046_document_events.py b/geniza/corpus/migrations/0046_document_events.py new file mode 100644 index 000000000..a0e3fb69a --- /dev/null +++ b/geniza/corpus/migrations/0046_document_events.py @@ -0,0 +1,52 @@ +# Generated by Django 3.2.23 on 2024-03-21 16:41 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("entities", "0021_event"), + ("corpus", "0045_fragment_provenance"), + ] + + operations = [ + migrations.CreateModel( + name="DocumentEventRelation", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("notes", models.TextField(blank=True)), + ( + "document", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + to="corpus.document", + ), + ), + ( + "event", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="entities.event" + ), + ), + ], + ), + migrations.AddField( + model_name="document", + name="events", + field=models.ManyToManyField( + related_name="documents", + through="corpus.DocumentEventRelation", + to="entities.Event", + verbose_name="Related Events", + ), + ), + ] diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py index c0588d606..d6b406499 100644 --- a/geniza/corpus/models.py +++ b/geniza/corpus/models.py @@ -44,7 +44,7 @@ ) from geniza.common.utils import absolutize_url from geniza.corpus.annotation_utils import document_id_from_manifest_uri -from geniza.corpus.dates import DocumentDateMixin, PartialDate +from geniza.corpus.dates import DocumentDateMixin, PartialDate, standard_date_display from geniza.corpus.iiif_utils import GenizaManifestImporter, get_iiif_string
from geniza.corpus.solr_queryset import DocumentSolrQuerySet from geniza.footnotes.models import Creator, Footnote @@ -197,6 +197,9 @@ class Fragment(TrackChangesModel): default=False, help_text="True if there are multiple fragments in one shelfmark", ) + provenance = models.TextField( + blank=True, help_text="The origin and acquisition history of this fragment." + ) notes = models.TextField(blank=True) needs_review = models.TextField( blank=True, @@ -316,8 +319,9 @@ def attribution(self): ) @property - def provenance(self): - """Generate a provenance statement for this fragment""" + @admin.display(description="Provenance from IIIF manifest") + def iiif_provenance(self): + """Generate a provenance statement for this fragment from IIIF""" if self.manifest and self.manifest.metadata: return get_iiif_string(self.manifest.metadata.get("Provenance", "")) @@ -558,7 +562,12 @@ class Document(ModelIndexable, DocumentDateMixin): default=PUBLIC, help_text="Decide whether a document should be publicly visible", ) - + events = models.ManyToManyField( + to="entities.Event", + related_name="documents", + verbose_name="Related Events", + through="DocumentEventRelation", + ) footnotes = GenericRelation(Footnote, related_query_name="document") log_entries = GenericRelation(LogEntry, related_query_name="document") @@ -632,6 +641,16 @@ def shelfmark_display(self): associated fragments; uses :attr:`shelfmark_override` if set""" return self.shelfmark_override or self.shelfmark + @property + @admin.display(description="Historical shelfmarks") + def fragment_historical_shelfmarks(self): + """Property to display set of all historical shelfmarks on the document""" + all_textblocks = self.textblock_set.all() + all_fragments = [tb.fragment for tb in all_textblocks] + return "; ".join( + [frag.old_shelfmarks for frag in all_fragments if frag.old_shelfmarks] + ) + @property def collection(self): """collection (abbreviation) for associated fragments""" @@ -943,6 +962,8 @@ def dating_range(self): """ # it is unlikely, but technically possible, that a document could have both on-document # dates and inferred datings, so find the min and max out of all of them. 
+ + # start_date and end_date are PartialDate instances dating_range = [self.start_date or None, self.end_date or None] # bail out if we don't have any inferred datings @@ -951,24 +972,15 @@ # loop through inferred datings to find min and max among all dates (including both # on-document and inferred) - for dating in self.dating_set.all(): + for inferred in self.dating_set.all(): # get start from standardized date range (formatted as "date1/date2" or "date") - split_date = dating.standard_date.split("/") + split_date = inferred.standard_date.split("/") start = PartialDate(split_date[0]) - # use numeric format to compare to current min, replace if smaller - start_numeric = int(start.numeric_format(mode="min")) - min = dating_range[0] - if min is None or start_numeric < int(min.numeric_format(mode="min")): - # store as PartialDate - dating_range[0] = start # get end from standardized date range end = PartialDate(split_date[1]) if len(split_date) > 1 else start - # use numeric format to compare to current max, replace if larger - end_numeric = int(end.numeric_format(mode="max")) - max = dating_range[1] - if max is None or end_numeric > int(max.numeric_format(mode="max")): - # store as PartialDate - dating_range[1] = end + dating_range = PartialDate.get_date_range( + old_range=dating_range, new_range=[start, end] + ) return tuple(dating_range)
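The refactor above hands the min/max bookkeeping to PartialDate.get_date_range. Judging from the inline comparisons it replaces, the helper should behave roughly like this sketch; this is an assumption about its semantics, written as a plain function, not the actual classmethod implementation:

    # Hedged sketch of the range-merging semantics: widen old_range so it also
    # covers new_range, comparing PartialDates by their numeric min/max formats.
    def get_date_range(old_range, new_range):
        start, end = old_range
        new_start, new_end = new_range
        if start is None or int(new_start.numeric_format(mode="min")) < int(
            start.numeric_format(mode="min")
        ):
            start = new_start
        if end is None or int(new_end.numeric_format(mode="max")) > int(
            end.numeric_format(mode="max")
        ):
            end = new_end
        return [start, end]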
"collection_ss": [str(f.collection) for f in fragments], "tags_ss_lower": [t.name for t in self.tags.all()], @@ -1712,3 +1726,19 @@ class Meta: notes = models.TextField( help_text="Optional further details about the rationale", ) + + @property + def standard_date_display(self): + """Standard date in human-readable format for document details pages""" + return standard_date_display(self.standard_date) + + +class DocumentEventRelation(models.Model): + """A relationship between a document and an event""" + + document = models.ForeignKey(Document, on_delete=models.CASCADE) + event = models.ForeignKey("entities.Event", on_delete=models.CASCADE) + notes = models.TextField(blank=True) + + def __str__(self): + return f"Document-Event relation: {self.document} and {self.event}" diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py index 0135860d6..9983ea21c 100644 --- a/geniza/corpus/solr_queryset.py +++ b/geniza/corpus/solr_queryset.py @@ -1,3 +1,4 @@ +import itertools import re from bs4 import BeautifulSoup @@ -12,6 +13,23 @@ def clean_html(html_snippet): """utility method to clean up html, since solr snippets of html content may result in non-valid content""" + + # if this snippet starts with a line that includes a closing but no opening, + # try to append the opening
diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py index 0135860d6..9983ea21c 100644 --- a/geniza/corpus/solr_queryset.py +++ b/geniza/corpus/solr_queryset.py @@ -1,3 +1,4 @@ +import itertools import re from bs4 import BeautifulSoup @@ -12,6 +13,23 @@ def clean_html(html_snippet): """utility method to clean up html, since solr snippets of html content may result in non-valid content""" + + # if this snippet starts with a line that includes a closing </li> but no opening <li>, + # try to append the opening <li> (and an ellipsis to show incompleteness) + incomplete_line = re.match(r"^(?!<li>).*?</li>", html_snippet, flags=re.MULTILINE) + incomplete_line_with_p = re.match(r"^<p>.*?</p>\n</li>", html_snippet) + if incomplete_line or incomplete_line_with_p: + ellipsis = "..." if incomplete_line else "" + line_number = re.search(r'<li value="(\d+)"', html_snippet) + if line_number: + # use the line number of the first displayed numbered line, and subtract 1 + html_snippet = ( + f'<li value="{int(line_number.group(1)) - 1}">{ellipsis}{html_snippet}' + ) + else: + html_snippet = f"<li>{ellipsis}{html_snippet}" + return BeautifulSoup(html_snippet, "html.parser").prettify(formatter="minimal")
@@ -57,6 +75,9 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet): "has_digital_edition": "has_digital_edition_b", "has_digital_translation": "has_digital_translation_b", "has_discussion": "has_discussion_b", + "old_shelfmark": "old_shelfmark_bigram", + "transcription_nostem": "transcription_nostem", + "description_nostem": "description_nostem", } # regex to convert field aliases used in search to actual solr fields @@ -90,7 +111,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet): # beginning/end of the string or after/before a space, and not followed by a # tilde for fuzzy/proximity search (non-greedy to prevent matching the entire # string if there are multiple sets of doublequotes) - re_exact_match = re.compile(r'(?:^|(?<=\s))".+?"(?=\s|$)') + re_exact_match = re.compile(r'(?:^|(?<=\s))".+?"(?!~)(?=\s|$)') # (e.g. pgpid:950 shelfmark:ena => "pgpid_i:950 shelfmark_t:ena") @@ -140,15 +177,25 @@ def _search_term_cleanup(self, search_term): def admin_search(self, search_term): # remove " + " from search string to allow searching on shelfmark joins - return self.search(self.admin_doc_qf).raw_query_parameters( - doc_query=self._search_term_cleanup(search_term) - ) + doc_query = self._search_term_cleanup(search_term) + query_params = {"doc_query": doc_query} + # nested edismax query no longer works since solr 7.2 + # https://solr.apache.org/guide/7_2/solr-upgrade-notes.html#solr-7-2 + if "{!type=edismax" in doc_query: + query_params.update({"uf": "* _query_"}) + + return self.search(self.admin_doc_qf).raw_query_parameters(**query_params) keyword_search_qf = "{!type=edismax qf=$keyword_qf pf=$keyword_pf v=$keyword_query}" def keyword_search(self, search_term): + keyword_query = self._search_term_cleanup(search_term) + query_params = {"keyword_query": keyword_query} + # nested edismax query no longer works since solr 7.2 (see above) + if "{!type=edismax" in keyword_query: + query_params.update({"uf": "* _query_"}) search = self.search(self.keyword_search_qf).raw_query_parameters( - keyword_query=self._search_term_cleanup(search_term) + **query_params ) # if search term cleanup identifies any exact phrase searches, # pass the unmodified search to Solr as a highlighting query, @@ -212,8 +259,17 @@ def get_highlighting(self): """highlight snippets within transcription/translation html may result in invalid tags that will render strangely; clean up the html before returning""" highlights = super().get_highlighting() + is_exact_search = "hl_query" in self.raw_params for doc in highlights.keys(): - if "transcription" in highlights[doc]: + # _nostem fields should take precedence over stemmed fields in the case of an + # exact search; in that case, replace highlights for stemmed fields with nostem + if is_exact_search and "description_nostem" in highlights[doc]: + highlights[doc]["description"] = highlights[doc]["description_nostem"] + if is_exact_search and "transcription_nostem" in highlights[doc]: + highlights[doc]["transcription"] = [ + clean_html(s) for s in highlights[doc]["transcription_nostem"] + ] + elif "transcription" in highlights[doc]: highlights[doc]["transcription"] = [ clean_html(s) for s in highlights[doc]["transcription"] ]
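To illustrate the snippet-repair behavior reconstructed above: a highlight snippet that opens mid-line gains an opening <li> whose value is one less than the first numbered line, plus an ellipsis. A hedged example, with the output shape approximate and shown before BeautifulSoup prettifying:

    # Example: a Solr highlight snippet that starts mid-list-item.
    from geniza.corpus.solr_queryset import clean_html

    snippet = 'tail of a broken line</li> <li value="5">a complete line</li>'
    repaired = clean_html(snippet)
    # expected to be reopened roughly as:
    #   <li value="4">...tail of a broken line</li> <li value="5">a complete line</li>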
xmlmap.StringField("local-name(.)") - lang = xmlmap.StringField("@xml:lang|tei:span/@xml:lang") - number = xmlmap.StringField("@n") - - -class MainText(teimap.TeiDiv): - lines = xmlmap.NodeListField("tei:l|tei:label", GenizaTeiLine) - - -class GenizaTei(teimap.Tei): - # extend eulxml TEI to add mappings for the fields we care about - # NOTE: at least one pgpid is in format ### + ### - pgpid = xmlmap.IntegerField('tei:teiHeader//tei:idno[@type="PGP"]') - # normally main text content is under text/body/div; but at least one document has no div - text = xmlmap.NodeField( - "tei:text/tei:body/tei:div|tei:text/tei:body[not(tei:div)]", MainText - ) - lines = xmlmap.NodeListField("tei:text/tei:body/tei:div/tei:l", GenizaTeiLine) - labels = xmlmap.NodeListField( - "tei:text/tei:body/tei:div/tei:label", GenizaTeiLine - ) # not really a line... - # source description sometimes contains reference to scholarship record - source = xmlmap.NodeListField( - "tei:teiHeader//tei:sourceDesc/tei:msDesc/tei:msContents/tei:p", GenizaTeiLine - ) - # for documents with more than one transcription, authors have been - # tagged with last name in n attribute to allow identifying/differentiating - source_authors = xmlmap.StringListField( - "tei:teiHeader//tei:sourceDesc//tei:author/@n" - ) - - def no_content(self): - return str(self.text).strip() == "" - - # text that generally indicates a new page/image, anywhere in the label - new_page_indicators = [ - "recto", - "verso", - "side ii", - "page b", - "page 2", - "page two", - 'ע"ב', # Hebrew label for page 2 - ] - # text that indicates a new page/image at the start of the label - new_page_start_indicators = ["t-s ", "ts ", "ena ", "moss. "] - - def label_indicates_new_page(self, label): - label = simplify_quotes(label.lower()) - return any( - [side_label in label for side_label in self.new_page_indicators] - ) or any( - label.startswith(start_label) - for start_label in self.new_page_start_indicators - ) - - def labels_only(self): - text_content = str(self.text).strip() - label_content = " ".join([str(label).strip() for label in self.labels]) - return text_content == label_content - - def text_to_html(self, block_format=False): - # convert the TEI text content to basic HTML - blocks = [] - lines = [] - label = [] - # because blocks are indicated by labels without containing elements, - # iterate over all lines and create blocks based on the labels - for line in self.text.lines: - if line.name == "label": - # append current text block if set, and initialize a new one - if lines: - blocks.append( - { - "label": "\n".join(label), - "lines": lines, - # "languages": list(languages), - } - ) - label = [] - lines = [] - - # store the label; sometimes there are two in a row - label.append(str(line)) - - elif line.name == "l": - # use language codes? 
unreliable in the xml - # append tuple of line number, text - # return empty string for line number if no line attribute - lines.append((line.number or "", str(line))) - - # append the last block - if lines: - blocks.append( - { - "label": "\n".join(label), - "lines": lines, - } - ) - - # if block format requested, return blocks without further processing - if block_format: - return blocks - - # otherwise, return chunked HTML - return self.chunk_html(blocks) - - def chunk_html(self, blocks): - # combine blocks of text into html, chunked into pages to match sides of images - html = [] - page = [] - for block in blocks: - - # if there is a label and it looks like a new side, - # start a new section - if block["label"]: - if self.label_indicates_new_page(block["label"]): - # if we have any content, close the previous section - if page: - # combine all sections in the page and add to the html - html.append("\n".join(page)) - # then start a new page - page = [] - - # start output for the new block - output = ["<section>"] - # add label if we have one - if block["label"]: - output.append(f"<h1>{block['label']}</h1>") - output.append(self.lines_to_html(block["lines"])) - output.append("</section>") - page.append("\n".join(output)) - - # save the last page - html.append("\n".join(page)) - - return html
- def lines_to_html(self, lines): - """Convert lines and line numbers from TEI to HTML, accounting - for unnumbered lines and lines starting with numbers other than 1. - Converts to ordered lists and paragraphs; ordered lists have - start attribute when needed. - - :params lines: list of tuples of line number, line text - :returns: string of html content - """ - - html_lines = [] - list_num = 1 - in_list = False - for line_number, line in lines: - # convert line number to integer for comparison - if line_number: - try: - line_number = int(line_number) - except ValueError: - # in at least one instance, line number is a range "16-17" - # ignore the problem (??) - if "-" in line_number: - line_number = int(line_number.split("-")[0]) - - # if line is empty, skip it - if not line.strip(): - continue - - # if line is unnumbered, output as a paragraph - if not line_number: - # if we were in a list, close it - if in_list: - html_lines.append("</ol>") - in_list = False - list_num = 1 - html_lines.append("<p>%s</p>" % line) - - # if line number is 1, start a new list - elif line_number == 1: - # close any preceding list - if in_list: - html_lines.append("</ol>") - - in_list = True - list_num = 1 - html_lines.append("<ol>") - html_lines.append("<li>%s</li>" % line) - # if the line number matches expected next value, output as line - elif line_number == list_num: - html_lines.append("<li>%s</li>" % line) - - # if line number does not match expected list number, - # start a new list with start attribute specified - else: - # close existing list if any - if in_list: - html_lines.append("</ol>") - - # start a new list with the specified number IF numeric - if isinstance(line_number, int): - list_num = line_number - in_list = True - html_lines.append('<ol start="%d">' % line_number) - html_lines.append("<li>%s</li>" % line) - else: - # if not numeric, we can't use as line number or start - html_lines.append("<ol>") - # add the n to text to preserve the content - html_lines.append("<li>%s %s</li>" % (line_number, line)) - - # increment expected list number if we're inside a list - if in_list: - list_num += 1 - - # close the last list, if active - if in_list: - html_lines.append("</ol>") - - return "\n".join(html_lines)
- rtl_mark = "\u200F" - ltr_mark = "\u200E" - - def text_to_plaintext(self): - lines = [] - # because blocks are indicated by labels without containing elements, - # iterate over all lines and create blocks based on the labels - - # errors if there are no lines; sync transcription now checks - # and won't call in that case - if not self.text.lines: - return - - # determine longest line so we can pad the text - longest_line = max(len(str(line)) for line in self.text.lines) - # some files have descriptions that are making lines much too long, - # so set a limit on line length - if longest_line > 100: - longest_line = 100 - for line in self.text.lines: - if line.name == "label": - # blank line to indicate breaks between blocks - lines.append("") - lines.append("%s%s" % (self.ltr_mark, line)) - elif line.name == "l": - line_num = line.number or "" - # combine line text with line number and right justify; - # right justify line number - lines.append( - " ".join( - [ - self.rtl_mark, - str(line).rjust(longest_line), - self.ltr_mark, - line_num.rjust(3), - ] - ) - ) - - return "\n".join(lines)

diff --git a/geniza/corpus/templates/corpus/document_detail.html b/geniza/corpus/templates/corpus/document_detail.html index f4976e588..18d629c0c 100644 --- a/geniza/corpus/templates/corpus/document_detail.html +++ b/geniza/corpus/templates/corpus/document_detail.html @@ -74,6 +74,23 @@ {% endif %} + {% if document.dating_set.exists %} + <dt> + {# Translators: Inferred dating label #} + {% blocktranslate count counter=document.dating_set.count trimmed %} + Inferred Date + {% plural %} + Inferred Dates + {% endblocktranslate %} + </dt> + {% for date in document.dating_set.all %} + <dd> + <time> + {{ date.display_date|default:date.standard_date_display }} + </time> + </dd> + {% endfor %} + {% endif %} {% if document.languages.exists %}
    {# Translators: Primary language label #} @@ -117,13 +134,26 @@

    {% translate 'Tags' %}

{% endif %} - <dt> - {# Translators: Label for date document was first added to the PGP #} - {% translate 'Input date' %} - </dt> - <dd> - {# Translators: Date document was first added to the PGP #} - {% blocktranslate with date=document.log_entries.last.action_time.year %} - In PGP since {{ date }} - {% endblocktranslate %} - </dd> +
    @@ -140,13 +170,14 @@

    {# tertiary metadata #}

{% if is_paginated %} - {% include "corpus/snippets/pagination.html" %} + {# Translators: screen reader label for pagination navigation displayed after search results #} + {% translate "secondary pagination" as pagination_label %} + {# don't include footer pagination on random sort, since it's disabled #} + {% if form.sort.value != 'random' %} + {% include "corpus/snippets/pagination.html" with aria_label=pagination_label %} + {% endif %} {% endif %} {% endblock main %} diff --git a/geniza/corpus/templates/corpus/document_scholarship.html b/geniza/corpus/templates/corpus/document_scholarship.html index da0363f08..397bf6022 100644 --- a/geniza/corpus/templates/corpus/document_scholarship.html +++ b/geniza/corpus/templates/corpus/document_scholarship.html @@ -42,7 +42,9 @@

{{ page_title }}

diff --git a/geniza/corpus/templates/corpus/snippets/document_image_rights.html b/geniza/corpus/templates/corpus/snippets/document_image_rights.html index 6e99ccf2b..c7a99f049 100644 --- a/geniza/corpus/templates/corpus/snippets/document_image_rights.html +++ b/geniza/corpus/templates/corpus/snippets/document_image_rights.html @@ -12,12 +12,12 @@ {% endwith %} {% if fragment.attribution %} {{ fragment.attribution }} - {% elif fragment.provenance %} - {{ fragment.provenance }} + {% elif fragment.iiif_provenance %} + {{ fragment.iiif_provenance }} {% endif %} {% if fragment.manifest.license %} {% include "corpus/snippets/fragment_license_statement.html" %} - {% elif not fragment.attribution and not fragment.provenance %} + {% elif not fragment.attribution and not fragment.iiif_provenance %} {% translate "No attribution or license noted." %} {% endif %} @@ -25,15 +25,15 @@ {% endfor %} {% regroup document.fragments.all by manifest.logo as logos_list %} - {% regroup document.fragments.all by provenance as provenance_list %} + {% regroup document.fragments.all by iiif_provenance as provenance_list %}
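The template change above reflects the models.py rename earlier in this diff: Fragment now has both an editable provenance text field and a read-only iiif_provenance property derived from manifest metadata. A brief sketch (values hypothetical):

    # The two provenance sources now coexist on Fragment.
    from geniza.corpus.models import Fragment

    fragment = Fragment.objects.first()
    fragment.provenance = "Purchased by the library in 1898."  # new editable field
    fragment.save()
    # read-only property renamed from `provenance`; pulls the "Provenance"
    # entry from the fragment's IIIF manifest metadata, when available
    print(fragment.iiif_provenance)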