diff --git a/.github/workflows/lighthouse.yml b/.github/workflows/lighthouse.yml
index 9c0e14e62..7b08d9dc2 100644
--- a/.github/workflows/lighthouse.yml
+++ b/.github/workflows/lighthouse.yml
@@ -23,7 +23,7 @@ jobs:
ports:
- 8983:8983
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Decrypt encrypted fonts zip
run: |
gpg --quiet --batch --yes --decrypt --passphrase="${{ secrets.GPG_PASSPHRASE }}" --output sitemedia/fonts.zip sitemedia/fonts.zip.gpg
@@ -37,11 +37,9 @@ jobs:
docker exec -d ${{ job.services.solr.id }} cp -r /opt/solr/server/solr/configsets /var/solr/data
docker exec ${{ job.services.solr.id }} solr create -c geniza -n geniza
- run: sudo apt install gettext
- - run: echo "PYTHON_VERSION=$(cat .python-version)" >> $GITHUB_ENV
- - uses: actions/setup-python@v2
- with:
- python-version: ${{ env.PYTHON_VERSION }}
- - uses: actions/cache@v2
+ - uses: actions/setup-python@v5
+ # uses version set in .python-version
+ - uses: actions/cache@v4
with:
path: ~/.cache/pip
key: pip-${{ hashFiles('requirements/*.txt') }}
@@ -53,7 +51,7 @@ jobs:
- uses: actions/setup-node@v2
with:
node-version: 16
- - uses: actions/cache@v2
+ - uses: actions/cache@v4
with:
path: ~/.npm
key: npm-${{ hashFiles('package-lock.json') }}
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 347b4ed9d..9e7283683 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,37 @@
Change Log
==========
+4.17
+----
+
+- public site
+ - As a public site user, I would like to see date ranges separated with an en-dash (–) instead of an em-dash (—).
+ - As a front end user, I only want to see one document number for a source displayed in the scholarship records on the public site.
+ - As a frontend user, I want to see dating information displayed on document details when available, so that I can find out the time frame of a document when it is known.
+ - bugfix: Double quotes search returning unexpected results
+ - bugfix: Issues with shelfmark scoped search
+ - bugfix: Highlighting context shows entire transcription or translation in search result
+ - bugfix: Transcription search results not always formatted correctly
+ - bugfix: Bracket and other character search is functioning unpredictably
+ - bugfix: Incorrect words are highlighted in complete word quotation search (Hebrew script)
+ - bugfix: Some partial search results in description not boosted by relevancy
+ - chore: accessibility issues flagged by DubBot
+
+- image, transcription, translation viewer/editor
+ - As a transcription editor, I should see an error if I try to update an annotation with out of date content so that I don't overwrite someone else's changes.
+ - bugfix: Autofill for source search (when inputting a transcription source) not functioning properly
+
+- admin
+ - As a content editor, I want to record places-to-places relationship on the place page and on the document detail page, so that I can track ambiguity.
+ - As a content admin, I want to drop down a pin on a map and then be able to move the pin around so that I can manually adjust the coordinates of a place before saving the location.
+ - As a content editor, I want there to be a notes field in the places pages so that I can add more detail about places that are hard-to-find.
+ - As a content admin, I want a provenance field on the document detail page so that I can note the origin and acquisition history of fragments when available.
+ - As a content editor, I want clearer help text for the name field of the person page so I know how best to present people's names on their pages.
+ - As a content editor, I would like to see Historic Shelfmark on the Document edit page, to ensure that my work is correct when working with old scholarship.
+ - bugfix: Full shelfmark search for multiple shelfmarks not working in admin
+ - bugfix: Invalid lat/long coordinates are allowed for Places, but don't persist
+ - bugfix: People names are not diacritic neutral when adding them from Document Detail page
+
4.16.1
------
diff --git a/DEPLOYNOTES.md b/DEPLOYNOTES.md
index d8920acfc..b007fc9f0 100644
--- a/DEPLOYNOTES.md
+++ b/DEPLOYNOTES.md
@@ -1,5 +1,12 @@
# Deploy Notes
+## 4.17
+
+- Solr configuration has changed. Ensure Solr configset has been updated
+ and then reindex all content: `python manage.py index`
+- Configure **MAPTILER_API_TOKEN** in local settings for maps to appear.
+- Anywhere that Node versions are being managed manually, NPM should be upgraded to 8.x, at least 8.1.0.
+
## 4.16
- Import Bodleian catalog numbers from a spreadsheet using the script
diff --git a/README.rst b/README.rst
index 855d9608b..09375ea68 100644
--- a/README.rst
+++ b/README.rst
@@ -19,10 +19,6 @@ Python 3.9 / Django 3.2 / Node 16 / Postgresql / Solr 9.2
:target: https://codecov.io/gh/Princeton-CDH/geniza
:alt: Code coverage
-.. image:: https://requires.io/github/Princeton-CDH/geniza/requirements.svg?branch=main
- :target: https://requires.io/github/Princeton-CDH/geniza/requirements/?branch=main
- :alt: Requirements Status
-
.. image:: https://github.com/Princeton-CDH/geniza/workflows/dbdocs/badge.svg
:target: https://dbdocs.io/princetoncdh/geniza
:alt: dbdocs build
diff --git a/geniza/__init__.py b/geniza/__init__.py
index 509d2d231..cf15af50b 100644
--- a/geniza/__init__.py
+++ b/geniza/__init__.py
@@ -1,4 +1,4 @@
-__version_info__ = (4, 16, 1, None)
+__version_info__ = (4, 17, 0, None)
# Dot-connect all but the last. Last is dash-connected if not None.
diff --git a/geniza/admin.py b/geniza/admin.py
index ac943f53a..a2bdc7540 100644
--- a/geniza/admin.py
+++ b/geniza/admin.py
@@ -1,3 +1,4 @@
+from django.conf import settings
from django.contrib import admin
from geniza.corpus.models import Document, Fragment
@@ -29,4 +30,7 @@ def each_context(self, request):
: self.REVIEW_PREVIEW_MAX
]
+ # add maptiler token if we have one
+ context["maptiler_token"] = getattr(settings, "MAPTILER_API_TOKEN", "")
+
return context
diff --git a/geniza/annotations/conftest.py b/geniza/annotations/conftest.py
index 0945ce078..c18effbdc 100644
--- a/geniza/annotations/conftest.py
+++ b/geniza/annotations/conftest.py
@@ -62,6 +62,7 @@ def annotation_json(document, source):
}
}
},
+ "motivation": ["sc:supplementing", "transcribing"],
"dc:source": source.uri,
}
diff --git a/geniza/annotations/migrations/0006_annotation_block.py b/geniza/annotations/migrations/0006_annotation_block.py
new file mode 100644
index 000000000..56d066661
--- /dev/null
+++ b/geniza/annotations/migrations/0006_annotation_block.py
@@ -0,0 +1,23 @@
+# Generated by Django 3.2.16 on 2024-02-07 18:00
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("annotations", "0005_annotation_cleanup_nbsp"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="annotation",
+ name="block",
+ field=models.ForeignKey(
+ null=True,
+ on_delete=django.db.models.deletion.CASCADE,
+ related_name="lines",
+ to="annotations.annotation",
+ ),
+ ),
+ ]
diff --git a/geniza/annotations/models.py b/geniza/annotations/models.py
index c6c0ba8b7..e893509fc 100644
--- a/geniza/annotations/models.py
+++ b/geniza/annotations/models.py
@@ -1,3 +1,5 @@
+import hashlib
+import json
import re
import uuid
from collections import defaultdict
@@ -59,7 +61,18 @@ def group_by_manifest(self):
class Annotation(TrackChangesModel):
- """Annotation model for storing annotations in the database."""
+ """Annotation model for storing annotations in the database.
+
+ Annotations may be either block-level or line-level. Block-level annotation is the default;
+ in most cases, a block-level annotation's content is stored as a TextualBody in its `content`
+ JSON.
+
+ However, block-level annotations may also be used to group line-level annotations, in which case
+ they have no textual content themselves, except for an optional label. Instead, their content
+ is serialized by joining TextualBody values from all associated line-level annotations.
+
+ Line-level annotations are associated with blocks via the `block` property, and that relationship
+ is serialized as `partOf` at the root of the line-level annotation."""
#: annotation id (uuid, autogenerated when created)
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@@ -67,7 +80,10 @@ class Annotation(TrackChangesModel):
created = models.DateTimeField(auto_now_add=True)
#: date last modified
modified = models.DateTimeField(auto_now=True)
- #: json content of the annotation
+ #: json content of the annotation. in addition to W3C Web Annotation Data Model properties,
+ #: may also include: "schema:position", which tracks the order of the annotation with respect
+ #: to others in the same block or canvas; and "textGranularity", which indicates whether this
+ #: is a block- or line-level annotation
content = models.JSONField()
#: optional canonical identifier, for annotations imported from another source
canonical = models.CharField(max_length=255, blank=True)
@@ -79,6 +95,14 @@ class Annotation(TrackChangesModel):
on_delete=models.CASCADE,
null=False,
)
+ #: block-level annotation associated with this (line-level) annotation. if null, this is a
+ #: block-level annotation. if a block is deleted, all associated lines will be deleted.
+ block = models.ForeignKey(
+ to="Annotation",
+ on_delete=models.CASCADE,
+ related_name="lines",
+ null=True,
+ )
# use custom manager & queryset
objects = AnnotationQuerySet.as_manager()
@@ -140,6 +164,29 @@ def body_content(self):
except IndexError:
pass
+ @cached_property
+ def block_content_html(self):
+ """convenience method to get HTML content, including label and any associated lines,
+ of a block-level annotation, as a list of HTML strings"""
+ content = []
+ if self.label:
content.append(f"<h3>{self.label}</h3>")
+ if self.has_lines:
+ # if this block annotation has separate line annotations, serialize as ordered list
content.append("<ol>")
+ for l in self.lines.all().order_by("content__schema:position"):
content.append(f"<li>{l.body_content}</li>")
content.append("</ol>")
+ elif self.body_content:
+ content.append(self.body_content)
+ return content
+
+ @cached_property
+ def has_lines(self):
+ """cached property to indicate whether or not this is a block-level
+ annotation with line-level children"""
+ return self.lines.exists()
+
def set_content(self, data):
"""Set or update annotation content and model fields.
@@ -148,7 +195,7 @@ def set_content(self, data):
and the data will not be saved.
"""
# remove any values tracked on the model; redundant in json field
- for val in ["id", "created", "modified", "@context", "type"]:
+ for val in ["id", "created", "modified", "@context", "type", "etag"]:
if val in data:
del data[val]
@@ -202,6 +249,18 @@ def sanitize_html(cls, html):
else:
return cleaned_html
+ @property
+ def etag(self):
+ """Compute and return an md5 hash of content to use as an ETag.
+
+ NOTE: Only :attr:`content` can be modified in the editor, so it is the only hashed
+ attribute. If other attributes become mutable, modify this function to include them in
+ the ETag computation."""
+ # must be a string encoded as utf-8 to compute md5 hash
+ content_str = json.dumps(self.content, sort_keys=True).encode("utf-8")
+ # ETag should be wrapped in double quotes, per Django @condition docs
+ return f'"{hashlib.md5(content_str).hexdigest()}"'
+
def compile(self, include_context=True):
"""Combine annotation data and return as a dictionary that
can be serialized as JSON. Includes context by default,
@@ -213,6 +272,11 @@ def compile(self, include_context=True):
anno = {}
if include_context:
anno = {"@context": "http://www.w3.org/ns/anno.jsonld"}
+ else:
+ # NOTE: ETag required here for inclusion in annotation list, which is how
+ # annotations are fetched during editing; need to associate each ETag with
+ # an individual annotation, for comparison with response ETag on POST/DELETE
+ anno = {"etag": self.etag}
# define fields in desired order
anno.update(
@@ -246,4 +310,9 @@ def compile(self, include_context=True):
# overwrite with the base annotation data in case of any collisions
# between content and model fields
anno.update(base_anno)
+
+ # if this is a line-level annotation with block, include in content
+ if self.block:
+ anno.update({"partOf": self.block.uri()})
+
return anno
diff --git a/geniza/annotations/tests/test_annotations_models.py b/geniza/annotations/tests/test_annotations_models.py
index 4853be50d..9d1c7eabb 100644
--- a/geniza/annotations/tests/test_annotations_models.py
+++ b/geniza/annotations/tests/test_annotations_models.py
@@ -18,6 +18,27 @@ def test_get_absolute_url(self):
anno = Annotation()
assert anno.get_absolute_url() == "/annotations/%s/" % anno.pk
+ def test_etag(self, annotation):
+ old_etag = annotation.etag
+ # should be surrounded by doublequotes
+ assert old_etag[0] == old_etag[-1] == '"'
+ # should be length of an md5 hash + two characters
+ assert len(old_etag) == 34
+ # changing content should change etag
+ annotation.content.update(
+ {
+ "foo": "bar",
+ "id": "bogus",
+ "created": "yesterday",
+ "modified": "today",
+ }
+ )
+ assert annotation.etag != old_etag
+ new_etag = annotation.etag
+ # changing other properties on the annotation should not change etag
+ annotation.footnote = Footnote()
+ assert annotation.etag == new_etag
+
@pytest.mark.django_db
def test_uri(self):
anno = Annotation()
@@ -31,12 +52,13 @@ def test_set_content(self, source, document):
"id": absolutize_url("/annotations/1"),
"type": "Annotation",
"foo": "bar",
+ "etag": "abcd1234",
}
anno = Annotation(footnote=footnote)
anno.set_content(content)
# check that appropriate fields were removed
- for field in ["@context", "id", "type"]:
+ for field in ["@context", "id", "type", "etag"]:
assert field not in anno.content
# remaining content was set
assert anno.content["foo"] == "bar"
@@ -116,6 +138,18 @@ def test_compile(self, annotation):
assert compiled["canonical"] == annotation.canonical
assert compiled["via"] == annotation.via
+ line = Annotation.objects.create(
+ footnote=annotation.footnote, block=annotation, content={}
+ )
+ compiled = line.compile()
+ assert compiled["partOf"] == annotation.uri()
+
+ # when include_context=False (i.e. part of a list), should include etag, since
+ # we need a way to associate individual ETag to each item returned in list response
+ compiled = line.compile(include_context=False)
+ assert compiled["etag"] == line.etag
+ assert "@context" not in compiled
+
def test_sanitize_html(self):
html = '
test
line
'
# should strip out all unwanted elements and attributes (table, div, style)
@@ -134,6 +168,40 @@ def test_sanitize_html(self):
html = "
"
+
@pytest.mark.django_db
class TestAnnotationQuerySet:
diff --git a/geniza/annotations/tests/test_annotations_views.py b/geniza/annotations/tests/test_annotations_views.py
index 4b51fca10..88df624d3 100644
--- a/geniza/annotations/tests/test_annotations_views.py
+++ b/geniza/annotations/tests/test_annotations_views.py
@@ -8,7 +8,7 @@
from pytest_django.asserts import assertContains, assertNotContains
from geniza.annotations.models import Annotation
-from geniza.annotations.views import AnnotationResponse
+from geniza.annotations.views import AnnotationDetail, AnnotationResponse
from geniza.corpus.models import Document
from geniza.footnotes.models import Footnote, Source, SourceType
@@ -362,6 +362,55 @@ def test_corresponding_footnote_location(self, admin_client, document):
# should have its location copied from the existing Edition footnote
assert created_digital_edition.location == "doc. 123"
+ def test_get_etag(self, annotation):
+ # nonexistent annotation should return None
+ view = AnnotationDetail()
+ view.kwargs = {"pk": 1234}
+ assert view.get_etag(request=None) == None
+ # otherwise should return ETag
+ view.kwargs = {"pk": annotation.pk}
+ assert view.get_etag(request=None) == annotation.etag
+
+ def test_get_last_modified(self, annotation):
+ view = AnnotationDetail()
+ view.kwargs = {"pk": 1234}
+ # nonexistent annotation should return None
+ assert view.get_last_modified(request=None) == None
+ # otherwise should return last modified
+ view.kwargs = {"pk": annotation.pk}
+ assert view.get_last_modified(request=None) == annotation.modified
+
+ def test_dispatch(self, admin_client, annotation):
+ # integration test for conditional response
+ # get current etag
+ old_etag = annotation.etag
+
+ # change content
+ annotation.content = {
+ **annotation.content,
+ "body": [{"value": "changed"}],
+ }
+ annotation.save()
+
+ # attempt to update annotation with POST request as admin, including the old
+ # ETag in If-Match header
+ response = admin_client.post(
+ annotation.get_absolute_url(),
+ json.dumps(annotation.content),
+ content_type="application/json",
+ HTTP_IF_MATCH=old_etag,
+ )
+ # should result in 412 Precondition Failed status
+ assert response.status_code == 412
+
+ # attempt to delete annotation with DELETE request as admin, including the old
+ # ETag in If-Match header
+ response = admin_client.delete(
+ annotation.get_absolute_url(), HTTP_IF_MATCH=old_etag
+ )
+ # should result in 412 Precondition Failed status
+ assert response.status_code == 412
+
@pytest.mark.django_db
class TestAnnotationSearch:
diff --git a/geniza/annotations/views.py b/geniza/annotations/views.py
index 212eda569..b13ed7dfe 100644
--- a/geniza/annotations/views.py
+++ b/geniza/annotations/views.py
@@ -6,7 +6,8 @@
from django.contrib.auth.mixins import AccessMixin, PermissionRequiredMixin
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import BadRequest
-from django.http import HttpResponse, JsonResponse
+from django.http import Http404, HttpResponse, JsonResponse
+from django.views.decorators.http import condition
from django.views.generic.base import View
from django.views.generic.detail import SingleObjectMixin
from django.views.generic.list import MultipleObjectMixin
@@ -269,7 +270,10 @@ def get(self, request, *args, **kwargs):
class AnnotationDetail(
- PermissionRequiredMixin, ApiAccessMixin, View, SingleObjectMixin
+ PermissionRequiredMixin,
+ ApiAccessMixin,
+ View,
+ SingleObjectMixin,
):
"""View to read, update, or delete a single annotation."""
@@ -293,7 +297,6 @@ def get_permission_required(self):
def post(self, request, *args, **kwargs):
"""update the annotation on POST"""
- # NOTE: should use etag / if-match
anno = self.get_object()
try:
anno_data = parse_annotation_data(request=request)
@@ -316,7 +319,6 @@ def post(self, request, *args, **kwargs):
def delete(self, request, *args, **kwargs):
"""delete the annotation on DELETE"""
- # should use etag / if-match
# deleted uuid should not be reused (relying on low likelihood of uuid collision)
anno = self.get_object()
# create log entry to document deletion *BEFORE* deleting
@@ -352,3 +354,33 @@ def delete(self, request, *args, **kwargs):
footnote.refresh_from_db()
return HttpResponse(status=204)
+
+ def get_etag(self, request, *args, **kwargs):
+ """Get etag from annotation"""
+ try:
+ if not hasattr(self, "object"):
+ self.object = self.get_object()
+ anno = self.object
+ return anno.etag
+ except Http404:
+ return None
+
+ def get_last_modified(self, request, *args, **kwargs):
+ """Return last modified :class:`datetime.datetime`"""
+ try:
+ if not hasattr(self, "object"):
+ self.object = self.get_object()
+ anno = self.object
+ return anno.modified
+ except Http404:
+ return None
+
+ def dispatch(self, request, *args, **kwargs):
+ """Wrap the dispatch method to add ETag/last modified headers when
+ appropriate, then return a conditional response."""
+
+ @condition(etag_func=self.get_etag, last_modified_func=self.get_last_modified)
+ def _dispatch(request, *args, **kwargs):
+ return super(AnnotationDetail, self).dispatch(request, *args, **kwargs)
+
+ return _dispatch(request, *args, **kwargs)
diff --git a/geniza/common/migrations/0009_install_unaccent.py b/geniza/common/migrations/0009_install_unaccent.py
new file mode 100644
index 000000000..2687596ae
--- /dev/null
+++ b/geniza/common/migrations/0009_install_unaccent.py
@@ -0,0 +1,14 @@
+# Generated by Django 3.2.23 on 2024-02-29 18:02
+
+from django.contrib.postgres.operations import UnaccentExtension
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("common", "0008_preload_github_coauthors"),
+ ]
+
+ operations = [
+ UnaccentExtension(),
+ ]
diff --git a/geniza/corpus/admin.py b/geniza/corpus/admin.py
index 8d079e35b..17308e437 100644
--- a/geniza/corpus/admin.py
+++ b/geniza/corpus/admin.py
@@ -18,12 +18,14 @@
from geniza.annotations.models import Annotation
from geniza.common.admin import custom_empty_field_list_filter
-from geniza.corpus.dates import DocumentDateMixin
+from geniza.corpus.dates import DocumentDateMixin, standard_date_display
+from geniza.corpus.forms import DocumentPersonForm, DocumentPlaceForm
from geniza.corpus.metadata_export import AdminDocumentExporter, AdminFragmentExporter
from geniza.corpus.models import (
Collection,
Dating,
Document,
+ DocumentEventRelation,
DocumentType,
Fragment,
LanguageScript,
@@ -32,7 +34,7 @@
from geniza.corpus.solr_queryset import DocumentSolrQuerySet
from geniza.corpus.views import DocumentMerge
from geniza.entities.admin import PersonInline, PlaceInline
-from geniza.entities.models import DocumentPlaceRelation, PersonDocumentRelation
+from geniza.entities.models import DocumentPlaceRelation, Event, PersonDocumentRelation
from geniza.footnotes.admin import DocumentFootnoteInline
from geniza.footnotes.models import Footnote
@@ -131,6 +133,7 @@ class DocumentTextBlockInline(SortableInlineAdminMixin, admin.TabularInline):
readonly_fields = (
"thumbnail",
"side",
+ "fragment_provenance",
)
fields = (
"fragment",
@@ -139,6 +142,7 @@ class DocumentTextBlockInline(SortableInlineAdminMixin, admin.TabularInline):
"region",
"order",
"certain",
+ "fragment_provenance",
"thumbnail",
"selected_images",
)
@@ -149,6 +153,10 @@ class DocumentTextBlockInline(SortableInlineAdminMixin, admin.TabularInline):
ArrayField: {"widget": HiddenInput()}, # hidden input for selected_images
}
+ @admin.display(description="Provenance")
+ def fragment_provenance(self, obj):
+ return obj.fragment.provenance
+
class DocumentForm(forms.ModelForm):
class Meta:
@@ -358,12 +366,30 @@ class DocumentPersonInline(PersonInline):
"""Inline for people related to a document"""
model = PersonDocumentRelation
+ form = DocumentPersonForm
class DocumentPlaceInline(PlaceInline):
"""Inline for places related to a document"""
model = DocumentPlaceRelation
+ form = DocumentPlaceForm
+
+
+class DocumentEventInline(admin.TabularInline):
+ """Inline for events related to a document"""
+
+ autocomplete_fields = ("event",)
+ fields = ("event", "notes")
+ model = DocumentEventRelation
+ min_num = 0
+ extra = 1
+ show_change_link = True
+ verbose_name = "Related Event"
+ verbose_name_plural = "Related Events"
+ formfield_overrides = {
+ TextField: {"widget": Textarea(attrs={"rows": "4"})},
+ }
@admin.register(Document)
@@ -392,6 +418,7 @@ class DocumentAdmin(TabbedTranslationAdmin, SortableAdminBase, admin.ModelAdmin)
"view_old_pgpids",
"standard_date",
"admin_thumbnails",
+ "fragment_historical_shelfmarks",
)
search_fields = (
"fragments__shelfmark",
@@ -410,6 +437,12 @@ class DocumentAdmin(TabbedTranslationAdmin, SortableAdminBase, admin.ModelAdmin)
def view_old_pgpids(self, obj):
return ",".join([str(pid) for pid in obj.old_pgpids]) if obj.old_pgpids else "-"
+ @admin.display(
+ description="Standard date",
+ )
+ def standard_date(self, obj):
+ return standard_date_display(obj.doc_date_standard)
+
list_filter = (
"doctype",
HasTranscriptionListFilter,
@@ -437,7 +470,12 @@ def view_old_pgpids(self, obj):
None,
{
"fields": (
- ("shelfmark", "id", "view_old_pgpids"),
+ (
+ "shelfmark",
+ "id",
+ "view_old_pgpids",
+ "fragment_historical_shelfmarks",
+ ),
"shelfmark_override",
"doctype",
("languages", "secondary_languages"),
@@ -481,6 +519,7 @@ def view_old_pgpids(self, obj):
DocumentFootnoteInline,
DocumentPersonInline,
DocumentPlaceInline,
+ DocumentEventInline,
]
# mixed fieldsets and inlines: /templates/admin/snippets/mixed_inlines_fieldsets.html
fieldsets_and_inlines_order = (
@@ -493,6 +532,7 @@ def view_old_pgpids(self, obj):
"i", # DocumentFootnoteInline
"i", # DocumentPersonInline
"i", # DocumentPlaceInline
+ "i", # DocumentEventInline
)
class Media:
@@ -714,7 +754,7 @@ class Media:
class FragmentAdmin(admin.ModelAdmin):
list_display = ("shelfmark", "collection_display", "url", "is_multifragment")
search_fields = ("shelfmark", "old_shelfmarks", "notes", "needs_review")
- readonly_fields = ("created", "last_modified")
+ readonly_fields = ("created", "last_modified", "iiif_provenance")
list_filter = (
("url", custom_empty_field_list_filter("IIIF image", "Has image", "No image")),
(
@@ -731,6 +771,8 @@ class FragmentAdmin(admin.ModelAdmin):
"collection",
("url", "iiif_url"),
"is_multifragment",
+ "provenance",
+ "iiif_provenance",
"notes",
"needs_review",
("created", "last_modified"),
diff --git a/geniza/corpus/annotation_export.py b/geniza/corpus/annotation_export.py
index a21bac17a..464e179c0 100644
--- a/geniza/corpus/annotation_export.py
+++ b/geniza/corpus/annotation_export.py
@@ -379,7 +379,9 @@ def filename(document, source, fn_type):
# filename based on pgpid and source authors;
# explicitly label as transcription/translation for context
authors = [a.creator.last_name for a in source.authorship_set.all()] or [
- "unknown author"
+ "machine-generated"
+ if "model" in source.source_type.type
+ else "unknown author"
]
return "PGPID%(pgpid)s_s%(source_id)d_%(authors)s_%(text_type)s" % {
diff --git a/geniza/corpus/dates.py b/geniza/corpus/dates.py
index 28a42e1f9..fafade4b6 100644
--- a/geniza/corpus/dates.py
+++ b/geniza/corpus/dates.py
@@ -126,6 +126,27 @@ def numeric_format(self, mode="min"):
See :meth:`isoformat` for more details."""
return self.isoformat(mode, "numeric")
+ @staticmethod
+ def get_date_range(old_range, new_range):
+ """Compute the union (widest possible date range) between two PartialDate ranges."""
+ minmax = old_range
+ [start, end] = new_range
+
+ # use numeric format to compare to current min, replace if smaller
+ start_numeric = int(start.numeric_format(mode="min"))
+ min = minmax[0]
+ if min is None or start_numeric < int(min.numeric_format(mode="min")):
+ # store as PartialDate, not numeric format
+ minmax[0] = start
+ # use numeric format to compare to current max, replace if larger
+ end_numeric = int(end.numeric_format(mode="max"))
+ max = minmax[1]
+ if max is None or end_numeric > int(max.numeric_format(mode="max")):
+ # store as PartialDate, not numeric format
+ minmax[1] = end
+
+ return minmax
+
class DocumentDateMixin(TrackChangesModel):
"""Mixin for document date fields (original and standardized),
@@ -183,31 +204,11 @@ def original_date(self):
[self.doc_date_original, self.get_doc_date_calendar_display()]
).strip()
- @property
- def standard_date(self):
- """Display standard date in human readable format, when set."""
- # bail out if there is nothing to display
- if not self.doc_date_standard:
- return
-
- # currently storing in isoformat, with slash if a date range
- dates = self.doc_date_standard.split("/")
- # we should always have at least one date, if date is set
- # convert to local partial date object for precision-aware string formatting
- # join dates with n-dash if more than one;
- # add CE to the end to make calendar system explicit
- try:
- return "%s CE" % " — ".join(str(PartialDate(d)) for d in dates)
- except ValueError:
- # dates entered before validation was applied may not parse
- # as fallback, display as is
- return "%s CE" % self.doc_date_standard
-
@property
def document_date(self):
"""Generate formatted display of combined original and standardized dates"""
if self.doc_date_standard:
- standardized_date = self.standard_date
+ standardized_date = standard_date_display(self.doc_date_standard)
# add parentheses to standardized date if original date is also present
if self.original_date:
# NOTE: we want no-wrap for individual dates when displaying as html
@@ -528,3 +529,23 @@ def get_islamic_month(month_name):
with or without accents, and supports local month-name overrides."""
month_name = unidecode(month_name)
return islamic_months.index(islamic_month_aliases.get(month_name, month_name)) + 1
+
+
+def standard_date_display(standard_date):
+ """Display a standardized CE date in human readable format."""
+ # bail out if there is nothing to display
+ if not standard_date:
+ return
+
+ # currently storing in isoformat, with slash if a date range
+ dates = standard_date.split("/")
+ # we should always have at least one date, if date is set
+ # convert to local partial date object for precision-aware string formatting
+ # join dates with en-dash if more than one;
+ # add CE to the end to make calendar system explicit
+ try:
+ return "%s CE" % " – ".join(str(PartialDate(d)) for d in dates)
+ except ValueError:
+ # dates entered before validation was applied may not parse
+ # as fallback, display as is
+ return "%s CE" % standard_date
diff --git a/geniza/corpus/forms.py b/geniza/corpus/forms.py
index ffe421393..400a5c1e9 100644
--- a/geniza/corpus/forms.py
+++ b/geniza/corpus/forms.py
@@ -1,3 +1,4 @@
+from dal import autocomplete
from django import forms
from django.db.models import Count
from django.template.loader import get_template
@@ -8,6 +9,7 @@
from geniza.common.fields import RangeField, RangeForm, RangeWidget
from geniza.common.utils import simplify_quotes
from geniza.corpus.models import Document, DocumentType
+from geniza.entities.models import DocumentPlaceRelation, PersonDocumentRelation
class SelectDisabledMixin:
@@ -153,7 +155,6 @@ class YearRangeWidget(RangeWidget):
class DocumentSearchForm(RangeForm):
-
q = forms.CharField(
label="Keyword or Phrase",
required=False,
@@ -413,3 +414,33 @@ def __init__(self, *args, **kwargs):
).annotate(
item_count=Count("taggit_taggeditem_items", distinct=True),
)
+
+
+class DocumentPersonForm(forms.ModelForm):
+ class Meta:
+ model = PersonDocumentRelation
+ fields = (
+ "person",
+ "type",
+ "notes",
+ )
+ widgets = {
+ "notes": forms.Textarea(attrs={"rows": 4}),
+ "person": autocomplete.ModelSelect2(url="entities:person-autocomplete"),
+ "type": autocomplete.ModelSelect2(),
+ }
+
+
+class DocumentPlaceForm(forms.ModelForm):
+ class Meta:
+ model = DocumentPlaceRelation
+ fields = (
+ "place",
+ "type",
+ "notes",
+ )
+ widgets = {
+ "notes": forms.Textarea(attrs={"rows": 4}),
+ "place": autocomplete.ModelSelect2(url="entities:place-autocomplete"),
+ "type": autocomplete.ModelSelect2(),
+ }
diff --git a/geniza/corpus/management/commands/add_cat_numbers.py b/geniza/corpus/management/commands/add_cat_numbers.py
deleted file mode 100644
index 54a6ec243..000000000
--- a/geniza/corpus/management/commands/add_cat_numbers.py
+++ /dev/null
@@ -1,44 +0,0 @@
-"""
-Script to add catalog numbers to historical shelfmarks for some Bodleian
-records. This is a one-time script and should be removed after the import is
-completed in production.
-
-Intended to be run manually from the shell as follows:
-./manage.py add_cat_numbers historical_shelfmarks.csv
-"""
-
-import csv
-import re
-
-from django.core.management.base import BaseCommand
-
-from geniza.corpus.models import Fragment
-
-
-class Command(BaseCommand):
- """Import catalog numbers into Fragment records in the local database."""
-
- bodl_regex = r"^Bodl\. MS Heb\. (?P<letter>[A-Za-z]) (?P<number>\d+),"
-
- def add_arguments(self, parser):
- parser.add_argument("path", help="Path to a CSV file")
-
- def handle(self, *args, **kwargs):
- with open(kwargs.get("path"), newline="") as csvfile:
- reader = csv.DictReader(csvfile)
- for row in reader:
- cat_number = row["catalog no. (Bodl. historical shelfmarks)"]
- if cat_number:
- try:
- frag = Fragment.objects.get(pk=int(row["id"]))
- except Fragment.DoesNotExist:
- print(f"Error: cannot find fragment with id {row['id']}")
- continue
-
- # Bodl. MS heb. b 12/6
- # --> data migration --> Bodl. MS Heb. b 12, f. 6
- # --> this script --> Bodl. MS Heb. b 12 (Cat. 2875), f. 6
- hist_repl = f"Bodl. MS Heb. \g<letter> \g<number> (Cat. {cat_number}),"
- hist = re.sub(self.bodl_regex, hist_repl, frag.old_shelfmarks)
- frag.old_shelfmarks = hist
- frag.save()
diff --git a/geniza/corpus/management/commands/escr_alto_to_annotation.py b/geniza/corpus/management/commands/escr_alto_to_annotation.py
index a901224df..7e9e9ecd5 100644
--- a/geniza/corpus/management/commands/escr_alto_to_annotation.py
+++ b/geniza/corpus/management/commands/escr_alto_to_annotation.py
@@ -8,6 +8,7 @@
from django.db.models import Q
from djiffy.models import Canvas, Manifest
from eulxml import xmlmap
+from parasolr.django.signals import IndexableSignalHandler
from geniza.annotations.models import Annotation
from geniza.corpus.models import Document
@@ -27,7 +28,9 @@ class AltoPolygonalObject(AltoObject):
class Line(AltoPolygonalObject):
+ id = xmlmap.StringField("./@ID")
content = xmlmap.StringField("alto:String/@CONTENT")
+ line_type_id = xmlmap.StringField("./@TAGREFS")
class TextBlock(AltoPolygonalObject):
@@ -59,6 +62,17 @@ class Command(BaseCommand):
# regex pattern for image filenames
    filename_pattern = r"PGPID_(?P<pgpid>\d+)_(?P<shelfmark>[\w\-]+)_(?P<page>\d)\..+"
+ # tags used for rotated blocks and lines
+ rotation_tags = [
+ "Oblique_45", # 45°
+ "Vertical_Bottom_Up_90", # 90°
+ "Oblique_135", # 135°
+ "Upside_Down", # 180°
+ "Oblique_225", # 225°
+ "Vertical_Top_Down_270", # 270°
+ "Oblique_315", # 315°
+ ]
+
def add_arguments(self, parser):
# needs xml filenames as input
parser.add_argument(
@@ -68,10 +82,16 @@ def add_arguments(self, parser):
def handle(self, *args, **options):
self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
+ # store content type pk for logentry
+ self.anno_contenttype = ContentType.objects.get_for_model(Annotation).pk
+
# lists for reporting
self.document_errors = set()
self.canvas_errors = set()
+ # disconnect solr indexing signals; this script will index annotations manually
+ IndexableSignalHandler.disconnect()
+
# process all files
for xmlfile in options["alto"]:
self.stdout.write("Processing %s" % xmlfile)
@@ -128,7 +148,7 @@ def ingest_xml(self, xmlfile):
)
# create annotations
- for tb in alto.printspace.textblocks:
+ for tb_idx, tb in enumerate(alto.printspace.textblocks, start=1):
block_type = None
if tb.block_type_id:
# find first tag in tag list whose id matches block type id
@@ -136,26 +156,57 @@ def ingest_xml(self, xmlfile):
tag = next(tag_matches, None)
if tag:
block_type = tag.label
- # TODO: When implementing line-by-line, use block_type to determine rotation
# skip arabic; these are Hebrew script transcriptions
if not (block_type and "Arabic" in block_type) and len(tb.lines):
# get or create footnote
footnote = self.get_footnote(doc)
# create annotation and log entry
- anno = Annotation.objects.create(
- content=self.create_block_annotation(tb, canvas_uri, scale_factor),
+ block = Annotation.objects.create(
+ content=self.create_block_annotation(
+ tb, canvas_uri, scale_factor, block_type, tb_idx
+ ),
footnote=footnote,
)
LogEntry.objects.log_action(
user_id=self.script_user.pk,
- content_type_id=ContentType.objects.get_for_model(Annotation).pk,
- object_id=anno.pk,
- object_repr=str(anno),
- change_message="Imported from eScriptorium HTR ALTO",
+ content_type_id=self.anno_contenttype,
+ object_id=block.pk,
+ object_repr=str(block),
+ change_message="Imported block from eScriptorium HTR ALTO",
action_flag=ADDITION,
)
+ # create line annotations from lines and link to block
+ for i, line in enumerate(tb.lines, start=1):
+ line_type = None
+ if line.line_type_id:
+ # find first tag in tag list whose id matches line type id
+ tag_matches = filter(
+ lambda t: t.id == line.line_type_id, alto.tags
+ )
+ tag = next(tag_matches, None)
+ if tag:
+ line_type = tag
+ line_anno = Annotation.objects.create(
+ content=self.create_line_annotation(
+ line, block, scale_factor, line_type, order=i
+ ),
+ block=block,
+ footnote=footnote,
+ )
+ LogEntry.objects.log_action(
+ user_id=self.script_user.pk,
+ content_type_id=self.anno_contenttype,
+ object_id=line_anno.pk,
+ object_repr=str(line_anno),
+ change_message="Imported line from eScriptorium HTR ALTO",
+ action_flag=ADDITION,
+ )
+
+ # index after all blocks added
+ doc.index()
+
def get_manifest(self, document, short_id, filename):
"""Attempt to get the manifest using the supplied short id; fallback to first manifest,
or return None if there are none on the document"""
@@ -232,26 +283,16 @@ def scale_polygon(self, polygon, scale):
# return as string for use in svg polygon element
return " ".join([str(point) for point in scaled_points])
- def create_block_annotation(self, textblock, canvas_uri, scale_factor):
+ def create_block_annotation(
+ self, textblock, canvas_uri, scale_factor, block_type, order
+ ):
"""Produce a valid IIIF annotation with the block-level content and geometry,
linked to the IIIF canvas by URI"""
- # lines to HTML list
- block_text = "<ol>\n"
- for line in textblock.lines:
- block_text += f"<li>{line.content}</li>\n"
- block_text += "</ol>"
-
# create IIIF annotation
anno_content = {}
- anno_content["body"] = [
- {
- "TextInput": "rtl",
- "format": "text/html",
- "type": "TextualBody",
- "value": block_text,
- }
- ]
+ anno_content["schema:position"] = order
+ anno_content["textGranularity"] = "block"
anno_content["motivation"] = ["sc:supplementing", "transcribing"]
anno_content["target"] = {
"source": {
@@ -259,6 +300,15 @@ def create_block_annotation(self, textblock, canvas_uri, scale_factor):
"type": "Canvas",
},
}
+ if block_type:
+ anno_content["body"] = [
+ {
+ "label": block_type,
+ }
+ ]
+ if block_type in self.rotation_tags:
+ # add rotation tag as a CSS class to this block
+ anno_content["target"]["styleClass"] = block_type
# add selector
if textblock.polygon:
@@ -278,3 +328,40 @@ def create_block_annotation(self, textblock, canvas_uri, scale_factor):
}
return anno_content
+
+ def create_line_annotation(self, line, block_anno, scale_factor, line_type, order):
+ # create IIIF annotation
+ anno_content = {}
+ anno_content["schema:position"] = order
+ anno_content["body"] = [
+ {
+ "TextInput": "rtl",
+ "format": "text/html",
+ "type": "TextualBody",
+ "value": line.content,
+ }
+ ]
+ anno_content["textGranularity"] = "line"
+ anno_content["motivation"] = block_anno.content["motivation"]
+ anno_content["target"] = {"source": block_anno.content["target"]["source"]}
+ if line_type and line_type in self.rotation_tags:
+ # add rotation tag as a CSS class to this line (sometimes differs from block)
+ anno_content["target"]["styleClass"] = line_type
+ elif "styleClass" in block_anno.content["target"]:
+ # if block has rotation but line doesn't, use block's rotation
+ anno_content["target"]["styleClass"] = block_anno.content["target"][
+ "styleClass"
+ ]
+
+ # add selector
+ if line.polygon:
+ # scale polygon points and use SvgSelector
+ points = self.scale_polygon(line.polygon, scale_factor)
+ anno_content["target"]["selector"] = {
+ "type": "SvgSelector",
+ "value": f'<svg><polygon points="{points}"></polygon></svg>',
+ }
+ else:
+ self.stdout.write(f"No line-level geometry available for {line.id}")
+
+ return anno_content
diff --git a/geniza/corpus/management/commands/sync_transcriptions.py b/geniza/corpus/management/commands/sync_transcriptions.py
deleted file mode 100644
index 7a307a166..000000000
--- a/geniza/corpus/management/commands/sync_transcriptions.py
+++ /dev/null
@@ -1,393 +0,0 @@
-"""
-Script to synchronize transcription content from PGP v3 TEI files
-to an _interim_ html format in the database.
-
-The script checks out and updates the transcription files from the
-git repository, and then loops through all xml files and
-identifies the document and footnote to update, if possible.
-
-"""
-
-import glob
-import os.path
-from collections import defaultdict
-
-from django.conf import settings
-from django.contrib.admin.models import ADDITION, CHANGE, LogEntry
-from django.contrib.auth.models import User
-from django.contrib.contenttypes.models import ContentType
-from django.core.management.base import BaseCommand, CommandError
-from django.db import models
-from django.urls import reverse
-from eulxml import xmlmap
-from git import Repo
-
-from geniza.common.utils import absolutize_url
-from geniza.corpus.models import Document
-from geniza.corpus.tei_transcriptions import GenizaTei
-from geniza.footnotes.models import Footnote, Source
-
-
-class Command(BaseCommand):
- """Synchronize TEI transcriptions to edition footnote content"""
-
- v_normal = 1 # default verbosity
-
- def add_arguments(self, parser):
- parser.add_argument(
- "-n",
- "--noact",
- action="store_true",
- help="Do not save changes to the database",
- )
- parser.add_argument("files", nargs="*", help="Only sync the specified files.")
-
- # dict of footnotes that have been updated with list of TEI files, to track/prevent
- # TEI files resolving incorrectly to the same edition
- footnotes_updated = defaultdict(list)
-
- # keep track of document ids with multiple digitized editions (likely merged records/joins)
- multiedition_docs = set()
-
- def handle(self, *args, **options):
- # get settings for remote git repository url and local path
- gitrepo_url = settings.TEI_TRANSCRIPTIONS_GITREPO
- gitrepo_path = settings.TEI_TRANSCRIPTIONS_LOCAL_PATH
-
- self.verbosity = options["verbosity"]
- self.noact_mode = options["noact"]
-
- # make sure we have latest tei content from git repository
- self.sync_git(gitrepo_url, gitrepo_path)
-
- if not self.noact_mode:
- # get content type and user for log entries, unless in no-act mode
- self.footnote_contenttype = ContentType.objects.get_for_model(Footnote)
- self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
-
- self.stats = defaultdict(int)
- # after creating missing goitein unpublished edition notes, these will not be created again
- self.stats["footnote_created"] = 0
- # duplicates might not always happen
- self.stats["duplicate_footnote"] = 0
- # updates should not happen after initial sync when there are no TEI changes
- self.stats["footnote_updated"] = 0
- # empty tei may not happen when running on a subset
- self.stats["empty_tei"] = 0
- self.stats["document_not_found"] = 0
- self.stats["joins"] = 0
- self.stats["no_edition"] = 0
- self.stats["one_edition"] = 0
- self.stats["multiple_editions_with_content"] = 0
- # keep track of document ids with multiple digitized editions (likely merged records/joins)
- self.multiedition_docs = set()
-
- # iterate through all tei files in the repository OR specified files
- xmlfiles = options["files"] or glob.iglob(os.path.join(gitrepo_path, "*.xml"))
- for xmlfile in xmlfiles:
- self.stats["xml"] += 1
- xmlfile_basename = os.path.basename(xmlfile)
-
- tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
- # some files are stubs with no content
- # check if the tei is ok to proceed; (e.g., empty or only translation content)
- # if empty, report and skip
- if not self.check_tei(tei, xmlfile):
- continue
-
- # get the document for the file based on id / old id
- doc = self.get_pgp_document(xmlfile_basename)
- # if document was not found, skip
- if not doc:
- continue
-
- if doc.fragments.count() > 1:
- self.stats["joins"] += 1
-
- footnote = self.get_edition_footnote(doc, tei, xmlfile)
- # if we identified an appropriate footnote, update it
- if footnote:
- # if this footnote has already been chosen in the current script run, don't update again
- if self.footnotes_updated[footnote.pk]:
- self.stderr.write(
- "Footnote %s (PGPID %s) already updated with %s; not overwriting with %s"
- % (
- footnote.pk,
- doc.pk,
- ";".join(self.footnotes_updated[footnote.pk]),
- xmlfile,
- )
- )
- self.stats["duplicate_footnote"] += 1
- else:
- self.footnotes_updated[footnote.pk].append(xmlfile)
-
- # convert into html, return in a list of blocks per inferred page/image
- html_pages = tei.text_to_html()
- text = tei.text_to_plaintext()
-
- # if no html was generated, stop processing
- if not html_pages:
- if self.verbosity >= self.v_normal:
- self.stderr.write("No html generated for %s" % doc.id)
- continue
-
- html = {}
- # assign each page of html to a canvas based on sequence,
- # skipping any non-document images
- for i, image in enumerate(doc.iiif_images(filter_side=True)):
- # stop iterating through images when we run out of pages
- if not html_pages:
- break
- # pop the first page of html off the list
- # and assign to the image canvas uri
- html[image["canvas"]] = html_pages.pop(0)
-
- # if there are any html pages left
- # (either document does not have any iiif images, or not all images)
- # generate local canvas uris and attach transcription content
- if html_pages:
- # document manifest url is /documents/pgpid/iiif/manifest/
- # create canvas uris parallel to that
- canvas_base_uri = "%siiif/canvas/" % doc.permalink
- # iterate through any remaining pages and assign to local canvas uris
- for i, html_chunk in enumerate(html_pages):
- canvas_uri = "%s%d/" % (canvas_base_uri, i)
- html[canvas_uri] = html_chunk
-
- footnote.content = {"html": html, "text": text}
- if footnote.has_changed("content"):
- # don't actually save in --noact mode
- if not self.noact_mode:
- footnote.save()
- # create a log entry to document the change
- self.log_footnote_update(
- footnote, os.path.basename(xmlfile)
- )
-
- # count as a change whether in no-act mode or not
- self.stats["footnote_updated"] += 1
-
- # NOTE: in *one* case there is a TEI file with translation content and
- # no transcription; will get reported as empty, but that's ok — it's out of scope
- # for this script and should be handled elsewhere.
-
- # report on what was done
- # include total number of transcription files,
- # documents with transcriptions, number of fragments, and how how many joins
- self.stats["multi_edition_docs"] = len(self.multiedition_docs)
- self.stdout.write(
- """Processed {xml:,} TEI/XML files; skipped {empty_tei:,} TEI files with no transcription content.
-{document_not_found:,} documents not found in database.
-{joins:,} documents with multiple fragments.
-{multiple_editions:,} documents with multiple editions; {multiple_editions_with_content} multiple editions with content ({multi_edition_docs} unique documents).
-{no_edition:,} documents with no edition.
-{one_edition:,} documents with one edition.
-Updated {footnote_updated:,} footnotes (created {footnote_created:,}; skipped overwriting {duplicate_footnote}).
-""".format(
- **self.stats
- )
- )
-
- for footnote_id, xmlfiles in self.footnotes_updated.items():
- if len(xmlfiles) > 1:
- self.stderr.write(
- "Footnote pk %s updated more than once: %s"
- % (footnote_id, ";".join(xmlfiles))
- )
-
- def check_tei(self, tei, xmlfile):
- """Check TEI and report if it is empty, labels only, or has no content.
-
- :param tei: xmlmap tei instance to check; :class:`~geniza.corpus.tei_transcriptions.GenizaTei`
- :param xmlfile: xml filename, for reporting
- :returns: True if check passes; False if the TEI should be skipped.
- :rtype: bool
- """
- # some files are stubs with no content
- # check if there is no text content; report and return true or false
- if tei.no_content():
- if self.verbosity >= self.v_normal:
- self.stdout.write("%s has no text content, skipping" % xmlfile)
- self.stats["empty_tei"] += 1
- return False
- elif tei.labels_only():
- if self.verbosity >= self.v_normal:
- self.stdout.write(
- "%s has labels only, no other text content; skipping" % xmlfile
- )
- self.stats["empty_tei"] += 1
- return False
- elif not tei.text.lines:
- self.stdout.write("%s has no lines (translation?), skipping" % xmlfile)
- self.stats["empty_tei"] += 1
- return False
-
- return True
-
- def get_pgp_document(self, xmlfile_basename):
- """Find the PGP document for the specified TEI file, based on filename,
- if possible.
-
- :returns: instance of :class:`~geniza.corpus.models.Document` or None if not found
- """
-
- # get the document id from the filename (####.xml)
- pgpid = os.path.splitext(xmlfile_basename)[0]
- # in ONE case there is a duplicate id with b suffix on the second
- try:
- pgpid = int(pgpid.strip("b"))
- except ValueError:
- if self.verbosity >= self.v_normal:
- self.stderr.write("Failed to generate integer PGPID from %s" % pgpid)
- return
- # can we rely on pgpid from xml?
- # but in some cases, it looks like a join 12047 + 12351
-
- # find the document in the database
- try:
- Document.objects.get_by_any_pgpid(pgpid)
- except Document.DoesNotExist:
- self.stats["document_not_found"] += 1
- if self.verbosity >= self.v_normal:
- self.stdout.write("Document %s not found in database" % pgpid)
- return
-
- def get_footnote_editions(self, doc):
- """Get all edition footnotes of a document; used by :meth:`get_edition_footnote`,
- extend to include digital editions in tei to annotation script."""
- return doc.footnotes.editions()
-
- def get_edition_footnote(self, doc, tei, filename):
- """identify the edition footnote to be updated"""
- # get editions for this document
- editions = self.get_footnote_editions(doc)
-
- if editions.count() > 1:
- self.stats["multiple_editions"] += 1
-
- # when there are multiple, try to identify correct edition by author names
- footnote = self.choose_edition_by_authors(tei, editions, doc)
- # if we got a match, use it
- if footnote:
- return footnote
-
- # if not, limit to editions with content and try again
- editions_with_content = editions.filter(content__isnull=False)
- footnote = self.choose_edition_by_authors(tei, editions_with_content, doc)
- if footnote:
- return footnote
-
- # if not, fallback to first edition
- if editions_with_content.count() == 1:
- self.stats["multiple_editions_with_content"] += 1
- self.multiedition_docs.add(doc.id)
-
- # if there was only one, assume it's the one to update
- # NOTE: this is potentially wrong!
- return editions_with_content.first()
-
- if not editions.exists():
- # no editions found; check if we can create a goitein unpublished edition footnote
- footnote = self.is_it_goitein(tei, doc)
- if footnote:
- return footnote
-
- self.stats["no_edition"] += 1
- if self.verbosity > self.v_normal:
- self.stdout.write("No edition found for %s" % filename)
- for line in tei.source:
- self.stdout.write("\t%s" % line)
- else:
- self.stats["one_edition"] += 1
- # if only one edition, update the transciption content there
- return editions.first()
-
- def choose_edition_by_authors(self, tei, editions, doc):
- """Try to choose correct edition from a list based on author names;
- based on structured author names in the TEI"""
- if tei.source_authors:
- tei_authors = set(tei.source_authors)
- author_matches = []
- for ed in editions:
- ed_authors = set([auth.last_name for auth in ed.source.authors.all()])
- if ed_authors == tei_authors:
- author_matches.append(ed)
-
- # if we got exactly one match, use that edition
- if len(author_matches) == 1:
- return author_matches[0]
-
- # if there were *no* author matches, see if we can create a goitein unpublished edition note
- if not author_matches:
- return self.is_it_goitein(tei, doc)
-
- def is_it_goitein(self, tei, doc):
- """Check if a TEI document is a Goitein edition. If no edition exists
- and we can identify based on the TEI as a Goitein unpublished edition,
- then create a new footnote."""
- source_info = str(tei.source[0]).lower()
- if "goitein" in source_info and (
- "unpublished editions" in source_info or "typed texts" in source_info
- ):
- if not self.noact_mode:
- footnote = self.create_goitein_footnote(doc)
- if footnote:
- self.stats["footnote_created"] += 1
- return footnote
-
- def create_goitein_footnote(self, doc):
- """Create a new footnote for a Goitein unpublished edition"""
- source = Source.objects.filter(
- authors__last_name="Goitein",
- title_en="unpublished editions",
- source_type__type="Unpublished",
- volume__startswith=Source.get_volume_from_shelfmark(doc.shelfmark),
- ).first()
- if not source:
- self.stderr.write(
- "Error finding Goitein unpublished editions source for %s"
- % doc.shelfmark
- )
- return
-
- footnote = Footnote.objects.create(
- source=source,
- content_object=doc,
- doc_relation=Footnote.EDITION,
- )
- LogEntry.objects.log_action(
- user_id=self.script_user.id,
- content_type_id=self.footnote_contenttype.pk,
- object_id=footnote.pk,
- object_repr=str(footnote),
- change_message="Created Goitein unpublished editions footnote to sync transcription",
- action_flag=ADDITION,
- )
-
- return footnote
-
- def sync_git(self, gitrepo_url, local_path):
- """ensure git repository has been cloned and content is up to date"""
-
- # if directory does not yet exist, clone repository
- if not os.path.isdir(local_path):
- if self.verbosity >= self.v_normal:
- self.stdout.write(
- "Cloning TEI transcriptions repository to %s" % local_path
- )
- Repo.clone_from(url=gitrepo_url, to_path=local_path)
- else:
- # pull any changes since the last run
- Repo(local_path).remotes.origin.pull()
-
- def log_footnote_update(self, footnote, xmlfile):
- """create a log entry for a footnote that has been updated"""
- LogEntry.objects.log_action(
- user_id=self.script_user.id,
- content_type_id=self.footnote_contenttype.pk,
- object_id=footnote.pk,
- object_repr=str(footnote),
- change_message="Updated transcription from TEI file %s" % xmlfile,
- action_flag=CHANGE,
- )
diff --git a/geniza/corpus/management/commands/tei_to_annotation.py b/geniza/corpus/management/commands/tei_to_annotation.py
deleted file mode 100644
index 8827da3e4..000000000
--- a/geniza/corpus/management/commands/tei_to_annotation.py
+++ /dev/null
@@ -1,529 +0,0 @@
-"""
-Script to convert transcription content from PGP v3 TEI files
-to IIIF annotations in the configured annotation server.
-
-"""
-
-import glob
-import os.path
-import unicodedata
-from collections import defaultdict
-from datetime import datetime
-from functools import cached_property
-
-import requests
-from addict import Dict
-from django.conf import settings
-from django.contrib.admin.models import ADDITION, CHANGE, DELETION, LogEntry
-from django.contrib.auth.models import User
-from django.contrib.contenttypes.models import ContentType
-from django.core.management.base import BaseCommand, CommandError
-from django.db import models
-from django.template.defaultfilters import pluralize
-from django.urls import reverse
-from django.utils import timezone
-from eulxml import xmlmap
-from git import Repo
-from parasolr.django.signals import IndexableSignalHandler
-from rich.progress import MofNCompleteColumn, Progress
-
-from geniza.annotations.models import Annotation
-from geniza.common.utils import absolutize_url
-from geniza.corpus.annotation_export import AnnotationExporter
-from geniza.corpus.management.commands import sync_transcriptions
-from geniza.corpus.models import Document
-from geniza.corpus.tei_transcriptions import GenizaTei
-from geniza.footnotes.models import Footnote
-
-
-class Command(sync_transcriptions.Command):
- """Synchronize TEI transcriptions to edition footnote content"""
-
- v_normal = 1 # default verbosity
-
- missing_footnotes = []
-
- normalized_unicode = set()
-
- document_not_found = []
-
- def add_arguments(self, parser):
- parser.add_argument(
- "files", nargs="*", help="Only convert the specified files."
- )
-
- def handle(self, *args, **options):
- # get settings for remote git repository url and local path
- gitrepo_url = settings.TEI_TRANSCRIPTIONS_GITREPO
- gitrepo_path = settings.TEI_TRANSCRIPTIONS_LOCAL_PATH
-
- self.verbosity = options["verbosity"]
- # get content type and script user for log entries
- self.script_user = User.objects.get(username=settings.SCRIPT_USERNAME)
-
- # disconnect solr indexing signals
- IndexableSignalHandler.disconnect()
-
- # make sure we have latest tei content from git repository
- # (inherited from sync transcriptions command)
- self.sync_git(gitrepo_url, gitrepo_path)
- # initialize local git repo client
- self.tei_gitrepo = Repo(gitrepo_path)
-
- self.stats = defaultdict(int)
-
- xmlfiles = options["files"] or glob.glob(os.path.join(gitrepo_path, "*.xml"))
- script_run_start = timezone.now()
-
- self.stdout.write("Migrating %d TEI files" % len(xmlfiles))
-
- # when running on all files (i.e., specific files not specified),
- # clear all annotations from the database before running the migration
- # NOTE: could make this optional behavior, but it probably only
- # impacts development and not the real migration?
- if not options["files"]:
- # cheating a little here, but much faster to clear all at once
- # instead of searching and deleting one at a time
- all_annos = Annotation.objects.all()
- self.stdout.write("Clearing %d annotations" % all_annos.count())
- all_annos.delete()
-
- # initialize annotation exporter; don't push changes until the end;
- # commit message will be overridden per export to docment TEI file
- self.anno_exporter = AnnotationExporter(
- stdout=self.stdout,
- verbosity=options["verbosity"],
- push_changes=False,
- commit_msg="PGP transcription export from TEI migration",
- )
- self.anno_exporter.setup_repo()
-
- # use rich progressbar without context manager
- progress = Progress(
- MofNCompleteColumn(), *Progress.get_default_columns(), expand=True
- )
- progress.start()
- task = progress.add_task("Migrating...", total=len(xmlfiles))
-
- # iterate through tei files to be migrated
- for xmlfile in xmlfiles:
- self.stats["xml"] += 1
- # update progress at the beginning instead of end,
- # since some records are skipped
- progress.update(task, advance=1, update=True)
-
- if self.verbosity >= self.v_normal:
- self.stdout.write(xmlfile)
-
- xmlfile_basename = os.path.basename(xmlfile)
-
- tei = xmlmap.load_xmlobject_from_file(xmlfile, GenizaTei)
- # some files are stubs with no content
- # check if the tei is ok to proceed; (e.g., empty or only translation content)
- # if empty, report and skip
- if not self.check_tei(tei, xmlfile):
- self.stdout.write(
- self.style.WARNING(
- "No transcription content in %s; skipping" % xmlfile
- )
- )
- continue
- # get the document for the file based on id / old id
- doc = self.get_pgp_document(xmlfile_basename)
- # if document was not found, skip
- if not doc:
- self.stdout.write(
- self.style.WARNING("Document not found for %s; skipping" % xmlfile)
- )
- self.document_not_found.append(xmlfile)
- continue
- # found the document
- if self.verbosity >= self.v_normal:
- self.stdout.write(str(doc))
-
- # get the footnote for this file & doc
- footnote = self.get_edition_footnote(doc, tei, xmlfile)
- # if no footnote, skip for now
- # (some are missing, but will handle with data work)
- if not footnote:
- self.stdout.write(
- self.style.ERROR(
- "footnote not found for %s / %s; skipping" % (xmlfile, doc.pk)
- )
- )
- self.missing_footnotes.append(xmlfile)
- continue
- footnote = self.migrate_footnote(footnote, doc)
-
- # if there is a single primary language, use the iso code if it is set
- lang_code = None
- if doc.languages.count() == 1:
- lang_code = doc.languages.first().iso_code
-
- # get html blocks from the tei
- html_blocks = tei.text_to_html(block_format=True)
-
- # get canvas objects for the images in order; skip any non-document images
- iiif_canvases = list(doc.iiif_images(filter_side=True).keys())
- # determine the number of canvases needed based on labels
- # that indicate new pages
- # check and count any after the first; always need at least 1 canvas
- num_canvases = 1 + len(
- [
- b["label"]
- for b in html_blocks[1:]
- if tei.label_indicates_new_page(b["label"])
- ]
- )
- # in verbose mode report on available/needed canvases
- if self.verbosity > self.v_normal:
- self.stdout.write(
- "%d iiif canvases; need %d canvases for %d blocks"
- % (len(iiif_canvases), num_canvases, len(html_blocks))
- )
- # if we need more canvases than we have available,
- # generate local canvas ids
- if num_canvases > len(iiif_canvases):
- # document manifest url is /documents/pgpid/iiif/manifest/
- # create canvas uris parallel to that
- canvas_base_uri = doc.manifest_uri.replace("manifest", "canvas")
- for i in range(num_canvases - len(iiif_canvases)):
- canvas_uri = "%s%d/" % (canvas_base_uri, i + 1)
- iiif_canvases.append(canvas_uri)
-
- # NOTE: pgpid 1390 folio example; each chunk should be half of the canvas
- # (probably should be handled manually)
- # if len(html_chunks) > len(iiif_canvases):
- # self.stdout.write(
- # "%s has more html chunks than canvases; skipping" % xmlfile
- # )
- # continue
-
- # start attaching to first canvas; increment based on chunk label
- canvas_index = 0
-
- # if specific files were specified, remove annotations
- # just for those documents & sources
- if options["files"]:
- # remove all existing annotations associated with this
- # document and source so we can reimport as needed
- existing_annos = Annotation.objects.filter(
- footnote__source=footnote.source,
- footnote__content_object=doc,
- created__lt=script_run_start,
- )
- # NOTE: this is problematic for transcriptions currently
- # split across two TEI files... take care when running
- # on individual or groups of files
- if existing_annos:
- print(
- "Removing %s pre-existing annotation%s for %s on %s "
- % (
- len(existing_annos),
- pluralize(existing_annos),
- footnote.source,
- doc.manifest_uri,
- )
- )
- # not creating log entries for deletion, but
- # this should probably only come up in dev runs
- existing_annos.delete()
-
- for i, block in enumerate(html_blocks):
- # if this is not the first block and the label suggests new image,
- # increment canvas index
- if i != 0 and tei.label_indicates_new_page(block["label"]):
- canvas_index += 1
-
- anno = new_transcription_annotation()
- # get the canvas uri for this section of text
- annotation_target = iiif_canvases[canvas_index]
- anno.target.source.id = annotation_target
-
- # apply to the full canvas using % notation
- # (using nearly full canvas to make it easier to edit zones)
- anno.target.selector.value = "xywh=percent:1,1,98,98"
- # anno.selector.value = "%s#xywh=pixel:0,0,%s,%s" % (annotation_target, canvas.width, canvas.height)
-
- # add html and optional label to annotation text body
- # NOTE: not specifying language in html here because we
- # handle it in wrapping template code based on db language
-
- html = tei.lines_to_html(block["lines"])
- if not unicodedata.is_normalized("NFC", html):
- self.normalized_unicode.add(xmlfile)
- html = unicodedata.normalize("NFC", html)
- anno.body[0].value = html
-
- if block["label"]:
- # check if label text requires normalization
- if not unicodedata.is_normalized("NFC", block["label"]):
- self.normalized_unicode.add(xmlfile)
- block["label"] = unicodedata.normalize("NFC", block["label"])
- anno.body[0].label = block["label"]
-
- anno["schema:position"] = i + 1
- # print(anno) # can print for debugging
-
- # create database annotation
- db_anno = Annotation()
- db_anno.set_content(dict(anno))
- # link to digital edition footnote
- db_anno.footnote = footnote
- db_anno.save()
- # log entry to document annotation creation
- self.log_addition(db_anno, "Migrated from TEI transcription")
- self.stats["created"] += 1
-
- # export migrated transcription to backup
- self.export_transcription(doc, xmlfile_basename)
-
- progress.refresh()
- progress.stop()
-
- print(
- "Processed %(xml)d TEI file(s). \nCreated %(created)d annotation(s)."
- % self.stats
- )
-
- # push all changes from migration to github
- self.anno_exporter.sync_github()
-
- # report on missing footnotes
- if self.missing_footnotes:
- print(
- "Could not find footnotes for %s document%s:"
- % (len(self.missing_footnotes), pluralize(self.missing_footnotes))
- )
- for xmlfile in self.missing_footnotes:
- print("\t%s" % xmlfile)
-
- # report on unicode normalization
- if self.normalized_unicode:
- print(
- "Normalized unicode for %s document%s:"
- % (len(self.normalized_unicode), pluralize(self.normalized_unicode))
- )
- for xmlfile in self.normalized_unicode:
- print("\t%s" % xmlfile)
-
- if self.document_not_found:
- print(
- "Document not found for %s TEI file%s:"
- % (len(self.document_not_found), pluralize(self.document_not_found))
- )
- for xmlfile in self.normalized_unicode:
- print("\t%s" % xmlfile)
-
- # report on edition footnotes that still have content
- # (skip when unning on a specified files)
- if not options["files"]:
- self.check_unmigrated_footnotes()
-
- def get_footnote_editions(self, doc):
- # extend to return digital edition or edition
- # (digital edition if from previous run of this script)
- return doc.footnotes.filter(
- models.Q(doc_relation__contains=Footnote.EDITION)
- | models.Q(doc_relation__contains=Footnote.DIGITAL_EDITION)
- )
-
- # we shouldn't be creating new footnotes at this point...
- # override method from sync transcriptions to ensure we don't
- def is_it_goitein(self, tei, doc):
- return None
-
- def migrate_footnote(self, footnote, document):
- # convert existing edition footnote to digital edition
- # OR make a new one if the existing footnote has other information
-
- # convert existing edition footnote to digital edition
- # OR make a new one if the existing footnote has other information
-
- # if footnote is already a digital edition, nothing to be done
- # (already migrated in a previous run)
- if footnote.doc_relation == Footnote.DIGITAL_EDITION:
- return footnote
-
- # check if a digital edition footnote for this document+source exists,
- # so we can avoid creating a duplicate
- diged_footnote = document.footnotes.filter(
- doc_relation=Footnote.DIGITAL_EDITION, source=footnote.source
- ).first()
-
- # if footnote has other types or a url, we should preserve it
- if (
- set(footnote.doc_relation).intersection(
- {Footnote.TRANSLATION, Footnote.DISCUSSION}
- )
- or footnote.url
- or footnote.location
- ):
- # remove interim transcription content
- if footnote.content:
- footnote.content = None
- footnote.save()
-
- # if a digital edition footnote for this document+source exists,
- # use that instead of creating a duplicate
- if diged_footnote:
- return diged_footnote
-
- # otherwise, make a new one
- new_footnote = Footnote(
- doc_relation=Footnote.DIGITAL_EDITION, source=footnote.source
- )
- # trying to set from related object footnote.document errors
- new_footnote.content_object = document
- new_footnote.save()
- # log footnote creation and return
- self.log_addition(
- new_footnote,
- "Created new footnote for migrated digital edition",
- )
- return new_footnote
-
- # when there is no additional information on the footnote
- else:
- # if a digital edition already exists, remove this one
- if diged_footnote:
- footnote.delete()
- # log deletion and and return existing diged
- self.log_deletion(footnote, "Removing redundant edition footnote")
- return diged_footnote
-
- # otherwise, convert edition to digital edition
- footnote.doc_relation = Footnote.DIGITAL_EDITION
- footnote.content = None
- footnote.save()
- # log footnote change and return
- self.log_change(footnote, "Migrated footnote to digital edition")
- return footnote
-
- # lookup to map tei git repo usernames to pgp db username for co-author string
- teicontributor_to_username = {
- "Alan Elbaum": "ae5677",
- # multiple Bens should all map to same user
- "Ben": "benj",
- "Ben Johnston": "benj",
- "benj@princeton.edu": "benj",
- "benjohnsto": "benj",
- # no github account that I can find; just use the name
- "Brendan Goldman": "Brendan Goldman",
- "Jessica Parker": "jp0630",
- "Ksenia Ryzhova": "kryzhova",
- "Rachel Richman": "rrichman",
- "mrustow": "mrustow",
- # multiple RSKs also...
- "Rebecca Sutton Koeser": "rkoeser",
- "rlskoeser": "rkoeser",
- }
-
- @cached_property
- def tei_contrib_users(self):
- # retrieve users from database based on known tei contributor usernames,
- # and return as a dict for lookup by username
- tei_users = User.objects.filter(
- username__in=set(self.teicontributor_to_username.values())
- )
- return {u.username: u for u in tei_users}
-
- def export_transcription(self, document, xmlfile):
- # get contributors and export to git backup
-
- # get the unique list of all contributors to this file
- commits = list(self.tei_gitrepo.iter_commits("master", paths=xmlfile))
- contributors = set([c.author.name for c in commits])
- # convert bitbucket users to unique set of pgp users
- contrib_usernames = set(
- self.teicontributor_to_username[c] for c in contributors
- )
- # now get actual users for those usernames...
- contrib_users = [self.tei_contrib_users.get(u, u) for u in contrib_usernames]
-
- # export transcription for the specified document,
- # documenting the users who modified the TEI file
- self.anno_exporter.export(
- pgpids=[document.pk],
- modifying_users=contrib_users,
- commit_msg="Transcription migrated from TEI %s" % xmlfile,
- )
-
- def log_addition(self, obj, log_message):
- "create a log entry documenting object creation"
- return self.log_entry(obj, log_message, ADDITION)
-
- def log_change(self, obj, log_message):
- "create a log entry documenting object change"
- return self.log_entry(obj, log_message, CHANGE)
-
- def log_deletion(self, obj, log_message):
- "create a log entry documenting object change"
- return self.log_entry(obj, log_message, DELETION)
-
- def check_unmigrated_footnotes(self):
- unmigrated_footnotes = Footnote.objects.filter(
- doc_relation__contains=Footnote.EDITION, content__isnull=False
- )
- if unmigrated_footnotes.exists():
- self.stdout.write(
- "\n%d unmigrated footnote%s"
- % (unmigrated_footnotes.count(), pluralize(unmigrated_footnotes))
- )
- for fn in unmigrated_footnotes:
- # provide admin link to make it easier to investigate
- admin_url = absolutize_url(
- reverse("admin:footnotes_footnote_change", args=(fn.id,))
- )
- print("\t%s\t%s" % (fn, admin_url))
-
- _content_types = {}
-
- def get_content_type(self, obj):
- # lookup and cache content type for model
- model_class = obj.__class__
- if model_class not in self._content_types:
- self._content_types[model_class] = ContentType.objects.get_for_model(
- model_class
- )
- return self._content_types[model_class]
-
- def log_entry(self, obj, log_message, log_action):
- "create a log entry documenting object creation/change/deletion"
- # for this migration, we can assume user is always script user
- content_type = self.get_content_type(obj)
- return LogEntry.objects.log_action(
- user_id=self.script_user.id,
- content_type_id=content_type.pk,
- object_id=obj.pk,
- object_repr=str(obj),
- change_message=log_message,
- action_flag=log_action,
- )
-
-
-def new_transcription_annotation():
- # initialize a new annotation dict object with all the defaults set
-
- anno = Dict()
- setattr(anno, "@context", "http://www.w3.org/ns/anno.jsonld")
- anno.type = "Annotation"
- anno.body = [Dict()]
- anno.body[0].type = "TextualBody"
- # purpose on body is only needed if more than one body
- # (e.g., transcription + tags in the same annotation)
- # anno.body[0].purpose = "transcribing"
- anno.body[0].format = "text/html"
- # explicitly indicate text direction; all transcriptions are RTL
- anno.body[0].TextInput = "rtl"
- # supplement rather than painting over the image
- # multiple motivations are allowed; add transcribing as secondary motivation
- # (could use edm:transcribing from Europeana Data Model, but not sure
- # how to declare edm namespace)
- anno.motivation = ["sc:supplementing", "transcribing"]
-
- anno.target.source.type = "Canvas"
- anno.target.selector.type = "FragmentSelector"
- anno.target.selector.conformsTo = "http://www.w3.org/TR/media-frags/"
-
- return anno
diff --git a/geniza/corpus/migrations/0045_fragment_provenance.py b/geniza/corpus/migrations/0045_fragment_provenance.py
new file mode 100644
index 000000000..aaf8fbbb5
--- /dev/null
+++ b/geniza/corpus/migrations/0045_fragment_provenance.py
@@ -0,0 +1,20 @@
+# Generated by Django 3.2.23 on 2024-03-05 21:07
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("corpus", "0044_populate_fragment_old_shelfmark"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="fragment",
+ name="provenance",
+ field=models.TextField(
+ blank=True,
+ help_text="The origin and acquisition history of this fragment.",
+ ),
+ ),
+ ]
diff --git a/geniza/corpus/migrations/0046_document_events.py b/geniza/corpus/migrations/0046_document_events.py
new file mode 100644
index 000000000..a0e3fb69a
--- /dev/null
+++ b/geniza/corpus/migrations/0046_document_events.py
@@ -0,0 +1,52 @@
+# Generated by Django 3.2.23 on 2024-03-21 16:41
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("entities", "0021_event"),
+ ("corpus", "0045_fragment_provenance"),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="DocumentEventRelation",
+ fields=[
+ (
+ "id",
+ models.AutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ ("notes", models.TextField(blank=True)),
+ (
+ "document",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE,
+ to="corpus.document",
+ ),
+ ),
+ (
+ "event",
+ models.ForeignKey(
+ on_delete=django.db.models.deletion.CASCADE, to="entities.event"
+ ),
+ ),
+ ],
+ ),
+ migrations.AddField(
+ model_name="document",
+ name="events",
+ field=models.ManyToManyField(
+ related_name="documents",
+ through="corpus.DocumentEventRelation",
+ to="entities.Event",
+ verbose_name="Related Events",
+ ),
+ ),
+ ]
diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py
index c0588d606..d6b406499 100644
--- a/geniza/corpus/models.py
+++ b/geniza/corpus/models.py
@@ -44,7 +44,7 @@
)
from geniza.common.utils import absolutize_url
from geniza.corpus.annotation_utils import document_id_from_manifest_uri
-from geniza.corpus.dates import DocumentDateMixin, PartialDate
+from geniza.corpus.dates import DocumentDateMixin, PartialDate, standard_date_display
from geniza.corpus.iiif_utils import GenizaManifestImporter, get_iiif_string
from geniza.corpus.solr_queryset import DocumentSolrQuerySet
from geniza.footnotes.models import Creator, Footnote
@@ -197,6 +197,9 @@ class Fragment(TrackChangesModel):
default=False,
help_text="True if there are multiple fragments in one shelfmark",
)
+ provenance = models.TextField(
+ blank=True, help_text="The origin and acquisition history of this fragment."
+ )
notes = models.TextField(blank=True)
needs_review = models.TextField(
blank=True,
@@ -316,8 +319,9 @@ def attribution(self):
)
@property
- def provenance(self):
- """Generate a provenance statement for this fragment"""
+ @admin.display(description="Provenance from IIIF manifest")
+ def iiif_provenance(self):
+ """Generate a provenance statement for this fragment from IIIF"""
if self.manifest and self.manifest.metadata:
return get_iiif_string(self.manifest.metadata.get("Provenance", ""))
@@ -558,7 +562,12 @@ class Document(ModelIndexable, DocumentDateMixin):
default=PUBLIC,
help_text="Decide whether a document should be publicly visible",
)
-
+ events = models.ManyToManyField(
+ to="entities.Event",
+ related_name="documents",
+ verbose_name="Related Events",
+ through="DocumentEventRelation",
+ )
footnotes = GenericRelation(Footnote, related_query_name="document")
log_entries = GenericRelation(LogEntry, related_query_name="document")
@@ -632,6 +641,16 @@ def shelfmark_display(self):
associated fragments; uses :attr:`shelfmark_override` if set"""
return self.shelfmark_override or self.shelfmark
+ @property
+ @admin.display(description="Historical shelfmarks")
+ def fragment_historical_shelfmarks(self):
+ """Property to display set of all historical shelfmarks on the document"""
+ all_textblocks = self.textblock_set.all()
+ all_fragments = [tb.fragment for tb in all_textblocks]
+ return "; ".join(
+ [frag.old_shelfmarks for frag in all_fragments if frag.old_shelfmarks]
+ )
+
@property
def collection(self):
"""collection (abbreviation) for associated fragments"""
@@ -943,6 +962,8 @@ def dating_range(self):
"""
# it is unlikely, but technically possible, that a document could have both on-document
# dates and inferred datings, so find the min and max out of all of them.
+
+ # start_date and end_date are PartialDate instances
dating_range = [self.start_date or None, self.end_date or None]
# bail out if we don't have any inferred datings
@@ -951,24 +972,15 @@ def dating_range(self):
# loop through inferred datings to find min and max among all dates (including both
# on-document and inferred)
- for dating in self.dating_set.all():
+ for inferred in self.dating_set.all():
# get start from standardized date range (formatted as "date1/date2" or "date")
- split_date = dating.standard_date.split("/")
+ split_date = inferred.standard_date.split("/")
start = PartialDate(split_date[0])
- # use numeric format to compare to current min, replace if smaller
- start_numeric = int(start.numeric_format(mode="min"))
- min = dating_range[0]
- if min is None or start_numeric < int(min.numeric_format(mode="min")):
- # store as PartialDate
- dating_range[0] = start
# get end from standardized date range
end = PartialDate(split_date[1]) if len(split_date) > 1 else start
- # use numeric format to compare to current max, replace if larger
- end_numeric = int(end.numeric_format(mode="max"))
- max = dating_range[1]
- if max is None or end_numeric > int(max.numeric_format(mode="max")):
- # store as PartialDate
- dating_range[1] = end
+ dating_range = PartialDate.get_date_range(
+ old_range=dating_range, new_range=[start, end]
+ )
return tuple(dating_range)
@@ -1159,12 +1171,14 @@ def index_data(self):
# type gets matched back to DocumentType object in get_result_document, for i18n;
# should always be indexed in English
"type_s": (
- self.doctype.display_label_en
- or self.doctype.name_en
- or str(self.doctype)
- )
- if self.doctype
- else "Unknown type",
+ (
+ self.doctype.display_label_en
+ or self.doctype.name_en
+ or str(self.doctype)
+ )
+ if self.doctype
+ else "Unknown type"
+ ),
# use english description for now
"description_en_bigram": strip_tags(self.description_en),
"notes_t": self.notes or None,
@@ -1187,12 +1201,12 @@ def index_data(self):
"document_dating_dr": self.solr_dating_range(),
# historic date, for searching
# start/end of document date or date range
- "start_date_i": self.start_date.numeric_format()
- if self.start_date
- else None,
- "end_date_i": self.end_date.numeric_format(mode="max")
- if self.end_date
- else None,
+ "start_date_i": (
+ self.start_date.numeric_format() if self.start_date else None
+ ),
+ "end_date_i": (
+ self.end_date.numeric_format(mode="max") if self.end_date else None
+ ),
# library/collection possibly redundant?
"collection_ss": [str(f.collection) for f in fragments],
"tags_ss_lower": [t.name for t in self.tags.all()],
@@ -1712,3 +1726,19 @@ class Meta:
notes = models.TextField(
help_text="Optional further details about the rationale",
)
+
+ @property
+ def standard_date_display(self):
+ """Standard date in human-readable format for document details pages"""
+ return standard_date_display(self.standard_date)
+
+
+class DocumentEventRelation(models.Model):
+ """A relationship between a document and an event"""
+
+ document = models.ForeignKey(Document, on_delete=models.CASCADE)
+ event = models.ForeignKey("entities.Event", on_delete=models.CASCADE)
+ notes = models.TextField(blank=True)
+
+ def __str__(self):
+ return f"Document-Event relation: {self.document} and {self.event}"
diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
index 0135860d6..9983ea21c 100644
--- a/geniza/corpus/solr_queryset.py
+++ b/geniza/corpus/solr_queryset.py
@@ -1,3 +1,4 @@
+import itertools
import re
from bs4 import BeautifulSoup
@@ -12,6 +13,23 @@
def clean_html(html_snippet):
"""utility method to clean up html, since solr snippets of html content
may result in non-valid content"""
+
+ # if this snippet starts with a line that includes a closing but no opening,
+ # try to append the opening
(and an ellipsis to show incompleteness)
+ incomplete_line = re.match(r"^(?!
{ellipsis}{html_snippet}"
+
return BeautifulSoup(html_snippet, "html.parser").prettify(formatter="minimal")
@@ -57,6 +75,9 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
"has_digital_edition": "has_digital_edition_b",
"has_digital_translation": "has_digital_translation_b",
"has_discussion": "has_discussion_b",
+ "old_shelfmark": "old_shelfmark_bigram",
+ "transcription_nostem": "transcription_nostem",
+ "description_nostem": "description_nostem",
}
# regex to convert field aliases used in search to actual solr fields
@@ -90,7 +111,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
# beginning/end of the string or after/before a space, and not followed by a
# tilde for fuzzy/proximity search (non-greedy to prevent matching the entire
# string if there are multiple sets of doublequotes)
- re_exact_match = re.compile(r'(? "pgpid_i:950 shelfmark_t:ena")
@@ -140,15 +177,25 @@ def _search_term_cleanup(self, search_term):
def admin_search(self, search_term):
# remove " + " from search string to allow searching on shelfmark joins
- return self.search(self.admin_doc_qf).raw_query_parameters(
- doc_query=self._search_term_cleanup(search_term)
- )
+ doc_query = self._search_term_cleanup(search_term)
+ query_params = {"doc_query": doc_query}
+ # nested edismax query no longer works since solr 7.2
+ # https://solr.apache.org/guide/7_2/solr-upgrade-notes.html#solr-7-2
+ if "{!type=edismax" in doc_query:
+ query_params.update({"uf": "* _query_"})
+
+ return self.search(self.admin_doc_qf).raw_query_parameters(**query_params)
keyword_search_qf = "{!type=edismax qf=$keyword_qf pf=$keyword_pf v=$keyword_query}"
def keyword_search(self, search_term):
+ keyword_query = self._search_term_cleanup(search_term)
+ query_params = {"keyword_query": keyword_query}
+ # nested edismax query no longer works since solr 7.2 (see above)
+ if "{!type=edismax" in keyword_query:
+ query_params.update({"uf": "* _query_"})
search = self.search(self.keyword_search_qf).raw_query_parameters(
- keyword_query=self._search_term_cleanup(search_term)
+ **query_params
)
# if search term cleanup identifies any exact phrase searches,
# pass the unmodified search to Solr as a highlighting query,
@@ -212,8 +259,17 @@ def get_highlighting(self):
"""highlight snippets within transcription/translation html may result in
invalid tags that will render strangely; clean up the html before returning"""
highlights = super().get_highlighting()
+ is_exact_search = "hl_query" in self.raw_params
for doc in highlights.keys():
- if "transcription" in highlights[doc]:
+ # _nostem fields should take precedence over stemmed fields in the case of an
+ # exact search; in that case, replace highlights for stemmed fields with nostem
+ if is_exact_search and "description_nostem" in highlights[doc]:
+ highlights[doc]["description"] = highlights[doc]["description_nostem"]
+ if is_exact_search and "transcription_nostem" in highlights[doc]:
+ highlights[doc]["transcription"] = [
+ clean_html(s) for s in highlights[doc]["transcription_nostem"]
+ ]
+ elif "transcription" in highlights[doc]:
highlights[doc]["transcription"] = [
clean_html(s) for s in highlights[doc]["transcription"]
]
diff --git a/geniza/corpus/tei_transcriptions.py b/geniza/corpus/tei_transcriptions.py
deleted file mode 100644
index ed3bb5d11..000000000
--- a/geniza/corpus/tei_transcriptions.py
+++ /dev/null
@@ -1,266 +0,0 @@
-from eulxml import xmlmap
-from eulxml.xmlmap import teimap
-
-from geniza.common.utils import simplify_quotes
-
-
-class GenizaTeiLine(teimap.TeiLine):
- name = xmlmap.StringField("local-name(.)")
- lang = xmlmap.StringField("@xml:lang|tei:span/@xml:lang")
- number = xmlmap.StringField("@n")
-
-
-class MainText(teimap.TeiDiv):
- lines = xmlmap.NodeListField("tei:l|tei:label", GenizaTeiLine)
-
-
-class GenizaTei(teimap.Tei):
- # extend eulxml TEI to add mappings for the fields we care about
- # NOTE: at least one pgpid is in format ### + ###
- pgpid = xmlmap.IntegerField('tei:teiHeader//tei:idno[@type="PGP"]')
- # normally main text content is under text/body/div; but at least one document has no div
- text = xmlmap.NodeField(
- "tei:text/tei:body/tei:div|tei:text/tei:body[not(tei:div)]", MainText
- )
- lines = xmlmap.NodeListField("tei:text/tei:body/tei:div/tei:l", GenizaTeiLine)
- labels = xmlmap.NodeListField(
- "tei:text/tei:body/tei:div/tei:label", GenizaTeiLine
- ) # not really a line...
- # source description sometimes contains reference to scholarship record
- source = xmlmap.NodeListField(
- "tei:teiHeader//tei:sourceDesc/tei:msDesc/tei:msContents/tei:p", GenizaTeiLine
- )
- # for documents with more than one transcription, authors have been
- # tagged with last name in n attribute to allow identifying/differentiating
- source_authors = xmlmap.StringListField(
- "tei:teiHeader//tei:sourceDesc//tei:author/@n"
- )
-
- def no_content(self):
- return str(self.text).strip() == ""
-
- # text that generally indicates a new page/image, anywhere in the label
- new_page_indicators = [
- "recto",
- "verso",
- "side ii",
- "page b",
- "page 2",
- "page two",
- 'ע"ב', # Hebrew label for page 2
- ]
- # text that indicates a new page/image at the start of the label
- new_page_start_indicators = ["t-s ", "ts ", "ena ", "moss. "]
-
- def label_indicates_new_page(self, label):
- label = simplify_quotes(label.lower())
- return any(
- [side_label in label for side_label in self.new_page_indicators]
- ) or any(
- label.startswith(start_label)
- for start_label in self.new_page_start_indicators
- )
-
- def labels_only(self):
- text_content = str(self.text).strip()
- label_content = " ".join([str(label).strip() for label in self.labels])
- return text_content == label_content
-
- def text_to_html(self, block_format=False):
- # convert the TEI text content to basic HTML
- blocks = []
- lines = []
- label = []
- # because blocks are indicated by labels without containing elements,
- # iterate over all lines and create blocks based on the labels
- for line in self.text.lines:
- if line.name == "label":
- # append current text block if set, and initialize a new one
- if lines:
- blocks.append(
- {
- "label": "\n".join(label),
- "lines": lines,
- # "languages": list(languages),
- }
- )
- label = []
- lines = []
-
- # store the label; sometimes there are two in a row
- label.append(str(line))
-
- elif line.name == "l":
- # use language codes? unreliable in the xml
- # append tuple of line number, text
- # return empty string for line number if no line attribute
- lines.append((line.number or "", str(line)))
-
- # append the last block
- if lines:
- blocks.append(
- {
- "label": "\n".join(label),
- "lines": lines,
- }
- )
-
- # if block format requested, return blocks without further processing
- if block_format:
- return blocks
-
- # otherwise, return chunked HTML
- return self.chunk_html(blocks)
-
- def chunk_html(self, blocks):
- # combine blocks of text into html, chunked into pages to match sides of images
- html = []
- page = []
- for block in blocks:
-
- # if there is a label and it looks like a new side,
- # start a new section
- if block["label"]:
- if self.label_indicates_new_page(block["label"]):
- # if we have any content, close the previous section
- if page:
- # combine all sections in the page and add to the html
- html.append("\n".join(page))
- # then start a new page
- page = []
-
- # start output for the new block
- output = [""]
- # add label if we have one
- if block["label"]:
- output.append(f"
{block['label']}
")
-
- output.append(self.lines_to_html(block["lines"]))
- output.append("")
- page.append("\n".join(output))
-
- # save the last page
- html.append("\n".join(page))
-
- return html
-
- def lines_to_html(self, lines):
- """Convert lines and line numbers from TEI to HTML, accounting
- for unnumbered lines and lines starting with numbers other than 1.
- Converts to ordered lists and paragraphs; ordered lists have
- start attribute when needed.
-
- :params lines: list of tuples of line number, line text
- :returns: string of html content
- """
-
- html_lines = []
- list_num = 1
- in_list = False
- for line_number, line in lines:
- # convertline number to integer for comparison
- if line_number:
- try:
- line_number = int(line_number)
- except ValueError:
- # in at least one instance, line number is a range "16-17"
- # ignore the problem (??)
- if "-" in line_number:
- line_number = int(line_number.split("-")[0])
-
- # if line is empty, skip it
- if not line.strip():
- continue
-
- # if line is unnumberred, output as a paragraph
- if not line_number:
- # if we were in a list, close it
- if in_list:
- html_lines.append("
" % line)
-
- # if line number is 1, start a new list
- elif line_number == 1:
- # close any preceeding list
- if in_list:
- html_lines.append("")
-
- in_list = True
- list_num = 1
- html_lines.append("")
- html_lines.append("
%s
" % line)
- # if the line number matches expected next value, output as line
- elif line_number == list_num:
- html_lines.append("
%s
" % line)
-
- # if line number does not match expected list number,
- # start a new list with start attribute specified
- else:
- # close existing list if any
- if in_list:
- html_lines.append("")
-
- # start a new list with the specified number IF numeric
- if isinstance(line_number, int):
- list_num = line_number
- in_list = True
- html_lines.append('' % line_number)
- html_lines.append("
%s
" % line)
- else:
- # if not numeric, we can't use as line number or start
- html_lines.append("")
- # add the n to text to preserve the content
- html_lines.append("
%s %s
" % (line_number, line))
-
- # increment expected list number if we're inside a list
- if in_list:
- list_num += 1
-
- # close the last list, if active
- if in_list:
- html_lines.append("")
-
- return "\n".join(html_lines)
-
- rtl_mark = "\u200F"
- ltr_mark = "\u200E"
-
- def text_to_plaintext(self):
- lines = []
- # because blocks are indicated by labels without containing elements,
- # iterate over all lines and create blocks based on the labels
-
- # errors if there are no lines; sync transcription now checks
- # and won't call in that case
- if not self.text.lines:
- return
-
- # determine longest line so we can pad the text
- longest_line = max(len(str(line)) for line in self.text.lines)
- # some files have descriptions that are making lines much too long,
- # so set a limit on line length
- if longest_line > 100:
- longest_line = 100
- for line in self.text.lines:
- if line.name == "label":
- # blank line to indicate breaks between blocks
- lines.append("")
- lines.append("%s%s" % (self.ltr_mark, line))
- elif line.name == "l":
- line_num = line.number or ""
- # combine line text with line number and right justify;
- # right justify line number
- lines.append(
- " ".join(
- [
- self.rtl_mark,
- str(line).rjust(longest_line),
- self.ltr_mark,
- line_num.rjust(3),
- ]
- )
- )
-
- return "\n".join(lines)
diff --git a/geniza/corpus/templates/corpus/document_detail.html b/geniza/corpus/templates/corpus/document_detail.html
index f4976e588..18d629c0c 100644
--- a/geniza/corpus/templates/corpus/document_detail.html
+++ b/geniza/corpus/templates/corpus/document_detail.html
@@ -74,6 +74,23 @@
{% endif %}
+ {% if document.dating_set.exists %}
+
{# Translators: Primary language label #}
@@ -117,13 +134,26 @@
{% translate 'Tags' %}
{% endif %}
-
- {# Translators: Label for date document was first added to the PGP #}
-
{% translate 'Input date' %}
- {# Translators: Date document was first added to the PGP #}
- {% blocktranslate with date=document.log_entries.last.action_time.year %}
- In PGP since {{ date }}
- {% endblocktranslate %}
+
+
+ {% if document.fragment_historical_shelfmarks %}
+ {# Translators: label for historical/old shelfmarks on document fragments #}
+
{% translate 'Historical shelfmarks' %}
+
{{ document.fragment_historical_shelfmarks }}
+ {% endif %}
+ {# Translators: Label for date document was first added to the PGP #}
+
{% translate 'Input date' %}
+
+ {# Translators: Date document was first added to the PGP #}
+ {% blocktranslate with date=document.log_entries.last.action_time.year %}
+ In PGP since {{ date }}
+ {% endblocktranslate %}
+
+
@@ -140,13 +170,14 @@
{# tertiary metadata #}
-
+ {# Translators: label for permanent link to a document #}
+ {% translate 'Permalink' as permalink %}
+
{# aria hidden because redundant for screen readers #}
- {# Translators: label for permanent link to a document #}
- {% translate 'Permalink' %}
+ {{ permalink }}
{% include "corpus/snippets/footnote_location.html" %}
+ {% endifchanged %}
{% endfor %}
diff --git a/geniza/corpus/templates/corpus/snippets/document_image_rights.html b/geniza/corpus/templates/corpus/snippets/document_image_rights.html
index 6e99ccf2b..c7a99f049 100644
--- a/geniza/corpus/templates/corpus/snippets/document_image_rights.html
+++ b/geniza/corpus/templates/corpus/snippets/document_image_rights.html
@@ -12,12 +12,12 @@
{% endwith %}
{% if fragment.attribution %}
{{ fragment.attribution }}
- {% elif fragment.provenance %}
- {{ fragment.provenance }}
+ {% elif fragment.iiif_provenance %}
+ {{ fragment.iiif_provenance }}
{% endif %}
{% if fragment.manifest.license %}
{% include "corpus/snippets/fragment_license_statement.html" %}
- {% elif not fragment.attribution and not fragment.provenance %}
+ {% elif not fragment.attribution and not fragment.iiif_provenance %}
{% translate "No attribution or license noted." %}
{% endif %}
@@ -25,15 +25,15 @@
{% endfor %}
{% regroup document.fragments.all by manifest.logo as logos_list %}
- {% regroup document.fragments.all by provenance as provenance_list %}
+ {% regroup document.fragments.all by iiif_provenance as provenance_list %}
{% for logo in logos_list %}
{% endfor %}
- {% for provenance in provenance_list %}
- {% if "JTS" in provenance.grouper %}
+ {% for iiif_provenance in provenance_list %}
+ {% if "JTS" in iiif_provenance.grouper %}
+
+ {# Translators: screen-reader label for "view document details" link #}
+ {% blocktranslate asvar aria_view_details with document_label=document|get_document_label %}View details for {{ document_label }}{% endblocktranslate %}
+
+ {% translate 'View document details' %}
+
+
{% endspaceless %}
diff --git a/geniza/corpus/templates/corpus/snippets/document_transcription.html b/geniza/corpus/templates/corpus/snippets/document_transcription.html
index dad06cae4..edc6a1268 100644
--- a/geniza/corpus/templates/corpus/snippets/document_transcription.html
+++ b/geniza/corpus/templates/corpus/snippets/document_transcription.html
@@ -4,14 +4,14 @@
{% elif perms.corpus.change_document %}
{% for ed in document.digital_editions.all %}
{# TODO: Make machine learning model editions editable once line-level editing implemented #}
- {% if not 'model' in ed.source.source_type.type %}
-
-
+
+
+ {% if not 'model' in ed.source.source_type.type %}
Edit {% for auth in ed.source.authorship_set.all %}{% include "snippets/comma.html" %}{{ auth.creator.last_name }}{% empty %}[unknown]{% endfor %}'s edition
-
- {% else %}
- Machine-generated transcription present, but not currently editable
- {% endif %}
+ {% else %}
+ Edit machine-generated transcription ({{ ed.source.title }})
+ {% endif %}
+
{% endfor %}
{% for tr in document.digital_translations.all %}
@@ -190,7 +190,7 @@