Merge branch 'develop' into experiment/regex-search

Princeton-CDH · Aug 27, 2024 · ca561b1 · ca561b1
2 parents 7733a06 + 3b566f9
commit ca561b1
Show file tree

Hide file tree

Showing 81 changed files with 1,760 additions and 742 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,11 @@
 Change Log
 ==========
 
+4.17.3
+------
+
+- chore: Use self-hosted tinyMCE
+
 4.17.2
 ------
 

diff --git a/README.rst b/README.rst
@@ -42,9 +42,11 @@ Python 3.9 / Django 3.2 / Node 16 / Postgresql / Solr 9.2
     :target: https://github.com/Princeton-CDH/geniza/actions/workflows/sphinx_docs.yml
     :alt: Sphinx docs build status
 
-Technical documentation is available at https://princeton-cdh.github.io/geniza/
+Technical documentation is available at https://princeton-cdh.github.io/geniza/.
 
-For developer instructions, see DEVNOTES.
+For developer instructions, see `DEVNOTES <DEVNOTES.rst>`_.
+
+To access the data from the project, see https://github.com/princetongenizalab/.
 
 License
 -------

diff --git a/geniza/common/management/commands/visual_tests.py b/geniza/common/management/commands/visual_tests.py
@@ -33,15 +33,7 @@ def take_snapshots(self, browser, dark_mode=False):
 
         # turn on dark mode, save in local storage
         if dark_mode:
-            # handle separate theme toggles for mobile and desktop
-            try:
-                browser.find_element(
-                    By.CSS_SELECTOR, "#theme-toggle-desktop"
-                ).send_keys(Keys.ENTER)
-            except (ElementNotVisibleException, ElementNotInteractableException):
-                browser.find_element(By.CSS_SELECTOR, "#theme-toggle-mobile").send_keys(
-                    Keys.ENTER
-                )
+            browser.find_element(By.CSS_SELECTOR, "#theme-toggle").send_keys(Keys.ENTER)
             dark_mode_str = " (dark mode)"
 
         # homepage
@@ -79,22 +71,22 @@ def take_snapshots(self, browser, dark_mode=False):
         )
         percy_snapshot(browser, "Content Page%s" % dark_mode_str)
 
-        # document search with document type filter expanded
+        # document search with filters expanded
         browser.get(
-            "http://localhost:8000/en/documents/?per_page=2&sort=scholarship_desc#filters"
+            "http://localhost:8000/en/documents/?per_page=2&sort=scholarship_desc"
         )
-        # open document type filter programatically
-        doctype_filter = browser.find_element(By.CSS_SELECTOR, ".doctype-filter")
-        browser.execute_script("arguments[0].open = true", doctype_filter)
-        # click the first option
+        # expand filters
+        filter_button = browser.find_element(By.CSS_SELECTOR, "a#filters-button")
+        browser.execute_script("arguments[0].classList.add('open')", filter_button)
+        filters = browser.find_element(By.CSS_SELECTOR, "fieldset#filters")
+        browser.execute_script("arguments[0].ariaExpanded = true", filters)
+        # click the first doctype filter option
         browser.find_element(
-            By.CSS_SELECTOR, ".doctype-filter li:nth-child(1) label"
+            By.CSS_SELECTOR, "#id_doctype li:nth-child(1) label"
         ).click()
-        filter_modal_css = "fieldset#filters { display: flex !important; }"
         percy_snapshot(
             browser,
             "Document Search filter%s" % dark_mode_str,
-            percy_css=filter_modal_css,
         )
 
         # document search

diff --git a/geniza/corpus/forms.py b/geniza/corpus/forms.py
@@ -225,6 +225,13 @@ class DocumentSearchForm(RangeForm):
         ),
     )
 
+    exclude_inferred = forms.BooleanField(
+        # Translators: label for "exclude inferred dates" search form filter
+        label=_("Exclude inferred dates"),
+        required=False,
+        widget=forms.CheckboxInput,
+    )
+
     doctype = FacetChoiceField(
         # Translators: label for document type search form filter
         label=_("Document type"),

diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py
@@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 from collections import defaultdict
@@ -1133,7 +1134,11 @@ def items_to_index(cls):
                 "textblock_set",
                 queryset=TextBlock.objects.select_related(
                     "fragment", "fragment__collection", "fragment__manifest"
-                ).prefetch_related("fragment__manifest__canvases"),
+                ).prefetch_related(
+                    "fragment__manifest__canvases",
+                    "fragment__textblock_set",
+                    "fragment__textblock_set__document",
+                ),
             ),
             Prefetch(
                 "footnotes",
@@ -1165,7 +1170,11 @@ def prep_index_chunk(cls, chunk):
                 "textblock_set",
                 queryset=TextBlock.objects.select_related(
                     "fragment", "fragment__collection", "fragment__manifest"
-                ).prefetch_related("fragment__manifest__canvases"),
+                ).prefetch_related(
+                    "fragment__manifest__canvases",
+                    "fragment__textblock_set",
+                    "fragment__textblock_set__document",
+                ),
             ),
             Prefetch(
                 "footnotes",
@@ -1188,6 +1197,15 @@ def index_data(self):
         # get fragments via textblocks for correct order
         # and to take advantage of prefetching
         fragments = [tb.fragment for tb in self.textblock_set.all()]
+
+        # get related documents: other textblocks on this document's fragments
+        other_textblocks_docs = [
+            f.textblock_set.exclude(document__pk=self.pk).values_list(
+                "document__pk", flat=True
+            )
+            for f in fragments
+        ]
+        related_document_pks = set(itertools.chain.from_iterable(other_textblocks_docs))
         # filter by side so that search results only show the relevant side image(s)
         images = self.iiif_images(filter_side=True).values()
         index_data.update(
@@ -1220,6 +1238,10 @@ def index_data(self):
                 ),
                 # combined original/standard document date for display
                 "document_date_t": strip_tags(self.document_date) or None,
+                # inferred document date for display
+                "document_dating_t": standard_date_display(
+                    "/".join([d.isoformat() for d in self.dating_range() if d])
+                ),
                 # date range for filtering
                 "document_date_dr": self.solr_date_range(),
                 # date range for filtering, but including inferred datings if any exist
@@ -1232,6 +1254,17 @@ def index_data(self):
                 "end_date_i": (
                     self.end_date.numeric_format(mode="max") if self.end_date else None
                 ),
+                # start/end of document date or date range, including inferred datings, for sort
+                "start_dating_i": (
+                    self.dating_range()[0].numeric_format()
+                    if self.dating_range()[0]
+                    else None
+                ),
+                "end_dating_i": (
+                    self.dating_range()[1].numeric_format(mode="max")
+                    if self.dating_range()[1]
+                    else None
+                ),
                 # library/collection possibly redundant?
                 "collection_ss": [str(f.collection) for f in fragments],
                 "tags_ss_lower": [t.name for t in self.tags.all()],
@@ -1252,6 +1285,7 @@ def index_data(self):
                 "has_image_b": len(images) > 0,
                 "people_count_i": self.persondocumentrelation_set.count(),
                 "places_count_i": self.documentplacerelation_set.count(),
+                "documents_count_i": len(related_document_pks),
             }
         )
 

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
@@ -54,6 +54,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
         "shelfmark": "shelfmark_s",  # string version for display
         "shelfmarks": "fragment_shelfmark_ss",
         "document_date": "document_date_t",  # text version for search & display
+        "document_dating": "document_dating_t",  # inferred date for display
         "original_date_t": "original_date",
         "collection": "collection_ss",
         "tags": "tags_ss_lower",
@@ -89,6 +90,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
         "description_nostem": "description_nostem",
         "related_people": "people_count_i",
         "related_places": "places_count_i",
+        "related_documents": "documents_count_i",
         "transcription_regex": "transcription_regex",
     }
 
@@ -129,6 +131,9 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
     # to use as highlighting query
     highlight_query = None
 
+    # if the search consists only of a quoted phrase scoped to shelfmark, handle it separately
+    shelfmark_query = None
+
     def _search_term_cleanup(self, search_term):
         # adjust user search string before sending to solr
 
@@ -179,6 +184,12 @@ def _search_term_cleanup(self, search_term):
             search_term = search_term.replace(
                 "%s:" % self.shelfmark_qf, self.shelfmark_qf
             )
+            # special case: just a shelfmark query, in quotes
+            quoted_shelfmark_query = re.fullmatch(
+                rf'{re.escape(self.shelfmark_qf)}".+?"', search_term
+            )
+            if quoted_shelfmark_query:
+                self.shelfmark_query = quoted_shelfmark_query.group(0)
 
         return search_term
 
@@ -206,9 +217,14 @@ def keyword_search(self, search_term):
         # nested edismax query no longer works since solr 7.2 (see above)
         if "{!type=edismax" in keyword_query:
             query_params.update({"uf": "* _query_"})
-        search = self.search(self.keyword_search_qf).raw_query_parameters(
-            **query_params
-        )
+        # if search term consists only of a shelfmark query in quotes, only search shelfmark fields
+        if self.shelfmark_query:
+            search = self.search(self.shelfmark_query)
+        else:
+            # otherwise, search all fields as usual
+            search = self.search(self.keyword_search_qf).raw_query_parameters(
+                **query_params
+            )
         # if search term cleanup identifies any exact phrase searches,
         # pass the unmodified search to Solr as a highlighting query,
         # since otherwise the highlighted fields (description/transcription)
@@ -241,7 +257,7 @@ def related_to(self, document):
         "Return documents related to the given document (i.e. shares any shelfmarks)"
 
         # NOTE: using a string query filter because parasolr queryset
-        # # currently doesn't provide any kind of not/exclude filter
+        # currently doesn't provide any kind of not/exclude filter
         return (
             self.filter(status=document.PUBLIC_LABEL)
             .filter("NOT pgpid_i:%d" % document.id)
@@ -276,18 +292,6 @@ def get_result_document(self, doc):
             _("Unknown type"),
         )
 
-        if doc.get("shelfmarks"):
-            doc["related_documents"] = (
-                DocumentSolrQuerySet()
-                .filter("NOT pgpid_i:%d" % doc["pgpid"])
-                .filter(
-                    fragment_shelfmark_ss__in=[
-                        '"%s"' % shelfmark for shelfmark in doc["shelfmarks"]
-                    ]
-                )
-                .count()
-            )
-
         return doc
 
     def get_regex_highlight(self, text):

diff --git a/geniza/corpus/templates/corpus/document_list.html b/geniza/corpus/templates/corpus/document_list.html
@@ -73,6 +73,11 @@ <h2>
                     {# NOTE: stimulus action is configured via django widget attrs #}
                     {{ form.docdate }}
                 </label>
+                <label for="{{ form.exclude_inferred.auto_id }}">
+                    {% render_field form.exclude_inferred data-action="search#update" %}
+                    <span>{{ form.exclude_inferred.label }}</span>
+                    <div class="thumb" aria-hidden="true"></div>
+                </label>
             </div>
             <fieldset class="includes-fields">
                 <legend><span class="fieldname">{% translate "Includes" %}</span></legend>

diff --git a/geniza/corpus/templates/corpus/snippets/document_result.html b/geniza/corpus/templates/corpus/snippets/document_result.html
@@ -7,14 +7,10 @@
                 Relevance: {{ document.score|default:0 }}
             </span>
         {% endif %}
-        {# result number #}
-        <span class="counter">
-            {% with page_obj.start_index|default:1 as start_adjust %}
-                {{ forloop.counter0|add:start_adjust }}
-            {% endwith %}
-        </span>
         {# title #}
         <h2 class="title">
+            {# result number #}
+            <span class="counter">{% with page_obj.start_index|default:1 as start_adjust %}{{ forloop.counter0|add:start_adjust }}{% endwith %}</span>
             {# type and shelfmark #}
             <span class="doctype">{{ document.type }}</span>
             <span class="shelfmark">{{ document.shelfmark|shelfmark_wrap }}</span>
@@ -31,6 +27,16 @@ <h2 class="title">
                             {{ document.document_date.0 }} {# indexed as _t which is a multival field #}
                         </time>
                     </dd>
+                {% elif document.document_dating %}
+                    <dt>
+                        {# Translators: label for inferred date on a document #}
+                        {% translate "Inferred date" %}
+                    </dt>
+                    <dd>
+                        <time>
+                            {{ document.document_dating.0 }} {# indexed as _t which is a multival field #}
+                        </time>
+                    </dd>
                 {% endif %}
                 {% if document.languages|length %}
                     <dt>
@@ -198,10 +204,10 @@ <h3 class="sr-only">{% translate 'Tags' %}</h3>
             </dl>
             <dl class="additional-metadata">
                 {% with document_highlights=highlighting|dict_item:document.id %}
-                    {% if document_highlights.old_shelfmark and document_highlights.old_shelfmark.0 %}
+                    {% if document_highlights.old_shelfmark %}
                         {# Translators: label for historical/old shelfmark on document fragments #}
                         <dt>{% translate "Historical shelfmark" %}</dt>
-                        <dd>{{ document_highlights.old_shelfmark.0|striptags }}</dd>
+                        <dd>{{ document_highlights.old_shelfmark|striptags }}</dd>
                     {% endif %}
                 {% endwith %}
             </dl>

diff --git a/geniza/corpus/tests/test_corpus_solrqueryset.py b/geniza/corpus/tests/test_corpus_solrqueryset.py
@@ -202,6 +202,37 @@ def test_search_term_cleanup__exact_match_regex(self):
             == '(description_nostem:"מרכב אלצלטאן" OR transcription_nostem:"מרכב אלצלטאן") AND (description_nostem:"אלמרכב אלצלטאן" OR transcription_nostem:"אלמרכב אלצלטאן")'
         )
 
+    def test_search_term_cleanup__quoted_shelfmark_only(self):
+        dqs = DocumentSolrQuerySet()
+        # double quoted shelfmark-only search should populate dqs.shelfmark_query
+        dqs._search_term_cleanup('shelfmark:"T-S NS"')
+        assert "T-S NS" in dqs.shelfmark_query
+
+        # otherwise dqs.shelfmark_query should remain unset
+        dqs = DocumentSolrQuerySet()
+        assert "T-S NS" in dqs._search_term_cleanup(
+            'tag:"marriage payment" shelfmark:"T-S NS"'
+        )
+        assert not dqs.shelfmark_query
+        assert "NS" in dqs._search_term_cleanup("shelfmark:NS")
+        assert not dqs.shelfmark_query
+
+    def test_keyword_search__quoted_shelfmark(self):
+        dqs = DocumentSolrQuerySet()
+        with patch.object(dqs, "search") as mocksearch:
+            # only quoted shelfmark: should only search on shelfmark fields
+            dqs.keyword_search('shelfmark:"T-S NS"')
+            mocksearch.assert_called_with(dqs.shelfmark_query)
+            mocksearch.return_value.raw_query_parameters.assert_not_called()
+
+        dqs = DocumentSolrQuerySet()
+        with patch.object(dqs, "search") as mocksearch:
+            # otherwise should search as normal
+            dqs.keyword_search('tag:"marriage payment" shelfmark:"T-S NS"')
+            mocksearch.return_value.raw_query_parameters.assert_called()
+            dqs.keyword_search("shelfmark:NS")
+            mocksearch.return_value.raw_query_parameters.assert_called()
+
     def test_related_to(self, document, join, fragment, empty_solr):
         """should give filtered result: public documents with any shared shelfmarks"""