Merge pull request #1588 from Princeton-CDH/experiment/regex-search

RegEx search (#1631)
Princeton-CDH · Sep 5, 2024 · 415f707 · 415f707
2 parents 3b566f9 + d8c1da7
commit 415f707
Show file tree

Hide file tree

Showing 19 changed files with 613 additions and 36 deletions.
diff --git a/DEPLOYNOTES.md b/DEPLOYNOTES.md
@@ -1,5 +1,10 @@
 # Deploy Notes
 
+## 4.18
+
+-   Solr configuration has changed. Ensure Solr configset has been updated
+    and then reindex all content: `python manage.py index`
+
 ## 4.17
 
 -   Solr configuration has changed. Ensure Solr configset has been updated

diff --git a/geniza/corpus/forms.py b/geniza/corpus/forms.py
@@ -1,3 +1,5 @@
+import re
+
 from dal import autocomplete
 from django import forms
 from django.contrib.admin.widgets import RelatedFieldWidgetWrapper
@@ -196,6 +198,13 @@ class DocumentSearchForm(RangeForm):
     # Translators: label for end year when filtering by date range
     _("To year")
 
+    MODE_CHOICES = [
+        # Translators: label for general search mode
+        ("general", _("General")),
+        # Translators: label for regex (regular expressions) search mode
+        ("regex", _("RegEx")),
+    ]
+
     # NOTE these are not set by default!
     error_css_class = "error"
     required_css_class = "required"
@@ -244,6 +253,14 @@ class DocumentSearchForm(RangeForm):
         label=_("Discussion"),
     )
 
+    mode = forms.ChoiceField(
+        # Translators: label for "search mode" (general or regex)
+        label=_("Search mode"),
+        choices=MODE_CHOICES,
+        required=False,
+        widget=forms.RadioSelect,
+    )
+
     # mapping of solr facet fields to form input
     solr_facet_fields = {
         "type": "doctype",
@@ -319,6 +336,61 @@ def clean(self):
             self.add_error(
                 "q", _("Relevance sort is not available without a keyword search term.")
             )
+        # additional validation for regex mode due to some queries that cause Lucene errors
+        mode = cleaned_data.get("mode")
+        # NOTE(review): q is assumed to be the cleaned keyword query set earlier in
+        # this method; skip the checks when it is empty/None, since none can apply
+        if mode == "regex" and q:
+            # reused text about needing an escape character; backslashes are doubled
+            # so the literal contains no invalid escape sequences (output unchanged)
+            needs_escape = (
+                lambda char: f"If you are searching for the character {char} in a transcription, escape it with \\ by writing \\{char} instead."
+            )
+            # see error messages for explanations of each regex here.
+            # the escape hint is interpolated *after* the gettext call so that the
+            # message looked up for translation is the constant template string
+            if re.search(r"((?<!\\)\{[^0-9])|(^\{)|((?<!\\)\{[^\}]*$)", q):
+                self.add_error(
+                    "q",
+                    # Translators: error message for malformed curly brace in regular expression
+                    _(
+                        "Regular expression cannot contain { without a preceding character, without an integer afterwards, or without a closing }. %s"
+                    )
+                    % needs_escape("{"),
+                )
+            if re.search(r"(^\*)|((?<!\\)\*\*)", q):
+                self.add_error(
+                    "q",
+                    # Translators: error message for malformed asterisk in regular expression
+                    _(
+                        "Regular expression cannot contain * without a preceding character, or multiple times in a row. %s"
+                    )
+                    % needs_escape("*"),
+                )
+            if re.search(r"(^\+)|((?<!\\)\+\+)", q):
+                self.add_error(
+                    "q",
+                    # Translators: error message for malformed plus sign in regular expression
+                    _(
+                        "Regular expression cannot contain + without a preceding character, or multiple times in a row. %s"
+                    )
+                    % needs_escape("+"),
+                )
+            if re.search(r"(?<!\\)\<", q):
+                self.add_error(
+                    "q",
+                    # Translators: error message for malformed less than sign in regular expression
+                    _(
+                        "Regular expression cannot contain < or use a negative lookbehind query. %s"
+                    )
+                    % needs_escape("<"),
+                )
+            if re.search(r"((?<!\\)\\[ABCE-RTUVXYZabce-rtuvxyz0-9])|((?<!\\)\\$)", q):
+                # see https://github.com/apache/lucene/issues/11678 for more information
+                self.add_error(
+                    "q",
+                    # Translators: error message for malformed backslash in regular expression
+                    _(
+                        "Regular expression cannot contain the escape character \\ followed by an alphanumeric character other than one of DdSsWw, or at the end of a query. %s"
+                    )
+                    % needs_escape("\\"),
+                )
 
 
 class DocumentChoiceField(forms.ModelChoiceField):

diff --git a/geniza/corpus/models.py b/geniza/corpus/models.py
@@ -1294,6 +1294,7 @@ def index_data(self):
         counts = defaultdict(int)
         # collect transcription and translation texts for indexing
         transcription_texts = []
+        transcription_texts_plaintext = []
         translation_texts = []
         # keep track of translation language for RTL/LTR display
         translation_langcode = ""
@@ -1308,6 +1309,9 @@ def index_data(self):
                 content = fn.content_html_str
                 if content:
                     transcription_texts.append(Footnote.explicit_line_numbers(content))
+                    for canvas in fn.content_text_canvases:
+                        # index plaintext only, per-canvas, for regex
+                        transcription_texts_plaintext.append(canvas)
             elif Footnote.DIGITAL_TRANSLATION in fn.doc_relation:
                 content = fn.content_html_str
                 if content:
@@ -1347,6 +1351,8 @@ def index_data(self):
                 "scholarship_t": [fn.display() for fn in footnotes],
                 # transcription content as html
                 "text_transcription": transcription_texts,
+                # transcription content as plaintext
+                "transcription_regex": transcription_texts_plaintext,
                 "translation_language_code_s": translation_langcode,
                 "translation_language_direction_s": translation_langdir,
                 # translation content as html

diff --git a/geniza/corpus/solr_queryset.py b/geniza/corpus/solr_queryset.py
@@ -3,6 +3,7 @@
 
 from bs4 import BeautifulSoup
 from django.apps import apps
+from django.utils.safestring import mark_safe
 from django.utils.translation import gettext as _
 from parasolr.django import AliasedSolrQuerySet
 from piffle.image import IIIFImageClient
@@ -30,7 +31,12 @@ def clean_html(html_snippet):
         else:
             html_snippet = f"<li>{ellipsis}{html_snippet}"
 
-    return BeautifulSoup(html_snippet, "html.parser").prettify(formatter="minimal")
+    return BeautifulSoup(
+        html_snippet,
+        "html.parser",
+        # ensure li and em tags don't get extra whitespace, as this may break display
+        preserve_whitespace_tags=["li", "em"],
+    ).prettify(formatter="minimal")
 
 
 class DocumentSolrQuerySet(AliasedSolrQuerySet):
@@ -85,6 +91,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
         "related_people": "people_count_i",
         "related_places": "places_count_i",
         "related_documents": "documents_count_i",
+        "transcription_regex": "transcription_regex",
     }
 
     # regex to convert field aliases used in search to actual solr fields
@@ -235,6 +242,17 @@ def keyword_search(self, search_term):
             )
         return search
 
+    def regex_search(self, search_term):
+        """Build a Lucene query for searching with regular expressions.
+        NOTE: this function may cause Lucene errors if input is not validated beforehand.
+
+        :param search_term: regular expression to search for
+        :returns: the result of ``self.search`` on the ``transcription_regex`` field
+        """
+        # Lucene delimits regex queries with forward slashes, so a bare slash in
+        # the input would end the regex early. escape any slash that is not
+        # already escaped; existing escape pairs are consumed first so that an
+        # escaped backslash followed by a slash is still handled correctly
+        search_term = re.sub(
+            r"\\.|/",
+            lambda m: r"\/" if m.group() == "/" else m.group(),
+            search_term,
+        )
+        # surround passed query with wildcards to allow non-anchored matches,
+        # and slashes so that it is interpreted as regex by Lucene;
+        # match in the non-analyzed transcription_regex field
+        return self.search(f"transcription_regex:/.*{search_term}.*/")
+
     def related_to(self, document):
         "Return documents related to the given document (i.e. shares any shelfmarks)"
 
@@ -276,10 +294,52 @@ def get_result_document(self, doc):
 
         return doc
 
+    def get_regex_highlight(self, text):
+        """Helper method to manually highlight and truncate a snippet for regex matches
+        (automatic highlight unavailable due to solr regex search limitations).
+
+        :param text: string to search for matches of the current regex query
+        :returns: string with each match wrapped in ``<em>`` and surrounded by
+            ~150 characters of context, multiple matches joined by
+            ``<br />[…]<br />``; None if nothing matches or if the query is
+            not a valid regular expression for python's ``re`` module
+        """
+        # remove solr field name and lucene-required "match all" logic to get original query
+        regex_query = (
+            self.search_qs[0]
+            .replace("transcription_regex:/.*", "")
+            .rsplit(".*/", maxsplit=1)[0]
+        )
+        # get ~150 characters of context plus a word on either side of the matched
+        # portion. groups are named so that capture groups inside the query itself
+        # cannot shift which group holds the match and the trailing context
+        pattern = (
+            r"(?P<hl_before>\b\w+.{0,150})(?P<hl_match>%s)(?P<hl_after>.{0,150}\w+\b)"
+            % regex_query
+        )
+        try:
+            # find all matches in the snippet
+            matches = list(re.finditer(pattern, text, flags=re.DOTALL))
+        except re.error:
+            # query cannot be compiled by python's re module; skip highlighting
+            # for this text instead of raising while rendering results
+            return None
+        if not matches:
+            return None
+        # separate multiple matches by HTML line breaks and ellipsis
+        separator = "<br />[…]<br />"
+        # surround matched portion in <em> so it is visible in search results; join all into string
+        return separator.join(
+            f"{m['hl_before']}<em>{m['hl_match']}</em>{m['hl_after']}" for m in matches
+        )
+
     def get_highlighting(self):
         """highlight snippets within transcription/translation html may result in
         invalid tags that will render strangely; clean up the html before returning"""
         highlights = super().get_highlighting()
+        is_regex_search = any("transcription_regex" in q for q in self.search_qs)
+        if is_regex_search:
+            # highlight regex results manually due to solr limitation
+            highlights = {}
+            # highlighting takes place *after* solr, so use get_results()
+            for doc in self.get_results():
+                # highlight per document, keyed on id as expected in results
+                highlights[doc["id"]] = {
+                    "transcription": [
+                        highlighted_block
+                        # this field is split by block-level annotation/group
+                        for highlighted_block in (
+                            self.get_regex_highlight(block)
+                            for block in doc["transcription_regex"]
+                        )
+                        # only include a block if it actually has highlights
+                        if highlighted_block
+                    ]
+                }
+
         is_exact_search = "hl_query" in self.raw_params
         for doc in highlights.keys():
             # _nostem fields should take precedence over stemmed fields in the case of an

diff --git a/geniza/corpus/templates/corpus/document_list.html b/geniza/corpus/templates/corpus/document_list.html
@@ -8,6 +8,23 @@
     <h1>{{ page_title }}</h1>
     <form data-controller="search" data-turbo-frame="main" data-turbo-action="advance" data-page="document">
         <fieldset id="query">
+            <span class="fieldname">{{ form.mode.label }}</span>
+            <dialog data-search-target="helpDialog">
+                {# Translators: heading for search mode help text #}
+                <h2>
+                    <span>{% translate "How to Search" %}</span>
+                    <button type="button" id="close-search-help" data-action="search#toggleHelpDialog">
+                        {# Translators: accessibility label for button to close search mode help dialog #}
+                        <span class="sr-only">{% translate "Close search mode help" %}</span>
+                    </button>
+                </h2>
+                {% include "corpus/snippets/document_search_helptext.html" %}
+            </dialog>
+            <button type="button" id="search-help" data-action="search#toggleHelpDialog">
+                {# Translators: accessibility label for button to open search mode help dialog #}
+                <span class="sr-only">{% translate "Open search mode help" %}</span>
+            </button>
+            {% render_field form.mode data-action="change->search#update" %}
             {% render_field form.q data-search-target="query" data-action="input->search#autoUpdateSort change->search#update" %}
 
             {# Translators: Search submit button #}

diff --git a/geniza/corpus/templates/corpus/snippets/document_result.html b/geniza/corpus/templates/corpus/snippets/document_result.html
@@ -1,6 +1,12 @@
 {% load i18n corpus_extras %}
 {% spaceless %}
     <li class="search-result">
+        {% if "SHOW_RELEVANCE_SCORES" in FEATURE_FLAGS %}
+            {# show relevance score if enabled #}
+            <span class="score">
+                Relevance: {{ document.score|default:0 }}
+            </span>
+        {% endif %}
         {# title #}
         <h2 class="title">
             {# result number #}
@@ -69,7 +75,7 @@ <h2 class="title">
                     <div class="transcription" dir="rtl" lang="{{ lang }}" {% if lang_script %}data-lang-script="{{ lang_script|lower }}"{% endif %}>
 
                         {% if document_highlights.transcription %}
-                            {% for snippet in document_highlights.transcription %}{{ snippet.strip|safe }}{% endfor %}
+                            {% for snippet in document_highlights.transcription %}{{ snippet.strip|safe }}{% if snippet.strip and not forloop.last %}...<br />{% endif %}{% endfor %}
                         {% elif document.transcription %}
                                 {# otherwise, display truncated transcription #}
                                 {# NOTE: might be nice to display N lines instead of using truncatechars #}

diff --git a/geniza/corpus/templates/corpus/snippets/document_search_helptext.html b/geniza/corpus/templates/corpus/snippets/document_search_helptext.html
@@ -0,0 +1,53 @@
+{% load i18n wagtailcore_tags %}
+
+{# Translators: heading for general search mode help text #}
+<h3>{% translate "General Search" %}</h3>
+<p>
+    {# Translators: general search help text #}
+    {% blocktranslate %}
+        Use keywords or phrases in any language to return matching or similar
+        results across all fields. Arabic script searches will return both
+        Arabic and Judaeo-Arabic transcription content.
+    {% endblocktranslate %}
+</p>
+{# Translators: heading for regular expressions search mode help text #}
+<h3>{% translate "Regular Expression Search" %}</h3>
+<p>
+    {% slugurl 'how-to-search' as howto_url %}
+    {# Translators: regular expressions search mode help text #}
+    {% blocktranslate %}
+        Use Hebrew or Arabic script to find precise matches in the
+        transcriptions. See
+        <a data-turbo="false" href="{{ howto_url }}">How to Search</a>
+        page for advanced use cases.
+    {% endblocktranslate %}
+</p>
+
+{# Translators: heading for regular expressions search cheat sheet #}
+<h4>{% translate "Cheat sheet:" %}</h4>
+<ul>
+    <li>
+        {# Translators: regular expression tip 1 #}
+        {% blocktranslate %}
+            If you're looking for a word with one missing letter, use a
+            period. Two missing letters, use two periods or <code>{2}</code>.
+            Increase the number in the curly brackets to increase the number of
+            characters, or insert a range with a comma in between, ex.
+            <code>{0,5}</code>.
+        {% endblocktranslate %}
+    </li>
+    <li>
+        {# Translators: regular expression tip 2 #}
+        {% blocktranslate %}
+            If you don't know how many characters are missing, use
+            <code>.*</code>.
+        {% endblocktranslate %}
+    </li>
+    <li>
+        {# Translators: regular expression tip 3 #}
+        {% blocktranslate %}
+            If you know which characters you want, use square brackets to find
+            multiple spellings, ex. <code>[יו]</code> for yud or vav.
+        {% endblocktranslate %}
+    </li>
+</ul>