Skip to content

Commit

Permalink
Merge pull request #1588 from Princeton-CDH/experiment/regex-search
Browse files Browse the repository at this point in the history
RegEx search (#1631)
  • Loading branch information
blms authored Sep 5, 2024
2 parents 3b566f9 + d8c1da7 commit 415f707
Show file tree
Hide file tree
Showing 19 changed files with 613 additions and 36 deletions.
5 changes: 5 additions & 0 deletions DEPLOYNOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Deploy Notes

## 4.18

- Solr configuration has changed. Ensure Solr configset has been updated
and then reindex all content: `python manage.py index`

## 4.17

- Solr configuration has changed. Ensure Solr configset has been updated
Expand Down
72 changes: 72 additions & 0 deletions geniza/corpus/forms.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from dal import autocomplete
from django import forms
from django.contrib.admin.widgets import RelatedFieldWidgetWrapper
Expand Down Expand Up @@ -196,6 +198,13 @@ class DocumentSearchForm(RangeForm):
# Translators: label for end year when filtering by date range
_("To year")

MODE_CHOICES = [
# Translators: label for general search mode
("general", _("General")),
# Translators: label for regex (regular expressions) search mode
("regex", _("RegEx")),
]

# NOTE these are not set by default!
error_css_class = "error"
required_css_class = "required"
Expand Down Expand Up @@ -244,6 +253,14 @@ class DocumentSearchForm(RangeForm):
label=_("Discussion"),
)

mode = forms.ChoiceField(
# Translators: label for "search mode" (general or regex)
label=_("Search mode"),
choices=MODE_CHOICES,
required=False,
widget=forms.RadioSelect,
)

# mapping of solr facet fields to form input
solr_facet_fields = {
"type": "doctype",
Expand Down Expand Up @@ -319,6 +336,61 @@ def clean(self):
self.add_error(
"q", _("Relevance sort is not available without a keyword search term.")
)
# additional validation for regex mode due to some queries that cause Lucene errors
mode = cleaned_data.get("mode")
if mode == "regex":

    def needs_escape(char):
        """Return the reused hint about escaping ``char`` with a backslash."""
        # backslashes are doubled so the literal contains a single "\"
        # without relying on invalid escape sequences
        return (
            f"If you are searching for the character {char} in a transcription, "
            f"escape it with \\ by writing \\{char} instead."
        )

    # NOTE: each message is interpolated *after* the gettext call so that the
    # catalog lookup is done on the "%s" template rather than on the
    # already-formatted string (which could never match a translation).

    # see error messages for explanations of each regex here
    # curly brace: not followed by a digit, at start of query, or never closed
    if re.search(r"((?<!\\)\{[^0-9])|(^\{)|((?<!\\)\{[^\}]*$)", q):
        self.add_error(
            "q",
            # Translators: error message for malformed curly brace in regular expression
            _(
                "Regular expression cannot contain { without a preceding character, without an integer afterwards, or without a closing }. %s"
            )
            % needs_escape("{"),
        )
    # asterisk: at start of query, or doubled without an escape
    if re.search(r"(^\*)|((?<!\\)\*\*)", q):
        self.add_error(
            "q",
            # Translators: error message for malformed asterisk in regular expression
            _(
                "Regular expression cannot contain * without a preceding character, or multiple times in a row. %s"
            )
            % needs_escape("*"),
        )
    # plus sign: at start of query, or doubled without an escape
    if re.search(r"(^\+)|((?<!\\)\+\+)", q):
        self.add_error(
            "q",
            # Translators: error message for malformed plus sign in regular expression
            _(
                "Regular expression cannot contain + without a preceding character, or multiple times in a row. %s"
            )
            % needs_escape("+"),
        )
    # any unescaped less-than sign
    if re.search(r"(?<!\\)\<", q):
        self.add_error(
            "q",
            # Translators: error message for malformed less than sign in regular expression
            _(
                "Regular expression cannot contain < or use a negative lookbehind query. %s"
            )
            % needs_escape("<"),
        )
    # backslash followed by an alphanumeric other than DdSsWw, or trailing backslash
    if re.search(r"((?<!\\)\\[ABCE-RTUVXYZabce-rtuvxyz0-9])|((?<!\\)\\$)", q):
        # see https://github.com/apache/lucene/issues/11678 for more information
        self.add_error(
            "q",
            # Translators: error message for malformed backslash in regular expression
            _(
                "Regular expression cannot contain the escape character \\ followed by an alphanumeric character other than one of DdSsWw, or at the end of a query. %s"
            )
            % needs_escape("\\"),
        )


class DocumentChoiceField(forms.ModelChoiceField):
Expand Down
6 changes: 6 additions & 0 deletions geniza/corpus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1294,6 +1294,7 @@ def index_data(self):
counts = defaultdict(int)
# collect transcription and translation texts for indexing
transcription_texts = []
transcription_texts_plaintext = []
translation_texts = []
# keep track of translation language for RTL/LTR display
translation_langcode = ""
Expand All @@ -1308,6 +1309,9 @@ def index_data(self):
content = fn.content_html_str
if content:
transcription_texts.append(Footnote.explicit_line_numbers(content))
for canvas in fn.content_text_canvases:
# index plaintext only, per-canvas, for regex
transcription_texts_plaintext.append(canvas)
elif Footnote.DIGITAL_TRANSLATION in fn.doc_relation:
content = fn.content_html_str
if content:
Expand Down Expand Up @@ -1347,6 +1351,8 @@ def index_data(self):
"scholarship_t": [fn.display() for fn in footnotes],
# transcription content as html
"text_transcription": transcription_texts,
# transcription content as plaintext
"transcription_regex": transcription_texts_plaintext,
"translation_language_code_s": translation_langcode,
"translation_language_direction_s": translation_langdir,
# translation content as html
Expand Down
62 changes: 61 additions & 1 deletion geniza/corpus/solr_queryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from bs4 import BeautifulSoup
from django.apps import apps
from django.utils.safestring import mark_safe
from django.utils.translation import gettext as _
from parasolr.django import AliasedSolrQuerySet
from piffle.image import IIIFImageClient
Expand Down Expand Up @@ -30,7 +31,12 @@ def clean_html(html_snippet):
else:
html_snippet = f"<li>{ellipsis}{html_snippet}"

return BeautifulSoup(html_snippet, "html.parser").prettify(formatter="minimal")
return BeautifulSoup(
html_snippet,
"html.parser",
# ensure li and em tags don't get extra whitespace, as this may break display
preserve_whitespace_tags=["li", "em"],
).prettify(formatter="minimal")


class DocumentSolrQuerySet(AliasedSolrQuerySet):
Expand Down Expand Up @@ -85,6 +91,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
"related_people": "people_count_i",
"related_places": "places_count_i",
"related_documents": "documents_count_i",
"transcription_regex": "transcription_regex",
}

# regex to convert field aliases used in search to actual solr fields
Expand Down Expand Up @@ -235,6 +242,17 @@ def keyword_search(self, search_term):
)
return search

def regex_search(self, search_term):
    """Build a Lucene query for searching with regular expressions.

    The term is wrapped in ``.*`` on both sides (so the match is not
    anchored) and in forward slashes (so Lucene interprets it as a regular
    expression), then searched against the ``transcription_regex`` field.

    Forward slashes inside the term are escaped, because an unescaped ``/``
    would otherwise terminate the slash-delimited regex early. Existing
    escape pairs (a backslash plus the character after it) are left as-is.

    NOTE: this function may cause Lucene errors if input is not validated beforehand.

    :param search_term: regular expression entered by the user
    :returns: the result of ``self.search()`` for the constructed query
    """
    # consume "\x" pairs untouched so that an already-escaped slash (or an
    # escaped backslash followed by a bare slash) is handled correctly;
    # any remaining bare "/" gets a backslash in front of it
    search_term = re.sub(
        r"(\\.)|/",
        lambda m: m.group(1) or "\\/",
        search_term,
        flags=re.DOTALL,
    )
    # surround passed query with wildcards to allow non-anchored matches,
    # and slashes so that it is interpreted as regex by Lucene
    search_term = f"/.*{search_term}.*/"
    # match in the non-analyzed transcription_regex field
    search = self.search(f"transcription_regex:{search_term}")
    return search

def related_to(self, document):
"Return documents related to the given document (i.e. shares any shelfmarks)"

Expand Down Expand Up @@ -276,10 +294,52 @@ def get_result_document(self, doc):

return doc

def get_regex_highlight(self, text):
    """Manually highlight and truncate a snippet for regex matches.

    Automatic highlighting is unavailable due to solr regex search
    limitations, so the original regular expression is recovered from the
    stored search query and applied to ``text`` directly.

    :param text: one plaintext block of transcription content
    :returns: string with each matched portion wrapped in ``<em>`` and
        surrounded by up to ~150 characters of context on each side, with
        multiple matches joined by an HTML line break and ellipsis; ``None``
        if there is no regex query, the expression is not valid for
        Python's ``re`` module, or nothing in ``text`` matches
    """
    # locate the regex query among the search queries rather than assuming
    # it is the first one
    prefix = "transcription_regex:/.*"
    solr_query = next((q for q in self.search_qs if prefix in q), None)
    if solr_query is None:
        return None
    # remove solr field name and lucene-required "match all" logic to get original query
    regex_query = solr_query.partition(prefix)[2].rsplit(".*/", maxsplit=1)[0]
    # get ~150 characters of context plus a word on either side of the matched portion.
    # Named groups are used so that capture groups inside the user's own
    # expression cannot shift which group holds the trailing context.
    # NOTE: at least one word character is required on each side, so a match
    # at the very start or very end of the text is not highlighted.
    pattern = (
        r"(?P<hl_before>\b\w+.{0,150})(?P<hl_match>%s)(?P<hl_after>.{0,150}\w+\b)"
        % regex_query
    )
    try:
        # find all matches in the snippet
        matches = list(re.finditer(pattern, text, flags=re.DOTALL))
    except re.error:
        # the expression was accepted by the search backend but is not valid
        # for Python's re module; skip highlighting instead of failing
        return None
    if not matches:
        return None
    # separate multiple matches by HTML line breaks and ellipsis
    separator = "<br />[…]<br />"
    # surround matched portion in <em> so it is visible in search results; join all into string
    return separator.join(
        f"{m.group('hl_before')}<em>{m.group('hl_match')}</em>{m.group('hl_after')}"
        for m in matches
    )

def get_highlighting(self):
"""highlight snippets within transcription/translation html may result in
invalid tags that will render strangely; clean up the html before returning"""
highlights = super().get_highlighting()
is_regex_search = any("transcription_regex" in q for q in self.search_qs)
if is_regex_search:
# highlight regex results manually due to solr limitation
highlights = {}
# highlighting takes place *after* solr, so use get_results()
for doc in self.get_results():
# highlight per document, keyed on id as expected in results
highlights[doc["id"]] = {
"transcription": [
highlighted_block
# this field is split by block-level annotation/group
for highlighted_block in (
self.get_regex_highlight(block)
for block in doc["transcription_regex"]
)
# only include a block if it actually has highlights
if highlighted_block
]
}

is_exact_search = "hl_query" in self.raw_params
for doc in highlights.keys():
# _nostem fields should take precedence over stemmed fields in the case of an
Expand Down
17 changes: 17 additions & 0 deletions geniza/corpus/templates/corpus/document_list.html
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,23 @@
<h1>{{ page_title }}</h1>
<form data-controller="search" data-turbo-frame="main" data-turbo-action="advance" data-page="document">
<fieldset id="query">
<span class="fieldname">{{ form.mode.label }}</span>
<dialog data-search-target="helpDialog">
{# Translators: heading for search mode help text #}
<h2>
<span>{% translate "How to Search" %}</span>
<button type="button" id="close-search-help" data-action="search#toggleHelpDialog">
{# Translators: accessibility label for button to close search mode help dialog #}
<span class="sr-only">{% translate "Close search mode help" %}</span>
</button>
</h2>
{% include "corpus/snippets/document_search_helptext.html" %}
</dialog>
<button type="button" id="search-help" data-action="search#toggleHelpDialog">
{# Translators: accessibility label for button to open search mode help dialog #}
<span class="sr-only">{% translate "Open search mode help" %}</span>
</button>
{% render_field form.mode data-action="change->search#update" %}
{% render_field form.q data-search-target="query" data-action="input->search#autoUpdateSort change->search#update" %}

{# Translators: Search submit button #}
Expand Down
8 changes: 7 additions & 1 deletion geniza/corpus/templates/corpus/snippets/document_result.html
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
{% load i18n corpus_extras %}
{% spaceless %}
<li class="search-result">
{% if "SHOW_RELEVANCE_SCORES" in FEATURE_FLAGS %}
{# show relevance score if enabled #}
<span class="score">
Relevance: {{ document.score|default:0 }}
</span>
{% endif %}
{# title #}
<h2 class="title">
{# result number #}
Expand Down Expand Up @@ -69,7 +75,7 @@ <h2 class="title">
<div class="transcription" dir="rtl" lang="{{ lang }}" {% if lang_script %}data-lang-script="{{ lang_script|lower }}"{% endif %}>

{% if document_highlights.transcription %}
{% for snippet in document_highlights.transcription %}{{ snippet.strip|safe }}{% endfor %}
{% for snippet in document_highlights.transcription %}{{ snippet.strip|safe }}{% if snippet.strip and not forloop.last %}...<br />{% endif %}{% endfor %}
{% elif document.transcription %}
{# otherwise, display truncated transcription #}
{# NOTE: might be nice to display N lines instead of using truncatechars #}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
{% load i18n wagtailcore_tags %}

{# Translators: heading for general search mode help text #}
<h3>{% translate "General Search" %}</h3>
<p>
{# Translators: general search help text #}
{% blocktranslate %}
Use keywords or phrases in any language to return matching or similar
results across all fields. Arabic script searches will return both
Arabic and Judaeo-Arabic transcription content.
{% endblocktranslate %}
</p>
{# Translators: heading for regular expressions search mode help text #}
<h3>{% translate "Regular Expression Search" %}</h3>
<p>
{% slugurl 'how-to-search' as howto_url %}
{# Translators: regular expressions search mode help text #}
{% blocktranslate %}
Use Hebrew or Arabic script to find precise matches in the
transcriptions. See
<a data-turbo="false" href="{{ howto_url }}">How to Search</a>
page for advanced use cases.
{% endblocktranslate %}
</p>

{# Translators: heading for regular expressions search cheat sheet #}
<h4>{% translate "Cheat sheet:" %}</h4>
<ul>
<li>
{# Translators: regular expression tip 1 #}
{% blocktranslate %}
If you're looking for a word with one missing letter, use a
period. Two missing letters, use two periods or <code>{2}</code>.
Increase the number in the curly brackets to increase the number of
characters, or insert a range with a comma in between, ex.
<code>{0,5}</code>.
{% endblocktranslate %}
</li>
<li>
{# Translators: regular expression tip 2 #}
{% blocktranslate %}
If you don't know how many characters are missing, use
<code>.*</code>.
{% endblocktranslate %}
</li>
<li>
{# Translators: regular expression tip 3 #}
{% blocktranslate %}
If you know which characters you want, use square brackets to find
multiple spellings, ex. <code>[יו]</code> for yud or vav.
{% endblocktranslate %}
</li>
</ul>
Loading

0 comments on commit 415f707

Please sign in to comment.