Skip to content

Commit

Permalink
Merge branch 'develop' into experiment/regex-search
Browse files Browse the repository at this point in the history
  • Loading branch information
blms committed Aug 27, 2024
2 parents 7733a06 + 3b566f9 commit ca561b1
Show file tree
Hide file tree
Showing 81 changed files with 1,760 additions and 742 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Change Log
==========

4.17.3
------

- chore: Use self-hosted tinyMCE

4.17.2
------

Expand Down
6 changes: 4 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,11 @@ Python 3.9 / Django 3.2 / Node 16 / Postgresql / Solr 9.2
:target: https://github.com/Princeton-CDH/geniza/actions/workflows/sphinx_docs.yml
:alt: Sphinx docs build status

Technical documentation is available at https://princeton-cdh.github.io/geniza/
Technical documentation is available at https://princeton-cdh.github.io/geniza/.

For developer instructions, see DEVNOTES.
For developer instructions, see `DEVNOTES <DEVNOTES.rst>`_.

To access the data from the project, see https://github.com/princetongenizalab/.

License
-------
Expand Down
28 changes: 10 additions & 18 deletions geniza/common/management/commands/visual_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,7 @@ def take_snapshots(self, browser, dark_mode=False):

# turn on dark mode, save in local storage
if dark_mode:
# handle separate theme toggles for mobile and desktop
try:
browser.find_element(
By.CSS_SELECTOR, "#theme-toggle-desktop"
).send_keys(Keys.ENTER)
except (ElementNotVisibleException, ElementNotInteractableException):
browser.find_element(By.CSS_SELECTOR, "#theme-toggle-mobile").send_keys(
Keys.ENTER
)
browser.find_element(By.CSS_SELECTOR, "#theme-toggle").send_keys(Keys.ENTER)
dark_mode_str = " (dark mode)"

# homepage
Expand Down Expand Up @@ -79,22 +71,22 @@ def take_snapshots(self, browser, dark_mode=False):
)
percy_snapshot(browser, "Content Page%s" % dark_mode_str)

# document search with document type filter expanded
# document search with filters expanded
browser.get(
"http://localhost:8000/en/documents/?per_page=2&sort=scholarship_desc#filters"
"http://localhost:8000/en/documents/?per_page=2&sort=scholarship_desc"
)
# open document type filter programmatically
doctype_filter = browser.find_element(By.CSS_SELECTOR, ".doctype-filter")
browser.execute_script("arguments[0].open = true", doctype_filter)
# click the first option
# expand filters
filter_button = browser.find_element(By.CSS_SELECTOR, "a#filters-button")
browser.execute_script("arguments[0].classList.add('open')", filter_button)
filters = browser.find_element(By.CSS_SELECTOR, "fieldset#filters")
browser.execute_script("arguments[0].ariaExpanded = true", filters)
# click the first doctype filter option
browser.find_element(
By.CSS_SELECTOR, ".doctype-filter li:nth-child(1) label"
By.CSS_SELECTOR, "#id_doctype li:nth-child(1) label"
).click()
filter_modal_css = "fieldset#filters { display: flex !important; }"
percy_snapshot(
browser,
"Document Search filter%s" % dark_mode_str,
percy_css=filter_modal_css,
)

# document search
Expand Down
7 changes: 7 additions & 0 deletions geniza/corpus/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,13 @@ class DocumentSearchForm(RangeForm):
),
)

exclude_inferred = forms.BooleanField(
# Translators: label for "exclude inferred dates" search form filter
label=_("Exclude inferred dates"),
required=False,
widget=forms.CheckboxInput,
)

doctype = FacetChoiceField(
# Translators: label for document type search form filter
label=_("Document type"),
Expand Down
38 changes: 36 additions & 2 deletions geniza/corpus/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import itertools
import logging
import re
from collections import defaultdict
Expand Down Expand Up @@ -1133,7 +1134,11 @@ def items_to_index(cls):
"textblock_set",
queryset=TextBlock.objects.select_related(
"fragment", "fragment__collection", "fragment__manifest"
).prefetch_related("fragment__manifest__canvases"),
).prefetch_related(
"fragment__manifest__canvases",
"fragment__textblock_set",
"fragment__textblock_set__document",
),
),
Prefetch(
"footnotes",
Expand Down Expand Up @@ -1165,7 +1170,11 @@ def prep_index_chunk(cls, chunk):
"textblock_set",
queryset=TextBlock.objects.select_related(
"fragment", "fragment__collection", "fragment__manifest"
).prefetch_related("fragment__manifest__canvases"),
).prefetch_related(
"fragment__manifest__canvases",
"fragment__textblock_set",
"fragment__textblock_set__document",
),
),
Prefetch(
"footnotes",
Expand All @@ -1188,6 +1197,15 @@ def index_data(self):
# get fragments via textblocks for correct order
# and to take advantage of prefetching
fragments = [tb.fragment for tb in self.textblock_set.all()]

# get related documents: other textblocks on this document's fragments
other_textblocks_docs = [
f.textblock_set.exclude(document__pk=self.pk).values_list(
"document__pk", flat=True
)
for f in fragments
]
related_document_pks = set(itertools.chain.from_iterable(other_textblocks_docs))
# filter by side so that search results only show the relevant side image(s)
images = self.iiif_images(filter_side=True).values()
index_data.update(
Expand Down Expand Up @@ -1220,6 +1238,10 @@ def index_data(self):
),
# combined original/standard document date for display
"document_date_t": strip_tags(self.document_date) or None,
# inferred document date for display
"document_dating_t": standard_date_display(
"/".join([d.isoformat() for d in self.dating_range() if d])
),
# date range for filtering
"document_date_dr": self.solr_date_range(),
# date range for filtering, but including inferred datings if any exist
Expand All @@ -1232,6 +1254,17 @@ def index_data(self):
"end_date_i": (
self.end_date.numeric_format(mode="max") if self.end_date else None
),
# start/end of document date or date range, including inferred datings, for sort
"start_dating_i": (
self.dating_range()[0].numeric_format()
if self.dating_range()[0]
else None
),
"end_dating_i": (
self.dating_range()[1].numeric_format(mode="max")
if self.dating_range()[1]
else None
),
# library/collection possibly redundant?
"collection_ss": [str(f.collection) for f in fragments],
"tags_ss_lower": [t.name for t in self.tags.all()],
Expand All @@ -1252,6 +1285,7 @@ def index_data(self):
"has_image_b": len(images) > 0,
"people_count_i": self.persondocumentrelation_set.count(),
"places_count_i": self.documentplacerelation_set.count(),
"documents_count_i": len(related_document_pks),
}
)

Expand Down
36 changes: 20 additions & 16 deletions geniza/corpus/solr_queryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
"shelfmark": "shelfmark_s", # string version for display
"shelfmarks": "fragment_shelfmark_ss",
"document_date": "document_date_t", # text version for search & display
"document_dating": "document_dating_t", # inferred date for display
"original_date_t": "original_date",
"collection": "collection_ss",
"tags": "tags_ss_lower",
Expand Down Expand Up @@ -89,6 +90,7 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
"description_nostem": "description_nostem",
"related_people": "people_count_i",
"related_places": "places_count_i",
"related_documents": "documents_count_i",
"transcription_regex": "transcription_regex",
}

Expand Down Expand Up @@ -129,6 +131,9 @@ class DocumentSolrQuerySet(AliasedSolrQuerySet):
# to use as highlighting query
highlight_query = None

# if search consists only of quoted phrase scoped to shelfmark, handle separately
shelfmark_query = None

def _search_term_cleanup(self, search_term):
# adjust user search string before sending to solr

Expand Down Expand Up @@ -179,6 +184,12 @@ def _search_term_cleanup(self, search_term):
search_term = search_term.replace(
"%s:" % self.shelfmark_qf, self.shelfmark_qf
)
# special case: just a shelfmark query, in quotes
quoted_shelfmark_query = re.fullmatch(
rf'{re.escape(self.shelfmark_qf)}".+?"', search_term
)
if quoted_shelfmark_query:
self.shelfmark_query = quoted_shelfmark_query.group(0)

return search_term

Expand Down Expand Up @@ -206,9 +217,14 @@ def keyword_search(self, search_term):
# nested edismax query no longer works since solr 7.2 (see above)
if "{!type=edismax" in keyword_query:
query_params.update({"uf": "* _query_"})
search = self.search(self.keyword_search_qf).raw_query_parameters(
**query_params
)
# if search term consists only of a shelfmark query in quotes, only search shelfmark fields
if self.shelfmark_query:
search = self.search(self.shelfmark_query)
else:
# otherwise, search all fields as usual
search = self.search(self.keyword_search_qf).raw_query_parameters(
**query_params
)
# if search term cleanup identifies any exact phrase searches,
# pass the unmodified search to Solr as a highlighting query,
# since otherwise the highlighted fields (description/transcription)
Expand Down Expand Up @@ -241,7 +257,7 @@ def related_to(self, document):
"Return documents related to the given document (i.e. shares any shelfmarks)"

# NOTE: using a string query filter because parasolr queryset
# # currently doesn't provide any kind of not/exclude filter
# currently doesn't provide any kind of not/exclude filter
return (
self.filter(status=document.PUBLIC_LABEL)
.filter("NOT pgpid_i:%d" % document.id)
Expand Down Expand Up @@ -276,18 +292,6 @@ def get_result_document(self, doc):
_("Unknown type"),
)

if doc.get("shelfmarks"):
doc["related_documents"] = (
DocumentSolrQuerySet()
.filter("NOT pgpid_i:%d" % doc["pgpid"])
.filter(
fragment_shelfmark_ss__in=[
'"%s"' % shelfmark for shelfmark in doc["shelfmarks"]
]
)
.count()
)

return doc

def get_regex_highlight(self, text):
Expand Down
5 changes: 5 additions & 0 deletions geniza/corpus/templates/corpus/document_list.html
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ <h2>
{# NOTE: stimulus action is configured via django widget attrs #}
{{ form.docdate }}
</label>
<label for="{{ form.exclude_inferred.auto_id }}">
{% render_field form.exclude_inferred data-action="search#update" %}
<span>{{ form.exclude_inferred.label }}</span>
<div class="thumb" aria-hidden="true"></div>
</label>
</div>
<fieldset class="includes-fields">
<legend><span class="fieldname">{% translate "Includes" %}</span></legend>
Expand Down
22 changes: 14 additions & 8 deletions geniza/corpus/templates/corpus/snippets/document_result.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,10 @@
Relevance: {{ document.score|default:0 }}
</span>
{% endif %}
{# result number #}
<span class="counter">
{% with page_obj.start_index|default:1 as start_adjust %}
{{ forloop.counter0|add:start_adjust }}
{% endwith %}
</span>
{# title #}
<h2 class="title">
{# result number #}
<span class="counter">{% with page_obj.start_index|default:1 as start_adjust %}{{ forloop.counter0|add:start_adjust }}{% endwith %}</span>
{# type and shelfmark #}
<span class="doctype">{{ document.type }}</span>
<span class="shelfmark">{{ document.shelfmark|shelfmark_wrap }}</span>
Expand All @@ -31,6 +27,16 @@ <h2 class="title">
{{ document.document_date.0 }} {# indexed as _t which is a multival field #}
</time>
</dd>
{% elif document.document_dating %}
<dt>
{# Translators: label for inferred date on a document #}
{% translate "Inferred date" %}
</dt>
<dd>
<time>
{{ document.document_dating.0 }} {# indexed as _t which is a multival field #}
</time>
</dd>
{% endif %}
{% if document.languages|length %}
<dt>
Expand Down Expand Up @@ -198,10 +204,10 @@ <h3 class="sr-only">{% translate 'Tags' %}</h3>
</dl>
<dl class="additional-metadata">
{% with document_highlights=highlighting|dict_item:document.id %}
{% if document_highlights.old_shelfmark and document_highlights.old_shelfmark.0 %}
{% if document_highlights.old_shelfmark %}
{# Translators: label for historical/old shelfmark on document fragments #}
<dt>{% translate "Historical shelfmark" %}</dt>
<dd>{{ document_highlights.old_shelfmark.0|striptags }}</dd>
<dd>{{ document_highlights.old_shelfmark|striptags }}</dd>
{% endif %}
{% endwith %}
</dl>
Expand Down
31 changes: 31 additions & 0 deletions geniza/corpus/tests/test_corpus_solrqueryset.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,37 @@ def test_search_term_cleanup__exact_match_regex(self):
== '(description_nostem:"מרכב אלצלטאן" OR transcription_nostem:"מרכב אלצלטאן") AND (description_nostem:"אלמרכב אלצלטאן" OR transcription_nostem:"אלמרכב אלצלטאן")'
)

def test_search_term_cleanup__quoted_shelfmark_only(self):
    """should set shelfmark_query only for a lone double-quoted shelfmark search"""
    # a search made up solely of a double-quoted shelfmark term
    # should populate shelfmark_query on the queryset
    shelfmark_only_qs = DocumentSolrQuerySet()
    shelfmark_only_qs._search_term_cleanup('shelfmark:"T-S NS"')
    assert "T-S NS" in shelfmark_only_qs.shelfmark_query

    # in every other case shelfmark_query should remain unset
    other_qs = DocumentSolrQuerySet()

    # quoted shelfmark combined with another field-scoped term
    cleaned_term = other_qs._search_term_cleanup(
        'tag:"marriage payment" shelfmark:"T-S NS"'
    )
    assert "T-S NS" in cleaned_term
    assert not other_qs.shelfmark_query

    # unquoted shelfmark term
    cleaned_term = other_qs._search_term_cleanup("shelfmark:NS")
    assert "NS" in cleaned_term
    assert not other_qs.shelfmark_query

def test_keyword_search__quoted_shelfmark(self):
    """should search only shelfmark fields for a lone quoted shelfmark search"""
    dqs = DocumentSolrQuerySet()
    with patch.object(dqs, "search") as mocksearch:
        # only quoted shelfmark: should only search on shelfmark fields
        dqs.keyword_search('shelfmark:"T-S NS"')
        mocksearch.assert_called_with(dqs.shelfmark_query)
        mocksearch.return_value.raw_query_parameters.assert_not_called()

    # otherwise should search as normal. Use a fresh queryset and a fresh
    # mock for each search term: asserting twice on one shared mock would
    # let the second assertion pass on the strength of the first call alone.
    normal_search_terms = (
        'tag:"marriage payment" shelfmark:"T-S NS"',
        "shelfmark:NS",
    )
    for search_term in normal_search_terms:
        dqs = DocumentSolrQuerySet()
        with patch.object(dqs, "search") as mocksearch:
            dqs.keyword_search(search_term)
            mocksearch.return_value.raw_query_parameters.assert_called()

def test_related_to(self, document, join, fragment, empty_solr):
"""should give filtered result: public documents with any shared shelfmarks"""

Expand Down
Loading

0 comments on commit ca561b1

Please sign in to comment.