Merge pull request #326 from sul-dlss/exhibits-prod-alt

Add new exhibits schema with a string-type id field
sul-dlss · Jun 12, 2024 · d0d0231 · d0d0231
2 parents 15ab0ee + f2392ee
commit d0d0231
Show file tree

Hide file tree

Showing 8 changed files with 1,842 additions and 0 deletions.
diff --git a/exhibits_prod/_rest_managed.json b/exhibits_prod/_rest_managed.json
@@ -0,0 +1,3 @@
+{
+  "initArgs":{},
+  "managedList":[]}
diff --git a/exhibits_prod/schema.xml b/exhibits_prod/schema.xml
diff --git a/exhibits_prod/solrconfig.xml b/exhibits_prod/solrconfig.xml
diff --git a/exhibits_prod/stopwords_punctuation.txt b/exhibits_prod/stopwords_punctuation.txt
@@ -0,0 +1,22 @@
+# Punctuation characters we want to ignore as terms in queries (i.e., when
+# surrounded by whitespace, like 'fred : the puppy')
+# ONLY FOR SINGLE TOKEN ANALYZED FIELDS
+#   see https://issues.apache.org/jira/browse/SOLR-3085
+# Note that hyphens, plusses, and double hyphens are not treated as terms
+#   per debugQuery
+:
+;
+&
+/
+=
+>
+<
+,
+.
+(
+)
+…
+»
+§
+•
+·
diff --git a/exhibits_prod/synonyms.txt b/exhibits_prod/synonyms.txt
@@ -0,0 +1,73 @@
+# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
+# A synonym file for Solr SynonymFilterFactory.
+# Needs to be included at both index and query time 
+#  AFTER the case folding
+#  BEFORE the WordDelimiterFilterFactory that removes punctuation
+# e.g. 
+# <analyzer>
+#   <tokenizer class="solr.WhitespaceTokenizerFactory" />
+#   <filter class="solr.ICUFoldingFilterFactory"/>  <!-- NFKC, case folding, diacritics removed -->
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.WordDelimiterFilterFactory" ...
+#   ...
+#
+# !!!
+# !!! IMPORTANT:   see also   synonyms_both_anchors.txt, synonyms_left_anchor.txt  and  synonyms_right_anchor.txt
+# !!!
+
+# Explicit mappings match any token sequence on the LHS of "=>"
+#   and replace with all alternatives on the RHS.  These types of mappings
+#   ignore the expand parameter in the schema.
+# Equivalent synonyms may be separated with commas and give
+#   no explicit mapping.  In this case the mapping behavior will
+#   be taken from the expand parameter in the schema.
+#  If expand==true, "ipod, i-pod, i pod" is equivalent to the explicit mapping:
+#   ipod, i-pod, i pod => ipod, i-pod, i pod
+#  If expand==false, "ipod, i-pod, i pod" is equivalent to the explicit mapping:
+#   ipod, i-pod, i pod => ipod
+# set expand to true for index time and false for query time
+
+# See SW-845
+#  "Dept." will change to "Department"
+#  "Koran" will change to "Qur'an"
+#  "violoncello" will change to "cello"
+#  "O.T." and "N.T." will change to "Old Testament" and "New Testament"
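+# note that the rules below map TO the abbreviation / short form (e.g.
+#  department => dept, koran), the reverse of the changes listed above; this improves recall but reduces precision:
+#  O.T. can mean Old Testament or overtime;   dept could be a word in some language.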
+department => dept
+qurʼan, qur'an, quran, qorʼan, qor'an, qoran => koran
+violoncello, violincello => cello
+# multi-token synonyms, and synonyms with punctuation, can be problematic
+#old testament => o.t.
+#new testament => n.t.
+
+# The below is inspired by Jonathan Rochkind at Johns Hopkins University, 2013-04-15
+
+# Terms that include punctuation which we want to protect (whitelist) and make searchable.
+# We do this by mapping them to unique tokens that do not include punctuation
+
+# computer languages
+#  these are explicit mappings so when WDF drops the non-letter chars, c++ is not equivalent to c
+c++  => cplusplus
+j#, j♯ => jsssharp
+# c# and f# are music keys as well as computer languages
+
+# musical keys
+#  these are explicit mappings so when WDF drops the non-letter chars, c# is not equivalent to c
+# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (e.g. a-sharp)
+a#, a♯, a-sharp => a sharp
+b#, b♯, b-sharp => b sharp
+c#, c♯, c-sharp => c sharp
+d#, d♯, d-sharp => d sharp
+e#, e♯, e-sharp => e sharp
+f#, f♯, f-sharp => f sharp
+g#, g♯, g-sharp => g sharp
+# We map from lowercase b, musical flat (♭), and the hyphenated spelling (e.g. a-flat)
+ab, a♭, a-flat => a flat
+bb, b♭, b-flat => b flat
+cb, c♭, c-flat => c flat
+db, d♭, d-flat => d flat
+eb, e♭, e-flat => e flat
+fb, f♭, f-flat => f flat
+gb, g♭, g-flat => g flat
diff --git a/exhibits_prod/synonyms_both_anchors.txt b/exhibits_prod/synonyms_both_anchors.txt
@@ -0,0 +1,47 @@
+# Include in analysis with both left anchor of 'aaaaaa' and right anchor of 'zzzzzz'
+#  for a query or field consisting solely of a token meant to be a synonym
+#
+# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
+# A synonym file for Solr SynonymFilterFactory.
+# Needs to be included at both index and query time 
+#  AFTER the case folding
+#  BEFORE the WordDelimiterFilterFactory that removes punctuation
+# e.g. 
+# <analyzer>
+#   <!-- put beginning and ending anchors on field value, removing trailing chars -->
+#   <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^\s*(.*[^\s\.\:\;\/\[\]])[\s\.\:\;\/\[\]]*$" replacement="aaaaaa$1zzzzzz"/>
+#   <tokenizer class="solr.WhitespaceTokenizerFactory" />
+#   <filter class="solr.ICUFoldingFilterFactory"/>  <!-- NFKC, case folding, diacritics removed -->
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms_both_anchors.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms_left_anchor.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms_right_anchor.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.WordDelimiterFilterFactory" ...
+#   ...
+#
+
+aaaaaadepartmentzzzzzz => aaaaaadeptzzzzzz
+aaaaaaqurʼanzzzzzz, aaaaaaqur'anzzzzzz, aaaaaaquranzzzzzz, aaaaaaqorʼanzzzzzz, aaaaaaqor'anzzzzzz, aaaaaaqoranzzzzzz => aaaaaakoranzzzzzz
+aaaaaavioloncellozzzzzz, aaaaaaviolincellozzzzzz => aaaaaacellozzzzzz
+
+# computer languages
+aaaaaac++zzzzzz => aaaaaacpluspluszzzzzz
+aaaaaaj#zzzzzz, aaaaaaj♯zzzzzz => aaaaaajsssharpzzzzzz
+
+# musical keys
+# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (e.g. a-sharp)
+aaaaaaa#zzzzzz, aaaaaaa♯zzzzzz, aaaaaaa-sharpzzzzzz => aaaaaaa sharpzzzzzz
+aaaaaab#zzzzzz, aaaaaab♯zzzzzz, aaaaaab-sharpzzzzzz => aaaaaab sharpzzzzzz
+aaaaaac#zzzzzz, aaaaaac♯zzzzzz, aaaaaac-sharpzzzzzz => aaaaaac sharpzzzzzz
+aaaaaad#zzzzzz, aaaaaad♯zzzzzz, aaaaaad-sharpzzzzzz => aaaaaad sharpzzzzzz
+aaaaaae#zzzzzz, aaaaaae♯zzzzzz, aaaaaae-sharpzzzzzz => aaaaaae sharpzzzzzz
+aaaaaaf#zzzzzz, aaaaaaf♯zzzzzz, aaaaaaf-sharpzzzzzz => aaaaaaf sharpzzzzzz
+aaaaaag#zzzzzz, aaaaaag♯zzzzzz, aaaaaag-sharpzzzzzz => aaaaaag sharpzzzzzz
+# We map from lowercase b, musical flat (♭), and the hyphenated spelling (e.g. a-flat)
+aaaaaaabzzzzzz, aaaaaaa♭zzzzzz, aaaaaaa-flatzzzzzz => aaaaaaa flatzzzzzz
+aaaaaabbzzzzzz, aaaaaab♭zzzzzz, aaaaaab-flatzzzzzz => aaaaaab flatzzzzzz
+aaaaaacbzzzzzz, aaaaaac♭zzzzzz, aaaaaac-flatzzzzzz => aaaaaac flatzzzzzz
+aaaaaadbzzzzzz, aaaaaad♭zzzzzz, aaaaaad-flatzzzzzz => aaaaaad flatzzzzzz
+aaaaaaebzzzzzz, aaaaaae♭zzzzzz, aaaaaae-flatzzzzzz => aaaaaae flatzzzzzz
+aaaaaafbzzzzzz, aaaaaaf♭zzzzzz, aaaaaaf-flatzzzzzz => aaaaaaf flatzzzzzz
+aaaaaagbzzzzzz, aaaaaag♭zzzzzz, aaaaaag-flatzzzzzz => aaaaaag flatzzzzzz
diff --git a/exhibits_prod/synonyms_left_anchor.txt b/exhibits_prod/synonyms_left_anchor.txt
@@ -0,0 +1,45 @@
+# Include in analysis with left anchor of 'aaaaaa'
+#  for a query or field beginning with a token meant to be a synonym
+#
+# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
+# A synonym file for Solr SynonymFilterFactory.
+# Needs to be included at both index and query time 
+#  AFTER the case folding
+#  BEFORE the WordDelimiterFilterFactory that removes punctuation
+# e.g. 
+# <analyzer>
+#   <!-- put beginning anchor on field value, assume first non-whitespace char is unicode letter or number or symbol -->
+#   <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^\s*([\p{L}\p{N}\p{S}]{1})" replacement="aaaaaa$1"/>
+#   <tokenizer class="solr.WhitespaceTokenizerFactory" />
+#   <filter class="solr.ICUFoldingFilterFactory"/>  <!-- NFKC, case folding, diacritics removed -->
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms_left_anchor.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.WordDelimiterFilterFactory" ...
+#   ...
+#
+
+aaaaaadepartment => aaaaaadept
+aaaaaaqurʼan, aaaaaaqur'an, aaaaaaquran, aaaaaaqorʼan, aaaaaaqor'an, aaaaaaqoran => aaaaaakoran
+aaaaaavioloncello, aaaaaaviolincello => aaaaaacello
+
+# computer languages
+aaaaaac++ => aaaaaacplusplus
+aaaaaaj#, aaaaaaj♯ => aaaaaajsssharp
+
+# musical keys
+# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (e.g. a-sharp)
+aaaaaaa#, aaaaaaa♯, aaaaaaa-sharp => aaaaaaa sharp
+aaaaaab#, aaaaaab♯, aaaaaab-sharp => aaaaaab sharp
+aaaaaac#, aaaaaac♯, aaaaaac-sharp => aaaaaac sharp
+aaaaaad#, aaaaaad♯, aaaaaad-sharp => aaaaaad sharp
+aaaaaae#, aaaaaae♯, aaaaaae-sharp => aaaaaae sharp
+aaaaaaf#, aaaaaaf♯, aaaaaaf-sharp => aaaaaaf sharp
+aaaaaag#, aaaaaag♯, aaaaaag-sharp => aaaaaag sharp
+# We map from lowercase b, musical flat (♭), and the hyphenated spelling (e.g. a-flat)
+aaaaaaab, aaaaaaa♭, aaaaaaa-flat => aaaaaaa flat
+aaaaaabb, aaaaaab♭, aaaaaab-flat => aaaaaab flat
+aaaaaacb, aaaaaac♭, aaaaaac-flat => aaaaaac flat
+aaaaaadb, aaaaaad♭, aaaaaad-flat => aaaaaad flat
+aaaaaaeb, aaaaaae♭, aaaaaae-flat => aaaaaae flat
+aaaaaafb, aaaaaaf♭, aaaaaaf-flat => aaaaaaf flat
+aaaaaagb, aaaaaag♭, aaaaaag-flat => aaaaaag flat
diff --git a/exhibits_prod/synonyms_right_anchor.txt b/exhibits_prod/synonyms_right_anchor.txt
@@ -0,0 +1,47 @@
+# Include in analysis with right anchor of 'zzzzzz'
+#  for a query or field ending with a token meant to be a synonym
+#
+# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
+# A synonym file for Solr SynonymFilterFactory.
+# Needs to be included at both index and query time 
+#  AFTER the case folding
+#  BEFORE the WordDelimiterFilterFactory that removes punctuation
+# e.g. 
+# <analyzer>
+#   <!-- put beginning and ending anchors on field value, removing trailing chars -->
+#   <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^\s*(.*[^\s\.\:\;\/\[\]])[\s\.\:\;\/\[\]]*$" replacement="aaaaaa$1zzzzzz"/>
+#   <tokenizer class="solr.WhitespaceTokenizerFactory" />
+#   <filter class="solr.ICUFoldingFilterFactory"/>  <!-- NFKC, case folding, diacritics removed -->
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms_both_anchors.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms_left_anchor.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.SynonymFilterFactory" synonyms="synonyms_right_anchor.txt" ignoreCase="true" expand="false"/>
+#   <filter class="solr.WordDelimiterFilterFactory" ...
+#   ...
+#
+
+departmentzzzzzz => deptzzzzzz
+qurʼanzzzzzz, qur'anzzzzzz, quranzzzzzz, qorʼanzzzzzz, qor'anzzzzzz, qoranzzzzzz => koranzzzzzz
+violoncellozzzzzz, violincellozzzzzz => cellozzzzzz
+
+# computer languages
+c++zzzzzz => cpluspluszzzzzz
+j#zzzzzz, j♯zzzzzz => jsssharpzzzzzz
+
+# musical keys
+# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (e.g. a-sharp)
+a#zzzzzz, a♯zzzzzz, a-sharpzzzzzz => a sharpzzzzzz
+b#zzzzzz, b♯zzzzzz, b-sharpzzzzzz => b sharpzzzzzz
+c#zzzzzz, c♯zzzzzz, c-sharpzzzzzz => c sharpzzzzzz
+d#zzzzzz, d♯zzzzzz, d-sharpzzzzzz => d sharpzzzzzz
+e#zzzzzz, e♯zzzzzz, e-sharpzzzzzz => e sharpzzzzzz
+f#zzzzzz, f♯zzzzzz, f-sharpzzzzzz => f sharpzzzzzz
+g#zzzzzz, g♯zzzzzz, g-sharpzzzzzz => g sharpzzzzzz
+# We map from lowercase b, musical flat (♭), and the hyphenated spelling (e.g. a-flat)
+abzzzzzz, a♭zzzzzz, a-flatzzzzzz => a flatzzzzzz
+bbzzzzzz, b♭zzzzzz, b-flatzzzzzz => b flatzzzzzz
+cbzzzzzz, c♭zzzzzz, c-flatzzzzzz => c flatzzzzzz
+dbzzzzzz, d♭zzzzzz, d-flatzzzzzz => d flatzzzzzz
+ebzzzzzz, e♭zzzzzz, e-flatzzzzzz => e flatzzzzzz
+fbzzzzzz, f♭zzzzzz, f-flatzzzzzz => f flatzzzzzz
+gbzzzzzz, g♭zzzzzz, g-flatzzzzzz => g flatzzzzzz