Skip to content

Commit

Permalink
Merge pull request #326 from sul-dlss/exhibits-prod-alt
Browse files Browse the repository at this point in the history
Add new exhibits schema with a string-type id field
  • Loading branch information
jcoyne authored Jun 12, 2024
2 parents 15ab0ee + f2392ee commit d0d0231
Show file tree
Hide file tree
Showing 8 changed files with 1,842 additions and 0 deletions.
3 changes: 3 additions & 0 deletions exhibits_prod/_rest_managed.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"initArgs":{},
"managedList":[]}
625 changes: 625 additions & 0 deletions exhibits_prod/schema.xml

Large diffs are not rendered by default.

980 changes: 980 additions & 0 deletions exhibits_prod/solrconfig.xml

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions exhibits_prod/stopwords_punctuation.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Punctuation characters we want to ignore as query terms (i.e., when they are
# surrounded by whitespace in a query, like 'fred : the puppy')
# ONLY FOR SINGLE TOKEN ANALYZED FIELDS
# see https://issues.apache.org/jira/browse/SOLR-3085
# Note that hyphens, plusses, and double hyphens are not treated as terms
# per debugQuery
:
;
&
/
=
>
<
,
.
(
)
»
§
·
73 changes: 73 additions & 0 deletions exhibits_prod/synonyms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
# A synonym file for Solr SynonymFilterFactory.
# Needs to be included at both index and query time
# AFTER the case folding
# BEFORE the WordDelimiterFilterFactory that removes punctuation
# e.g.
# <analyzer>
# <tokenizer class="solr.WhitespaceTokenizerFactory" />
# <filter class="solr.ICUFoldingFilterFactory"/> <!-- NFKC, case folding, diacritics removed -->
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.WordDelimiterFilterFactory" ...
# ...
#
# !!!
# !!! IMPORTANT: see also synonyms_both_anchors.txt, synonyms_left_anchor.txt and synonyms_right_anchor.txt
# !!!

# Explicit mappings match any token sequence on the LHS of "=>"
# and replace with all alternatives on the RHS. These types of mappings
# ignore the expand parameter in the schema.
# Equivalent synonyms may be separated with commas and give
# no explicit mapping. In this case the mapping behavior will
# be taken from the expand parameter in the schema.
# If expand==true, "ipod, i-pod, i pod" is equivalent to the explicit mapping:
# ipod, i-pod, i pod => ipod, i-pod, i pod
# If expand==false, "ipod, i-pod, i pod" is equivalent to the explicit mapping:
# ipod, i-pod, i pod => ipod
# set expand to true for index time and false for query time

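# See SW-845
# "Dept." will change to "Department"
# "Koran" will change to "Qur'an"
# "violoncello" will change to "cello"
# "O.T." and "N.T." will change to "Old Testament" and "New Testament"
# NOTE: the lines above presumably describe changes in the source data (confirm
#   against SW-845), not the direction of the mappings below, which map
#   "department" TO "dept" and the Qur'an spellings TO "koran".
# Note that mapping TO the abbreviation improves recall but reduces precision:
#   O.T. can mean Old Testament or overtime; dept could be a word in some
#   language.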
department => dept
qurʼan, qur'an, quran, qorʼan, qor'an, qoran => koran
violoncello, violincello => cello
# multi-token synonyms, and synonyms with punctuation, can be problematic
#old testament => o.t.
#new testament => n.t.

# The below is inspired by Jonathan Rochkind at Johns Hopkins University, 2013-04-15

# Terms that include punctuation, which we want to protect (whitelist) and make searchable.
# We do this by mapping them to unique tokens that do not include punctuation

# computer languages
# these are explicit mappings so when WDF drops the non-letter chars, c++ is not equivalent to c
c++ => cplusplus
j#, j♯ => jsssharp
# c# and f# are music keys as well as computer languages

# musical keys
# these are explicit mappings so when WDF drops the non-letter chars, c# is not equivalent to c
# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (-sharp)
a#, a♯, a-sharp => a sharp
b#, b♯, b-sharp => b sharp
c#, c♯, c-sharp => c sharp
d#, d♯, d-sharp => d sharp
e#, e♯, e-sharp => e sharp
f#, f♯, f-sharp => f sharp
g#, g♯, g-sharp => g sharp
# We map from lowercase b, musical flat (♭), and the hyphenated spelling (-flat)
ab, a♭, a-flat => a flat
bb, b♭, b-flat => b flat
cb, c♭, c-flat => c flat
db, d♭, d-flat => d flat
eb, e♭, e-flat => e flat
fb, f♭, f-flat => f flat
gb, g♭, g-flat => g flat
47 changes: 47 additions & 0 deletions exhibits_prod/synonyms_both_anchors.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Include in analysis with both left anchor of 'aaaaaa' and right anchor of 'zzzzzz'
# for a query or field consisting solely of a token meant to be a synonym
#
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
# A synonym file for Solr SynonymFilterFactory.
# Needs to be included at both index and query time
# AFTER the case folding
# BEFORE the WordDelimiterFilterFactory that removes punctuation
# e.g.
# <analyzer>
# <!-- put beginning and ending anchors on field value, removing trailing chars -->
# <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^\s*(.*[^\s\.\:\;\/\[\]])[\s\.\:\;\/\[\]]*$" replacement="aaaaaa$1zzzzzz"/>
# <tokenizer class="solr.WhitespaceTokenizerFactory" />
# <filter class="solr.ICUFoldingFilterFactory"/> <!-- NFKC, case folding, diacritics removed -->
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms_both_anchors.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms_left_anchor.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms_right_anchor.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.WordDelimiterFilterFactory" ...
# ...
#

aaaaaadepartmentzzzzzz => aaaaaadeptzzzzzz
aaaaaaqurʼanzzzzzz, aaaaaaqur'anzzzzzz, aaaaaaquranzzzzzz, aaaaaaqorʼanzzzzzz, aaaaaaqor'anzzzzzz, aaaaaaqoranzzzzzz => aaaaaakoranzzzzzz
aaaaaavioloncellozzzzzz, aaaaaaviolincellozzzzzz => aaaaaacellozzzzzz

# computer languages
aaaaaac++zzzzzz => aaaaaacpluspluszzzzzz
aaaaaaj#zzzzzz, aaaaaaj♯zzzzzz => aaaaaajsssharpzzzzzz

# musical keys
# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (-sharp)
aaaaaaa#zzzzzz, aaaaaaa♯zzzzzz, aaaaaaa-sharpzzzzzz => aaaaaaa sharpzzzzzz
aaaaaab#zzzzzz, aaaaaab♯zzzzzz, aaaaaab-sharpzzzzzz => aaaaaab sharpzzzzzz
aaaaaac#zzzzzz, aaaaaac♯zzzzzz, aaaaaac-sharpzzzzzz => aaaaaac sharpzzzzzz
aaaaaad#zzzzzz, aaaaaad♯zzzzzz, aaaaaad-sharpzzzzzz => aaaaaad sharpzzzzzz
aaaaaae#zzzzzz, aaaaaae♯zzzzzz, aaaaaae-sharpzzzzzz => aaaaaae sharpzzzzzz
aaaaaaf#zzzzzz, aaaaaaf♯zzzzzz, aaaaaaf-sharpzzzzzz => aaaaaaf sharpzzzzzz
aaaaaag#zzzzzz, aaaaaag♯zzzzzz, aaaaaag-sharpzzzzzz => aaaaaag sharpzzzzzz
# We map from lowercase b, musical flat (♭), and the hyphenated spelling (-flat)
aaaaaaabzzzzzz, aaaaaaa♭zzzzzz, aaaaaaa-flatzzzzzz => aaaaaaa flatzzzzzz
aaaaaabbzzzzzz, aaaaaab♭zzzzzz, aaaaaab-flatzzzzzz => aaaaaab flatzzzzzz
aaaaaacbzzzzzz, aaaaaac♭zzzzzz, aaaaaac-flatzzzzzz => aaaaaac flatzzzzzz
aaaaaadbzzzzzz, aaaaaad♭zzzzzz, aaaaaad-flatzzzzzz => aaaaaad flatzzzzzz
aaaaaaebzzzzzz, aaaaaae♭zzzzzz, aaaaaae-flatzzzzzz => aaaaaae flatzzzzzz
aaaaaafbzzzzzz, aaaaaaf♭zzzzzz, aaaaaaf-flatzzzzzz => aaaaaaf flatzzzzzz
aaaaaagbzzzzzz, aaaaaag♭zzzzzz, aaaaaag-flatzzzzzz => aaaaaag flatzzzzzz
45 changes: 45 additions & 0 deletions exhibits_prod/synonyms_left_anchor.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Include in analysis with left anchor of 'aaaaaa'
# for a query or field beginning with a token meant to be a synonym
#
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
# A synonym file for Solr SynonymFilterFactory.
# Needs to be included at both index and query time
# AFTER the case folding
# BEFORE the WordDelimiterFilterFactory that removes punctuation
# e.g.
# <analyzer>
# <!-- put beginning anchor on field value, assume first non-whitespace char is unicode letter or number or symbol -->
# <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^\s*([\p{L}\p{N}\p{S}]{1})" replacement="aaaaaa$1"/>
# <tokenizer class="solr.WhitespaceTokenizerFactory" />
# <filter class="solr.ICUFoldingFilterFactory"/> <!-- NFKC, case folding, diacritics removed -->
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms_left_anchor.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.WordDelimiterFilterFactory" ...
# ...
#

aaaaaadepartment => aaaaaadept
aaaaaaqurʼan, aaaaaaqur'an, aaaaaaquran, aaaaaaqorʼan, aaaaaaqor'an, aaaaaaqoran => aaaaaakoran
aaaaaavioloncello, aaaaaaviolincello => aaaaaacello

# computer languages
aaaaaac++ => aaaaaacplusplus
aaaaaaj#, aaaaaaj♯ => aaaaaajsssharp

# musical keys
# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (-sharp)
aaaaaaa#, aaaaaaa♯, aaaaaaa-sharp => aaaaaaa sharp
aaaaaab#, aaaaaab♯, aaaaaab-sharp => aaaaaab sharp
aaaaaac#, aaaaaac♯, aaaaaac-sharp => aaaaaac sharp
aaaaaad#, aaaaaad♯, aaaaaad-sharp => aaaaaad sharp
aaaaaae#, aaaaaae♯, aaaaaae-sharp => aaaaaae sharp
aaaaaaf#, aaaaaaf♯, aaaaaaf-sharp => aaaaaaf sharp
aaaaaag#, aaaaaag♯, aaaaaag-sharp => aaaaaag sharp
# We map from lowercase b, musical flat (♭), and the hyphenated spelling (-flat)
aaaaaaab, aaaaaaa♭, aaaaaaa-flat => aaaaaaa flat
aaaaaabb, aaaaaab♭, aaaaaab-flat => aaaaaab flat
aaaaaacb, aaaaaac♭, aaaaaac-flat => aaaaaac flat
aaaaaadb, aaaaaad♭, aaaaaad-flat => aaaaaad flat
aaaaaaeb, aaaaaae♭, aaaaaae-flat => aaaaaae flat
aaaaaafb, aaaaaaf♭, aaaaaaf-flat => aaaaaaf flat
aaaaaagb, aaaaaag♭, aaaaaag-flat => aaaaaag flat
47 changes: 47 additions & 0 deletions exhibits_prod/synonyms_right_anchor.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Include in analysis with right anchor of 'zzzzzz'
# for a query or field ending with a token meant to be a synonym
#
# http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
# A synonym file for Solr SynonymFilterFactory.
# Needs to be included at both index and query time
# AFTER the case folding
# BEFORE the WordDelimiterFilterFactory that removes punctuation
# e.g.
# <analyzer>
# <!-- put beginning and ending anchors on field value, removing trailing chars -->
# <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^\s*(.*[^\s\.\:\;\/\[\]])[\s\.\:\;\/\[\]]*$" replacement="aaaaaa$1zzzzzz"/>
# <tokenizer class="solr.WhitespaceTokenizerFactory" />
# <filter class="solr.ICUFoldingFilterFactory"/> <!-- NFKC, case folding, diacritics removed -->
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms_both_anchors.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms_left_anchor.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.SynonymFilterFactory" synonyms="synonyms_right_anchor.txt" ignoreCase="true" expand="false"/>
# <filter class="solr.WordDelimiterFilterFactory" ...
# ...
#

departmentzzzzzz => deptzzzzzz
qurʼanzzzzzz, qur'anzzzzzz, quranzzzzzz, qorʼanzzzzzz, qor'anzzzzzz, qoranzzzzzz => koranzzzzzz
violoncellozzzzzz, violincellozzzzzz => cellozzzzzz

# computer languages
c++zzzzzz => cpluspluszzzzzz
j#zzzzzz, j♯zzzzzz => jsssharpzzzzzz

# musical keys
# We map from number-sign (#), musical sharp (♯), and the hyphenated spelling (-sharp)
a#zzzzzz, a♯zzzzzz, a-sharpzzzzzz => a sharpzzzzzz
b#zzzzzz, b♯zzzzzz, b-sharpzzzzzz => b sharpzzzzzz
c#zzzzzz, c♯zzzzzz, c-sharpzzzzzz => c sharpzzzzzz
d#zzzzzz, d♯zzzzzz, d-sharpzzzzzz => d sharpzzzzzz
e#zzzzzz, e♯zzzzzz, e-sharpzzzzzz => e sharpzzzzzz
f#zzzzzz, f♯zzzzzz, f-sharpzzzzzz => f sharpzzzzzz
g#zzzzzz, g♯zzzzzz, g-sharpzzzzzz => g sharpzzzzzz
# We map from lowercase b, musical flat (♭), and the hyphenated spelling (-flat)
abzzzzzz, a♭zzzzzz, a-flatzzzzzz => a flatzzzzzz
bbzzzzzz, b♭zzzzzz, b-flatzzzzzz => b flatzzzzzz
cbzzzzzz, c♭zzzzzz, c-flatzzzzzz => c flatzzzzzz
dbzzzzzz, d♭zzzzzz, d-flatzzzzzz => d flatzzzzzz
ebzzzzzz, e♭zzzzzz, e-flatzzzzzz => e flatzzzzzz
fbzzzzzz, f♭zzzzzz, f-flatzzzzzz => f flatzzzzzz
gbzzzzzz, g♭zzzzzz, g-flatzzzzzz => g flatzzzzzz

0 comments on commit d0d0231

Please sign in to comment.