From 8a85c3f5da42edc5259a3cdb6b116a12046eeced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Niedzielski?= Date: Tue, 8 Jun 2021 10:13:27 +0200 Subject: [PATCH 1/3] IBX-519: Introduced Docker image for Solr standalone --- docker/solr/Dockerfile | 12 + docker/solr/conf/custom-fields-types.xml | 19 + docker/solr/conf/language-fieldtypes.xml | 13 + docker/solr/conf/managed-schema | 141 ++++++ .../solr.languages/ar/language-fieldtypes.xml | 14 + .../conf/solr.languages/ar/stopwords_ar.txt | 125 ++++++ .../cjk/language-fieldtypes.xml | 12 + .../solr.languages/de/language-fieldtypes.xml | 13 + .../conf/solr.languages/de/stopwords_de.txt | 294 ++++++++++++ .../solr.languages/en/language-fieldtypes.xml | 38 ++ .../conf/solr.languages/en/stopwords_en.txt | 54 +++ .../solr.languages/es/language-fieldtypes.xml | 12 + .../conf/solr.languages/es/stopwords_es.txt | 356 +++++++++++++++ .../solr.languages/fi/language-fieldtypes.xml | 12 + .../conf/solr.languages/fi/stopwords_fi.txt | 97 ++++ .../solr.languages/fr/contractions_fr.txt | 15 + .../solr.languages/fr/language-fieldtypes.xml | 15 + .../conf/solr.languages/fr/stopwords_fr.txt | 186 ++++++++ .../solr.languages/hr/language-fieldtypes.xml | 10 + .../conf/solr.languages/hr/stopwords_hr.txt | 1 + .../solr.languages/it/contractions_it.txt | 23 + .../solr.languages/it/language-fieldtypes.xml | 14 + .../conf/solr.languages/it/stopwords_it.txt | 303 +++++++++++++ .../solr.languages/ja/language-fieldtypes.xml | 48 ++ .../conf/solr.languages/ja/stoptags_ja.txt | 420 ++++++++++++++++++ .../conf/solr.languages/ja/stopwords_ja.txt | 127 ++++++ .../solr.languages/nl/language-fieldtypes.xml | 12 + .../conf/solr.languages/nl/stemdict_nl.txt | 6 + .../conf/solr.languages/nl/stopwords_nl.txt | 119 +++++ .../solr.languages/no/language-fieldtypes.xml | 14 + .../conf/solr.languages/no/stopwords_no.txt | 194 ++++++++ .../solr.languages/pt/language-fieldtypes.xml | 14 + .../conf/solr.languages/pt/stopwords_pt.txt | 253 +++++++++++ .../solr.languages/ru/language-fieldtypes.xml | 12 + .../conf/solr.languages/ru/stopwords_ru.txt | 243 ++++++++++ 35 files changed, 3241 insertions(+) create mode 100644 docker/solr/Dockerfile create mode 100644 docker/solr/conf/custom-fields-types.xml create mode 100644 docker/solr/conf/language-fieldtypes.xml create mode 100644 docker/solr/conf/managed-schema create mode 100644 docker/solr/conf/solr.languages/ar/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/ar/stopwords_ar.txt create mode 100644 docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/de/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/de/stopwords_de.txt create mode 100644 docker/solr/conf/solr.languages/en/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/en/stopwords_en.txt create mode 100644 docker/solr/conf/solr.languages/es/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/es/stopwords_es.txt create mode 100644 docker/solr/conf/solr.languages/fi/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/fi/stopwords_fi.txt create mode 100644 docker/solr/conf/solr.languages/fr/contractions_fr.txt create mode 100644 docker/solr/conf/solr.languages/fr/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/fr/stopwords_fr.txt create mode 100644 docker/solr/conf/solr.languages/hr/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/hr/stopwords_hr.txt create mode 100644 docker/solr/conf/solr.languages/it/contractions_it.txt create mode 100644 docker/solr/conf/solr.languages/it/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/it/stopwords_it.txt create mode 100644 docker/solr/conf/solr.languages/ja/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/ja/stoptags_ja.txt create mode 100644 docker/solr/conf/solr.languages/ja/stopwords_ja.txt create mode 100644 docker/solr/conf/solr.languages/nl/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/nl/stemdict_nl.txt create mode 100644 docker/solr/conf/solr.languages/nl/stopwords_nl.txt create mode 100644 docker/solr/conf/solr.languages/no/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/no/stopwords_no.txt create mode 100644 docker/solr/conf/solr.languages/pt/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/pt/stopwords_pt.txt create mode 100644 docker/solr/conf/solr.languages/ru/language-fieldtypes.xml create mode 100644 docker/solr/conf/solr.languages/ru/stopwords_ru.txt diff --git a/docker/solr/Dockerfile b/docker/solr/Dockerfile new file mode 100644 index 000000000..36c24c3f6 --- /dev/null +++ b/docker/solr/Dockerfile @@ -0,0 +1,12 @@ +FROM solr:8.6.3 + +LABEL org.opencontainers.image.source=https://github.com/ezsystems/ezplatform-solr-search-engine + +USER root +COPY conf server/solr/configsets/_default/conf +RUN sed --in-place '//d' server/solr/configsets/_default/conf/solrconfig.xml +RUN sed --in-place 's/${solr.autoSoftCommit.maxTime:-1}/${solr.autoSoftCommit.maxTime:20}/' server/solr/configsets/_default/conf/solrconfig.xml + +USER solr + +CMD ["solr-precreate", "collection1"] diff --git a/docker/solr/conf/custom-fields-types.xml b/docker/solr/conf/custom-fields-types.xml new file mode 100644 index 000000000..92318ba62 --- /dev/null +++ b/docker/solr/conf/custom-fields-types.xml @@ -0,0 +1,19 @@ + + + + + + + + + diff --git a/docker/solr/conf/language-fieldtypes.xml b/docker/solr/conf/language-fieldtypes.xml new file mode 100644 index 000000000..f578645bf --- /dev/null +++ b/docker/solr/conf/language-fieldtypes.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/docker/solr/conf/managed-schema b/docker/solr/conf/managed-schema new file mode 100644 index 000000000..361630025 --- /dev/null +++ b/docker/solr/conf/managed-schema @@ -0,0 +1,141 @@ + + + +]> + + + + + &langfields; + + + &customfields; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + diff --git a/docker/solr/conf/solr.languages/ar/language-fieldtypes.xml b/docker/solr/conf/solr.languages/ar/language-fieldtypes.xml new file mode 100644 index 000000000..007c154ac --- /dev/null +++ b/docker/solr/conf/solr.languages/ar/language-fieldtypes.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/ar/stopwords_ar.txt b/docker/solr/conf/solr.languages/ar/stopwords_ar.txt new file mode 100644 index 000000000..046829db6 --- /dev/null +++ b/docker/solr/conf/solr.languages/ar/stopwords_ar.txt @@ -0,0 +1,125 @@ +# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. +# Also see http://www.opensource.org/licenses/bsd-license.html +# Cleaned on October 11, 2009 (not normalized, so use before normalization) +# This means that when modifying this list, you might need to add some +# redundant entries, for example containing forms with both أ and ا +من +ومن +منها +منه +في +وفي +فيها +فيه +و +ف +ثم +او +أو +ب +بها +به +ا +أ +اى +اي +أي +أى +لا +ولا +الا +ألا +إلا +لكن +ما +وما +كما +فما +عن +مع +اذا +إذا +ان +أن +إن +انها +أنها +إنها +انه +أنه +إنه +بان +بأن +فان +فأن +وان +وأن +وإن +التى +التي +الذى +الذي +الذين +الى +الي +إلى +إلي +على +عليها +عليه +اما +أما +إما +ايضا +أيضا +كل +وكل +لم +ولم +لن +ولن +هى +هي +هو +وهى +وهي +وهو +فهى +فهي +فهو +انت +أنت +لك +لها +له +هذه +هذا +تلك +ذلك +هناك +كانت +كان +يكون +تكون +وكانت +وكان +غير +بعض +قد +نحو +بين +بينما +منذ +ضمن +حيث +الان +الآن +خلال +بعد +قبل +حتى +عند +عندما +لدى +جميع diff --git a/docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml b/docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml new file mode 100644 index 000000000..0c183aef0 --- /dev/null +++ b/docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/de/language-fieldtypes.xml b/docker/solr/conf/solr.languages/de/language-fieldtypes.xml new file mode 100644 index 000000000..e165b0899 --- /dev/null +++ b/docker/solr/conf/solr.languages/de/language-fieldtypes.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/de/stopwords_de.txt b/docker/solr/conf/solr.languages/de/stopwords_de.txt new file mode 100644 index 000000000..86525e7ae --- /dev/null +++ b/docker/solr/conf/solr.languages/de/stopwords_de.txt @@ -0,0 +1,294 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A German stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | The number of forms in this list is reduced significantly by passing it + | through the German stemmer. + + +aber | but + +alle | all +allem +allen +aller +alles + +als | than, as +also | so +am | an + dem +an | at + +ander | other +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders + +auch | also +auf | on +aus | out of +bei | by +bin | am +bis | until +bist | art +da | there +damit | with it +dann | then + +der | the +den +des +dem +die +das + +daß | that + +derselbe | the same +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe + +dazu | to that + +dein | thy +deine +deinem +deinen +deiner +deines + +denn | because + +derer | of those +dessen | of him + +dich | thee +dir | to thee +du | thou + +dies | this +diese +diesem +diesen +dieser +dieses + + +doch | (several meanings) +dort | (over) there + + +durch | through + +ein | a +eine +einem +einen +einer +eines + +einig | some +einige +einigem +einigen +einiger +einiges + +einmal | once + +er | he +ihn | him +ihm | to him + +es | it +etwas | something + +euer | your +eure +eurem +euren +eurer +eures + +für | for +gegen | towards +gewesen | p.p. of sein +hab | have +habe | have +haben | have +hat | has +hatte | had +hatten | had +hier | here +hin | there +hinter | behind + +ich | I +mich | me +mir | to me + + +ihr | you, to her +ihre +ihrem +ihren +ihrer +ihres +euch | to you + +im | in + dem +in | in +indem | while +ins | in + das +ist | is + +jede | each, every +jedem +jeden +jeder +jedes + +jene | that +jenem +jenen +jener +jenes + +jetzt | now +kann | can + +kein | no +keine +keinem +keinen +keiner +keines + +können | can +könnte | could +machen | do +man | one + +manche | some, many a +manchem +manchen +mancher +manches + +mein | my +meine +meinem +meinen +meiner +meines + +mit | with +muss | must +musste | had to +nach | to(wards) +nicht | not +nichts | nothing +noch | still, yet +nun | now +nur | only +ob | whether +oder | or +ohne | without +sehr | very + +sein | his +seine +seinem +seinen +seiner +seines + +selbst | self +sich | herself + +sie | they, she +ihnen | to them + +sind | are +so | so + +solche | such +solchem +solchen +solcher +solches + +soll | shall +sollte | should +sondern | but +sonst | else +über | over +um | about, around +und | and + +uns | us +unse +unsem +unsen +unser +unses + +unter | under +viel | much +vom | von + dem +von | from +vor | before +während | while +war | was +waren | were +warst | wast +was | what +weg | away, off +weil | because +weiter | further + +welche | which +welchem +welchen +welcher +welches + +wenn | when +werde | will +werden | will +wie | how +wieder | again +will | want +wir | we +wird | will +wirst | willst +wo | where +wollen | want +wollte | wanted +würde | would +würden | would +zu | to +zum | zu + dem +zur | zu + der +zwar | indeed +zwischen | between + diff --git a/docker/solr/conf/solr.languages/en/language-fieldtypes.xml b/docker/solr/conf/solr.languages/en/language-fieldtypes.xml new file mode 100644 index 000000000..9bccba163 --- /dev/null +++ b/docker/solr/conf/solr.languages/en/language-fieldtypes.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/en/stopwords_en.txt b/docker/solr/conf/solr.languages/en/stopwords_en.txt new file mode 100644 index 000000000..2c164c0b2 --- /dev/null +++ b/docker/solr/conf/solr.languages/en/stopwords_en.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +# Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +such +that +the +their +then +there +these +they +this +to +was +will +with diff --git a/docker/solr/conf/solr.languages/es/language-fieldtypes.xml b/docker/solr/conf/solr.languages/es/language-fieldtypes.xml new file mode 100644 index 000000000..03c9d9805 --- /dev/null +++ b/docker/solr/conf/solr.languages/es/language-fieldtypes.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/es/stopwords_es.txt b/docker/solr/conf/solr.languages/es/stopwords_es.txt new file mode 100644 index 000000000..487d78c8d --- /dev/null +++ b/docker/solr/conf/solr.languages/es/stopwords_es.txt @@ -0,0 +1,356 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Spanish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. + +de | from, of +la | the, her +que | who, that +el | the +en | in +y | and +a | to +los | the, them +del | de + el +se | himself, from him etc +las | the, them +por | for, by, etc +un | a +para | for +con | with +no | no +una | a +su | his, her +al | a + el + | es from SER +lo | him +como | how +más | more +pero | pero +sus | su plural +le | to him, her +ya | already +o | or + | fue from SER +este | this + | ha from HABER +sí | himself etc +porque | because +esta | this + | son from SER +entre | between + | está from ESTAR +cuando | when +muy | very +sin | without +sobre | on + | ser from SER + | tiene from TENER +también | also +me | me +hasta | until +hay | there is/are +donde | where + | han from HABER +quien | whom, that + | están from ESTAR + | estado from ESTAR +desde | from +todo | all +nos | us +durante | during + | estados from ESTAR +todos | all +uno | a +les | to them +ni | nor +contra | against +otros | other + | fueron from SER +ese | that +eso | that + | había from HABER +ante | before +ellos | they +e | and (variant of y) +esto | this +mí | me +antes | before +algunos | some +qué | what? +unos | a +yo | I +otro | other +otras | other +otra | other +él | he +tanto | so much, many +esa | that +estos | these +mucho | much, many +quienes | who +nada | nothing +muchos | many +cual | who + | sea from SER +poco | few +ella | she +estar | to be + | haber from HABER +estas | these + | estaba from ESTAR + | estamos from ESTAR +algunas | some +algo | something +nosotros | we + + | other forms + +mi | me +mis | mi plural +tú | thou +te | thee +ti | thee +tu | thy +tus | tu plural +ellas | they +nosotras | we +vosotros | you +vosotras | you +os | you +mío | mine +mía | +míos | +mías | +tuyo | thine +tuya | +tuyos | +tuyas | +suyo | his, hers, theirs +suya | +suyos | +suyas | +nuestro | ours +nuestra | +nuestros | +nuestras | +vuestro | yours +vuestra | +vuestros | +vuestras | +esos | those +esas | those + + | forms of estar, to be (not including the infinitive): +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad + + | forms of haber, to have (not including the infinitive): +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas + + | forms of ser, to be (not including the infinitive): +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +siendo +sido + | sed also means 'thirst' + + | forms of tener, to have (not including the infinitive): +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened + diff --git a/docker/solr/conf/solr.languages/fi/language-fieldtypes.xml b/docker/solr/conf/solr.languages/fi/language-fieldtypes.xml new file mode 100644 index 000000000..f375503be --- /dev/null +++ b/docker/solr/conf/solr.languages/fi/language-fieldtypes.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/fi/stopwords_fi.txt b/docker/solr/conf/solr.languages/fi/stopwords_fi.txt new file mode 100644 index 000000000..4372c9a05 --- /dev/null +++ b/docker/solr/conf/solr.languages/fi/stopwords_fi.txt @@ -0,0 +1,97 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about +poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + diff --git a/docker/solr/conf/solr.languages/fr/contractions_fr.txt b/docker/solr/conf/solr.languages/fr/contractions_fr.txt new file mode 100644 index 000000000..f1bba51b2 --- /dev/null +++ b/docker/solr/conf/solr.languages/fr/contractions_fr.txt @@ -0,0 +1,15 @@ +# Set of French contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +l +m +t +qu +n +s +j +d +c +jusqu +quoiqu +lorsqu +puisqu diff --git a/docker/solr/conf/solr.languages/fr/language-fieldtypes.xml b/docker/solr/conf/solr.languages/fr/language-fieldtypes.xml new file mode 100644 index 000000000..4336e0544 --- /dev/null +++ b/docker/solr/conf/solr.languages/fr/language-fieldtypes.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/fr/stopwords_fr.txt b/docker/solr/conf/solr.languages/fr/stopwords_fr.txt new file mode 100644 index 000000000..749abae68 --- /dev/null +++ b/docker/solr/conf/solr.languages/fr/stopwords_fr.txt @@ -0,0 +1,186 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A French stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +au | a + le +aux | a + les +avec | with +ce | this +ces | these +dans | with +de | of +des | de + les +du | de + le +elle | she +en | `of them' etc +et | and +eux | them +il | he +je | I +la | the +le | the +leur | their +lui | him +ma | my (fem) +mais | but +me | me +même | same; as in moi-même (myself) etc +mes | me (pl) +moi | me +mon | my (masc) +ne | not +nos | our (pl) +notre | our +nous | we +on | one +ou | where +par | by +pas | not +pour | for +qu | que before vowel +que | that +qui | who +sa | his, her (fem) +se | oneself +ses | his (pl) +son | his, her (masc) +sur | on +ta | thy (fem) +te | thee +tes | thy (pl) +toi | thee +ton | thy (masc) +tu | thou +un | a +une | a +vos | your (pl) +votre | your +vous | you + + | single letter forms + +c | c' +d | d' +j | j' +l | l' +à | to, at +m | m' +n | n' +s | s' +t | t' +y | there + + | forms of être (not including the infinitive): +été +étée +étées +étés +étant +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent + + | forms of avoir (not including the infinitive): +ayant +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent + + | Later additions (from Jean-Christophe Deschamps) +ceci | this +cela | that +celà | that +cet | this +cette | this +ici | here +ils | they +les | the (pl) +leurs | their (pl) +quel | which +quels | which +quelle | which +quelles | which +sans | without +soi | oneself + diff --git a/docker/solr/conf/solr.languages/hr/language-fieldtypes.xml b/docker/solr/conf/solr.languages/hr/language-fieldtypes.xml new file mode 100644 index 000000000..784ae0a8d --- /dev/null +++ b/docker/solr/conf/solr.languages/hr/language-fieldtypes.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/hr/stopwords_hr.txt b/docker/solr/conf/solr.languages/hr/stopwords_hr.txt new file mode 100644 index 000000000..16fc66739 --- /dev/null +++ b/docker/solr/conf/solr.languages/hr/stopwords_hr.txt @@ -0,0 +1 @@ +# Stub, a list of Croation stopwords is not part of the Solr distribution \ No newline at end of file diff --git a/docker/solr/conf/solr.languages/it/contractions_it.txt b/docker/solr/conf/solr.languages/it/contractions_it.txt new file mode 100644 index 000000000..cac040953 --- /dev/null +++ b/docker/solr/conf/solr.languages/it/contractions_it.txt @@ -0,0 +1,23 @@ +# Set of Italian contractions for ElisionFilter +# TODO: load this as a resource from the analyzer and sync it in build.xml +c +l +all +dall +dell +nell +sull +coll +pell +gl +agl +dagl +degl +negl +sugl +un +m +t +s +v +d diff --git a/docker/solr/conf/solr.languages/it/language-fieldtypes.xml b/docker/solr/conf/solr.languages/it/language-fieldtypes.xml new file mode 100644 index 000000000..16b52cfa6 --- /dev/null +++ b/docker/solr/conf/solr.languages/it/language-fieldtypes.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/it/stopwords_it.txt b/docker/solr/conf/solr.languages/it/stopwords_it.txt new file mode 100644 index 000000000..1219cc773 --- /dev/null +++ b/docker/solr/conf/solr.languages/it/stopwords_it.txt @@ -0,0 +1,303 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | An Italian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + +ad | a (to) before vowel +al | a + il +allo | a + lo +ai | a + i +agli | a + gli +all | a + l' +agl | a + gl' +alla | a + la +alle | a + le +con | with +col | con + il +coi | con + i (forms collo, cogli etc are now very rare) +da | from +dal | da + il +dallo | da + lo +dai | da + i +dagli | da + gli +dall | da + l' +dagl | da + gll' +dalla | da + la +dalle | da + le +di | of +del | di + il +dello | di + lo +dei | di + i +degli | di + gli +dell | di + l' +degl | di + gl' +della | di + la +delle | di + le +in | in +nel | in + el +nello | in + lo +nei | in + i +negli | in + gli +nell | in + l' +negl | in + gl' +nella | in + la +nelle | in + le +su | on +sul | su + il +sullo | su + lo +sui | su + i +sugli | su + gli +sull | su + l' +sugl | su + gl' +sulla | su + la +sulle | su + le +per | through, by +tra | among +contro | against +io | I +tu | thou +lui | he +lei | she +noi | we +voi | you +loro | they +mio | my +mia | +miei | +mie | +tuo | +tua | +tuoi | thy +tue | +suo | +sua | +suoi | his, her +sue | +nostro | our +nostra | +nostri | +nostre | +vostro | your +vostra | +vostri | +vostre | +mi | me +ti | thee +ci | us, there +vi | you, there +lo | him, the +la | her, the +li | them +le | them, the +gli | to him, the +ne | from there etc +il | the +un | a +uno | a +una | a +ma | but +ed | and +se | if +perché | why, because +anche | also +come | how +dov | where (as dov') +dove | where +che | who, that +chi | who +cui | whom +non | not +più | more +quale | who, that +quanto | how much +quanti | +quanta | +quante | +quello | that +quelli | +quella | +quelle | +questo | this +questi | +questa | +queste | +si | yes +tutto | all +tutti | all + + | single letter forms: + +a | at +c | as c' for ce or ci +e | and +i | the +l | as l' +o | or + + | forms of avere, to have (not including the infinitive): + +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute + + | forms of essere, to be (not including the infinitive): +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo + + | forms of fare, to do (not including the infinitive, fa, fat-): +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo + + | forms of stare, to be (not including the infinitive): +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/docker/solr/conf/solr.languages/ja/language-fieldtypes.xml b/docker/solr/conf/solr.languages/ja/language-fieldtypes.xml new file mode 100644 index 000000000..4a744fa80 --- /dev/null +++ b/docker/solr/conf/solr.languages/ja/language-fieldtypes.xml @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/ja/stoptags_ja.txt b/docker/solr/conf/solr.languages/ja/stoptags_ja.txt new file mode 100644 index 000000000..71b750845 --- /dev/null +++ b/docker/solr/conf/solr.languages/ja/stoptags_ja.txt @@ -0,0 +1,420 @@ +# +# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. +# +# Any token with a part-of-speech tag that exactly matches those defined in this +# file are removed from the token stream. +# +# Set your own stoptags by uncommenting the lines below. Note that comments are +# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, +# etc. that can be useful for building you own stoptag set. +# +# The entire possible tagset is provided below for convenience. +# +##### +# noun: unclassified nouns +#名詞 +# +# noun-common: Common nouns or nouns where the sub-classification is undefined +#名詞-一般 +# +# noun-proper: Proper nouns where the sub-classification is undefined +#名詞-固有名詞 +# +# noun-proper-misc: miscellaneous proper nouns +#名詞-固有名詞-一般 +# +# noun-proper-person: Personal names where the sub-classification is undefined +#名詞-固有名詞-人名 +# +# noun-proper-person-misc: names that cannot be divided into surname and +# given name; foreign names; names where the surname or given name is unknown. +# e.g. お市の方 +#名詞-固有名詞-人名-一般 +# +# noun-proper-person-surname: Mainly Japanese surnames. +# e.g. 山田 +#名詞-固有名詞-人名-姓 +# +# noun-proper-person-given_name: Mainly Japanese given names. +# e.g. 太郎 +#名詞-固有名詞-人名-名 +# +# noun-proper-organization: Names representing organizations. +# e.g. 通産省, NHK +#名詞-固有名詞-組織 +# +# noun-proper-place: Place names where the sub-classification is undefined +#名詞-固有名詞-地域 +# +# noun-proper-place-misc: Place names excluding countries. +# e.g. アジア, バルセロナ, 京都 +#名詞-固有名詞-地域-一般 +# +# noun-proper-place-country: Country names. +# e.g. 日本, オーストラリア +#名詞-固有名詞-地域-国 +# +# noun-pronoun: Pronouns where the sub-classification is undefined +#名詞-代名詞 +# +# noun-pronoun-misc: miscellaneous pronouns: +# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ +#名詞-代名詞-一般 +# +# noun-pronoun-contraction: Spoken language contraction made by combining a +# pronoun and the particle 'wa'. +# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ +#名詞-代名詞-縮約 +# +# noun-adverbial: Temporal nouns such as names of days or months that behave +# like adverbs. Nouns that represent amount or ratios and can be used adverbially, +# e.g. 金曜, 一月, 午後, 少量 +#名詞-副詞可能 +# +# noun-verbal: Nouns that take arguments with case and can appear followed by +# 'suru' and related verbs (する, できる, なさる, くださる) +# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り +#名詞-サ変接続 +# +# noun-adjective-base: The base form of adjectives, words that appear before な ("na") +# e.g. 健康, 安易, 駄目, だめ +#名詞-形容動詞語幹 +# +# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. +# e.g. 0, 1, 2, 何, 数, 幾 +#名詞-数 +# +# noun-affix: noun affixes where the sub-classification is undefined +#名詞-非自立 +# +# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that +# attach to the base form of inflectional words, words that cannot be classified +# into any of the other categories below. This category includes indefinite nouns. +# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, +# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, +# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, +# わり, 割り, 割, ん-口語/, もん-口語/ +#名詞-非自立-一般 +# +# noun-affix-adverbial: noun affixes that that can behave as adverbs. +# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, +# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, +# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, +# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, +# 儘, 侭, みぎり, 矢先 +#名詞-非自立-副詞可能 +# +# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars +# with the stem よう(だ) ("you(da)"). +# e.g. よう, やう, 様 (よう) +#名詞-非自立-助動詞語幹 +# +# noun-affix-adjective-base: noun affixes that can connect to the indeclinable +# connection form な (aux "da"). +# e.g. みたい, ふう +#名詞-非自立-形容動詞語幹 +# +# noun-special: special nouns where the sub-classification is undefined. +#名詞-特殊 +# +# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is +# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base +# form of inflectional words. +# e.g. そう +#名詞-特殊-助動詞語幹 +# +# noun-suffix: noun suffixes where the sub-classification is undefined. +#名詞-接尾 +# +# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect +# to ガル or タイ and can combine into compound nouns, words that cannot be classified into +# any of the other categories below. In general, this category is more inclusive than +# 接尾語 ("suffix") and is usually the last element in a compound noun. +# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, +# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 +#名詞-接尾-一般 +# +# noun-suffix-person: Suffixes that form nouns and attach to person names more often +# than other nouns. +# e.g. 君, 様, 著 +#名詞-接尾-人名 +# +# noun-suffix-place: Suffixes that form nouns and attach to place names more often +# than other nouns. +# e.g. 町, 市, 県 +#名詞-接尾-地域 +# +# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that +# can appear before スル ("suru"). +# e.g. 化, 視, 分け, 入り, 落ち, 買い +#名詞-接尾-サ変接続 +# +# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, +# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the +# conjunctive form of inflectional words. +# e.g. そう +#名詞-接尾-助動詞語幹 +# +# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive +# form of inflectional words and appear before the copula だ ("da"). +# e.g. 的, げ, がち +#名詞-接尾-形容動詞語幹 +# +# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. +# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) +#名詞-接尾-副詞可能 +# +# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category +# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach +# to numbers. +# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 +#名詞-接尾-助数詞 +# +# noun-suffix-special: Special suffixes that mainly attach to inflecting words. +# e.g. (楽し) さ, (考え) 方 +#名詞-接尾-特殊 +# +# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words +# together. +# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) +#名詞-接続詞的 +# +# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are +# semantically verb-like. +# e.g. ごらん, ご覧, 御覧, 頂戴 +#名詞-動詞非自立的 +# +# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, +# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") +# is いわく ("iwaku"). +#名詞-引用文字列 +# +# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and +# behave like an adjective. +# e.g. 申し訳, 仕方, とんでも, 違い +#名詞-ナイ形容詞語幹 +# +##### +# prefix: unclassified prefixes +#接頭詞 +# +# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) +# excluding numerical expressions. +# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) +#接頭詞-名詞接続 +# +# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb +# in conjunctive form followed by なる/なさる/くださる. +# e.g. お (読みなさい), お (座り) +#接頭詞-動詞接続 +# +# prefix-adjectival: Prefixes that attach to adjectives. +# e.g. お (寒いですねえ), バカ (でかい) +#接頭詞-形容詞接続 +# +# prefix-numerical: Prefixes that attach to numerical expressions. +# e.g. 約, およそ, 毎時 +#接頭詞-数接続 +# +##### +# verb: unclassified verbs +#動詞 +# +# verb-main: +#動詞-自立 +# +# verb-auxiliary: +#動詞-非自立 +# +# verb-suffix: +#動詞-接尾 +# +##### +# adjective: unclassified adjectives +#形容詞 +# +# adjective-main: +#形容詞-自立 +# +# adjective-auxiliary: +#形容詞-非自立 +# +# adjective-suffix: +#形容詞-接尾 +# +##### +# adverb: unclassified adverbs +#副詞 +# +# adverb-misc: Words that can be segmented into one unit and where adnominal +# modification is not possible. +# e.g. あいかわらず, 多分 +#副詞-一般 +# +# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, +# な, する, だ, etc. +# e.g. こんなに, そんなに, あんなに, なにか, なんでも +#副詞-助詞類接続 +# +##### +# adnominal: Words that only have noun-modifying forms. +# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, +# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, +# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き +#連体詞 +# +##### +# conjunction: Conjunctions that can occur independently. +# e.g. が, けれども, そして, じゃあ, それどころか +接続詞 +# +##### +# particle: unclassified particles. +助詞 +# +# particle-case: case particles where the subclassification is undefined. +助詞-格助詞 +# +# particle-case-misc: Case particles. +# e.g. から, が, で, と, に, へ, より, を, の, にて +助詞-格助詞-一般 +# +# particle-case-quote: the "to" that appears after nouns, a person’s speech, +# quotation marks, expressions of decisions from a meeting, reasons, judgements, +# conjectures, etc. +# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) +助詞-格助詞-引用 +# +# particle-case-compound: Compounds of particles and verbs that mainly behave +# like case particles. +# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, +# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, +# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, +# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, +# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, +# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, +# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, +# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ +助詞-格助詞-連語 +# +# particle-conjunctive: +# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, +# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, +# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ +助詞-接続助詞 +# +# particle-dependency: +# e.g. こそ, さえ, しか, すら, は, も, ぞ +助詞-係助詞 +# +# particle-adverbial: +# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, +# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, +# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, +# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, +# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) +助詞-副助詞 +# +# particle-interjective: particles with interjective grammatical roles. +# e.g. (松島) や +助詞-間投助詞 +# +# particle-coordinate: +# e.g. と, たり, だの, だり, とか, なり, や, やら +助詞-並立助詞 +# +# particle-final: +# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, +# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ +助詞-終助詞 +# +# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is +# adverbial, conjunctive, or sentence final. For example: +# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 +# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 +# 「(祈りが届いたせい) か (, 試験に合格した.)」 +# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 +# e.g. か +助詞-副助詞/並立助詞/終助詞 +# +# particle-adnominalizer: The "no" that attaches to nouns and modifies +# non-inflectional words. +助詞-連体化 +# +# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs +# that are giongo, giseigo, or gitaigo. +# e.g. に, と +助詞-副詞化 +# +# particle-special: A particle that does not fit into one of the above classifications. +# This includes particles that are used in Tanka, Haiku, and other poetry. +# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) +助詞-特殊 +# +##### +# auxiliary-verb: +助動詞 +# +##### +# interjection: Greetings and other exclamations. +# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, +# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい +#感動詞 +# +##### +# symbol: unclassified Symbols. +記号 +# +# symbol-misc: A general symbol not in one of the categories below. +# e.g. [○◎@$〒→+] +記号-一般 +# +# symbol-comma: Commas +# e.g. [,、] +記号-読点 +# +# symbol-period: Periods and full stops. +# e.g. [..。] +記号-句点 +# +# symbol-space: Full-width whitespace. +記号-空白 +# +# symbol-open_bracket: +# e.g. [({‘“『【] +記号-括弧開 +# +# symbol-close_bracket: +# e.g. [)}’”』」】] +記号-括弧閉 +# +# symbol-alphabetic: +#記号-アルファベット +# +##### +# other: unclassified other +#その他 +# +# other-interjection: Words that are hard to classify as noun-suffixes or +# sentence-final particles. +# e.g. (だ)ァ +その他-間投 +# +##### +# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. +# e.g. あの, うんと, えと +フィラー +# +##### +# non-verbal: non-verbal sound. +非言語音 +# +##### +# fragment: +#語断片 +# +##### +# unknown: unknown part of speech. +#未知語 +# +##### End of file diff --git a/docker/solr/conf/solr.languages/ja/stopwords_ja.txt b/docker/solr/conf/solr.languages/ja/stopwords_ja.txt new file mode 100644 index 000000000..d4321be6b --- /dev/null +++ b/docker/solr/conf/solr.languages/ja/stopwords_ja.txt @@ -0,0 +1,127 @@ +# +# This file defines a stopword set for Japanese. +# +# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. +# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 +# for frequency lists, etc. that can be useful for making your own set (if desired) +# +# Note that there is an overlap between these stopwords and the terms stopped when used +# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note +# that comments are not allowed on the same line as stopwords. +# +# Also note that stopping is done in a case-insensitive manner. Change your StopFilter +# configuration if you need case-sensitive stopping. Lastly, note that stopping is done +# using the same character width as the entries in this file. Since this StopFilter is +# normally done after a CJKWidthFilter in your chain, you would usually want your romaji +# entries to be in half-width and your kana entries to be in full-width. +# +の +に +は +を +た +が +で +て +と +し +れ +さ +ある +いる +も +する +から +な +こと +として +い +や +れる +など +なっ +ない +この +ため +その +あっ +よう +また +もの +という +あり +まで +られ +なる +へ +か +だ +これ +によって +により +おり +より +による +ず +なり +られる +において +ば +なかっ +なく +しかし +について +せ +だっ +その後 +できる +それ +う +ので +なお +のみ +でき +き +つ +における +および +いう +さらに +でも +ら +たり +その他 +に関する +たち +ます +ん +なら +に対して +特に +せる +及び +これら +とき +では +にて +ほか +ながら +うち +そして +とともに +ただし +かつて +それぞれ +または +お +ほど +ものの +に対する +ほとんど +と共に +といった +です +とも +ところ +ここ +##### End of file diff --git a/docker/solr/conf/solr.languages/nl/language-fieldtypes.xml b/docker/solr/conf/solr.languages/nl/language-fieldtypes.xml new file mode 100644 index 000000000..45e2448f8 --- /dev/null +++ b/docker/solr/conf/solr.languages/nl/language-fieldtypes.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/nl/stemdict_nl.txt b/docker/solr/conf/solr.languages/nl/stemdict_nl.txt new file mode 100644 index 000000000..441072971 --- /dev/null +++ b/docker/solr/conf/solr.languages/nl/stemdict_nl.txt @@ -0,0 +1,6 @@ +# Set of overrides for the dutch stemmer +# TODO: load this as a resource from the analyzer and sync it in build.xml +fiets fiets +bromfiets bromfiets +ei eier +kind kinder diff --git a/docker/solr/conf/solr.languages/nl/stopwords_nl.txt b/docker/solr/conf/solr.languages/nl/stopwords_nl.txt new file mode 100644 index 000000000..47a2aeacf --- /dev/null +++ b/docker/solr/conf/solr.languages/nl/stopwords_nl.txt @@ -0,0 +1,119 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other diff --git a/docker/solr/conf/solr.languages/no/language-fieldtypes.xml b/docker/solr/conf/solr.languages/no/language-fieldtypes.xml new file mode 100644 index 000000000..ee6f092bc --- /dev/null +++ b/docker/solr/conf/solr.languages/no/language-fieldtypes.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/no/stopwords_no.txt b/docker/solr/conf/solr.languages/no/stopwords_no.txt new file mode 100644 index 000000000..a7a2c28ba --- /dev/null +++ b/docker/solr/conf/solr.languages/no/stopwords_no.txt @@ -0,0 +1,194 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) * +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + diff --git a/docker/solr/conf/solr.languages/pt/language-fieldtypes.xml b/docker/solr/conf/solr.languages/pt/language-fieldtypes.xml new file mode 100644 index 000000000..82187b7ac --- /dev/null +++ b/docker/solr/conf/solr.languages/pt/language-fieldtypes.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/pt/stopwords_pt.txt b/docker/solr/conf/solr.languages/pt/stopwords_pt.txt new file mode 100644 index 000000000..acfeb01af --- /dev/null +++ b/docker/solr/conf/solr.languages/pt/stopwords_pt.txt @@ -0,0 +1,253 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Portuguese stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + + | The following is a ranked list (commonest to rarest) of stopwords + | deriving from a large sample of text. + + | Extra words have been added at the end. + +de | of, from +a | the; to, at; her +o | the; him +que | who, that +e | and +do | de + o +da | de + a +em | in +um | a +para | for + | é from SER +com | with +não | not, no +uma | a +os | the; them +no | em + o +se | himself etc +na | em + a +por | for +mais | more +as | the; them +dos | de + os +como | as, like +mas | but + | foi from SER +ao | a + o +ele | he +das | de + as + | tem from TER +à | a + a +seu | his +sua | her +ou | or + | ser from SER +quando | when +muito | much + | há from HAV +nos | em + os; us +já | already, now + | está from EST +eu | I +também | also +só | only, just +pelo | per + o +pela | per + a +até | up to +isso | that +ela | he +entre | between + | era from SER +depois | after +sem | without +mesmo | same +aos | a + os + | ter from TER +seus | his +quem | whom +nas | em + as +me | me +esse | that +eles | they + | estão from EST +você | you + | tinha from TER + | foram from SER +essa | that +num | em + um +nem | nor +suas | her +meu | my +às | a + as +minha | my + | têm from TER +numa | em + uma +pelos | per + os +elas | they + | havia from HAV + | seja from SER +qual | which + | será from SER +nós | we + | tenho from TER +lhe | to him, her +deles | of them +essas | those +esses | those +pelas | per + as +este | this + | fosse from SER +dele | of him + + | other words. There are many contractions such as naquele = em+aquele, + | mo = me+o, but they are rare. + | Indefinite article plural forms are also rare. + +tu | thou +te | thee +vocês | you (plural) +vos | you +lhes | to them +meus | my +minhas +teu | thy +tua +teus +tuas +nosso | our +nossa +nossos +nossas + +dela | of her +delas | of them + +esta | this +estes | these +estas | these +aquele | that +aquela | that +aqueles | those +aquelas | those +isto | this +aquilo | that + + | forms of estar, to be (not including the infinitive): +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem + + | forms of haver, to have (not including the infinitive): +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam + + | forms of ser, to be (not including the infinitive): +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam + + | forms of ter, to have (not including the infinitive): +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/docker/solr/conf/solr.languages/ru/language-fieldtypes.xml b/docker/solr/conf/solr.languages/ru/language-fieldtypes.xml new file mode 100644 index 000000000..371b1e9f7 --- /dev/null +++ b/docker/solr/conf/solr.languages/ru/language-fieldtypes.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/docker/solr/conf/solr.languages/ru/stopwords_ru.txt b/docker/solr/conf/solr.languages/ru/stopwords_ru.txt new file mode 100644 index 000000000..55271400c --- /dev/null +++ b/docker/solr/conf/solr.languages/ru/stopwords_ru.txt @@ -0,0 +1,243 @@ + | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter `ё' is translated to `е'. + +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of `no' (but) +то | conjunction and form of `that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of `narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. suffix preceded by hyphen +опять | again +уж | already, but homonym of `adder' +вам | to you +сказал | he said +ведь | particle `after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with `быть' as `maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of `what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle `же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of `that' +потому | for that reason +этого | genitive form of `this' +какой | which +совсем | altogether +ним | prepositional form of `его', `они' +здесь | here +этом | prepositional form of `этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of `тот', `то' +чтобы | full form of `in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of `о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of `они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of `эта', fem. `this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. form of `that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of `all' +между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs `to be', `to have', `to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + From 8fd83fb0c355f264abb9f280d73e51494803bdea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Niedzielski?= Date: Wed, 9 Jun 2021 10:00:28 +0200 Subject: [PATCH 2/3] IBX-519: Added healthcheck --- docker/solr/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/solr/Dockerfile b/docker/solr/Dockerfile index 36c24c3f6..73b4eb84a 100644 --- a/docker/solr/Dockerfile +++ b/docker/solr/Dockerfile @@ -9,4 +9,5 @@ RUN sed --in-place 's/${solr.autoSoftCommit.maxTime:-1}/${solr.autoSoftCommit.ma USER solr +HEALTHCHECK --interval=10s --timeout=5s --retries=10 --start-period=15s CMD solr status CMD ["solr-precreate", "collection1"] From 83812e3ce021d0751cab00374796ba4fb7da67da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Niedzielski?= Date: Wed, 9 Jun 2021 11:15:42 +0200 Subject: [PATCH 3/3] IBX-519: Removed duplicated Solr config files, extend docker context instead --- docker/solr/Dockerfile | 5 +- docker/solr/conf/custom-fields-types.xml | 19 - docker/solr/conf/language-fieldtypes.xml | 13 - docker/solr/conf/managed-schema | 141 ------ .../solr.languages/ar/language-fieldtypes.xml | 14 - .../conf/solr.languages/ar/stopwords_ar.txt | 125 ------ .../cjk/language-fieldtypes.xml | 12 - .../solr.languages/de/language-fieldtypes.xml | 13 - .../conf/solr.languages/de/stopwords_de.txt | 294 ------------ .../solr.languages/en/language-fieldtypes.xml | 38 -- .../conf/solr.languages/en/stopwords_en.txt | 54 --- .../solr.languages/es/language-fieldtypes.xml | 12 - .../conf/solr.languages/es/stopwords_es.txt | 356 --------------- .../solr.languages/fi/language-fieldtypes.xml | 12 - .../conf/solr.languages/fi/stopwords_fi.txt | 97 ---- .../solr.languages/fr/contractions_fr.txt | 15 - .../solr.languages/fr/language-fieldtypes.xml | 15 - .../conf/solr.languages/fr/stopwords_fr.txt | 186 -------- .../solr.languages/hr/language-fieldtypes.xml | 10 - .../conf/solr.languages/hr/stopwords_hr.txt | 1 - .../solr.languages/it/contractions_it.txt | 23 - .../solr.languages/it/language-fieldtypes.xml | 14 - .../conf/solr.languages/it/stopwords_it.txt | 303 ------------- .../solr.languages/ja/language-fieldtypes.xml | 48 -- .../conf/solr.languages/ja/stoptags_ja.txt | 420 ------------------ .../conf/solr.languages/ja/stopwords_ja.txt | 127 ------ .../solr.languages/nl/language-fieldtypes.xml | 12 - .../conf/solr.languages/nl/stemdict_nl.txt | 6 - .../conf/solr.languages/nl/stopwords_nl.txt | 119 ----- .../solr.languages/no/language-fieldtypes.xml | 14 - .../conf/solr.languages/no/stopwords_no.txt | 194 -------- .../solr.languages/pt/language-fieldtypes.xml | 14 - .../conf/solr.languages/pt/stopwords_pt.txt | 253 ----------- .../solr.languages/ru/language-fieldtypes.xml | 12 - .../conf/solr.languages/ru/stopwords_ru.txt | 243 ---------- 35 files changed, 4 insertions(+), 3230 deletions(-) delete mode 100644 docker/solr/conf/custom-fields-types.xml delete mode 100644 docker/solr/conf/language-fieldtypes.xml delete mode 100644 docker/solr/conf/managed-schema delete mode 100644 docker/solr/conf/solr.languages/ar/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/ar/stopwords_ar.txt delete mode 100644 docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/de/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/de/stopwords_de.txt delete mode 100644 docker/solr/conf/solr.languages/en/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/en/stopwords_en.txt delete mode 100644 docker/solr/conf/solr.languages/es/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/es/stopwords_es.txt delete mode 100644 docker/solr/conf/solr.languages/fi/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/fi/stopwords_fi.txt delete mode 100644 docker/solr/conf/solr.languages/fr/contractions_fr.txt delete mode 100644 docker/solr/conf/solr.languages/fr/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/fr/stopwords_fr.txt delete mode 100644 docker/solr/conf/solr.languages/hr/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/hr/stopwords_hr.txt delete mode 100644 docker/solr/conf/solr.languages/it/contractions_it.txt delete mode 100644 docker/solr/conf/solr.languages/it/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/it/stopwords_it.txt delete mode 100644 docker/solr/conf/solr.languages/ja/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/ja/stoptags_ja.txt delete mode 100644 docker/solr/conf/solr.languages/ja/stopwords_ja.txt delete mode 100644 docker/solr/conf/solr.languages/nl/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/nl/stemdict_nl.txt delete mode 100644 docker/solr/conf/solr.languages/nl/stopwords_nl.txt delete mode 100644 docker/solr/conf/solr.languages/no/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/no/stopwords_no.txt delete mode 100644 docker/solr/conf/solr.languages/pt/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/pt/stopwords_pt.txt delete mode 100644 docker/solr/conf/solr.languages/ru/language-fieldtypes.xml delete mode 100644 docker/solr/conf/solr.languages/ru/stopwords_ru.txt diff --git a/docker/solr/Dockerfile b/docker/solr/Dockerfile index 73b4eb84a..52ce17a5e 100644 --- a/docker/solr/Dockerfile +++ b/docker/solr/Dockerfile @@ -3,7 +3,10 @@ FROM solr:8.6.3 LABEL org.opencontainers.image.source=https://github.com/ezsystems/ezplatform-solr-search-engine USER root -COPY conf server/solr/configsets/_default/conf +# Note: requires project dir as docker context +# Build using: `docker build -f docker/solr/Dockerfile .` when in project root directory +COPY lib/Resources/config/solr server/solr/configsets/_default/conf +RUN mv server/solr/configsets/_default/conf/schema.xml server/solr/configsets/_default/conf/managed-schema RUN sed --in-place '//d' server/solr/configsets/_default/conf/solrconfig.xml RUN sed --in-place 's/${solr.autoSoftCommit.maxTime:-1}/${solr.autoSoftCommit.maxTime:20}/' server/solr/configsets/_default/conf/solrconfig.xml diff --git a/docker/solr/conf/custom-fields-types.xml b/docker/solr/conf/custom-fields-types.xml deleted file mode 100644 index 92318ba62..000000000 --- a/docker/solr/conf/custom-fields-types.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - - - - diff --git a/docker/solr/conf/language-fieldtypes.xml b/docker/solr/conf/language-fieldtypes.xml deleted file mode 100644 index f578645bf..000000000 --- a/docker/solr/conf/language-fieldtypes.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/docker/solr/conf/managed-schema b/docker/solr/conf/managed-schema deleted file mode 100644 index 361630025..000000000 --- a/docker/solr/conf/managed-schema +++ /dev/null @@ -1,141 +0,0 @@ - - - -]> - - - - - &langfields; - - - &customfields; - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - id - diff --git a/docker/solr/conf/solr.languages/ar/language-fieldtypes.xml b/docker/solr/conf/solr.languages/ar/language-fieldtypes.xml deleted file mode 100644 index 007c154ac..000000000 --- a/docker/solr/conf/solr.languages/ar/language-fieldtypes.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/ar/stopwords_ar.txt b/docker/solr/conf/solr.languages/ar/stopwords_ar.txt deleted file mode 100644 index 046829db6..000000000 --- a/docker/solr/conf/solr.languages/ar/stopwords_ar.txt +++ /dev/null @@ -1,125 +0,0 @@ -# This file was created by Jacques Savoy and is distributed under the BSD license. -# See http://members.unine.ch/jacques.savoy/clef/index.html. -# Also see http://www.opensource.org/licenses/bsd-license.html -# Cleaned on October 11, 2009 (not normalized, so use before normalization) -# This means that when modifying this list, you might need to add some -# redundant entries, for example containing forms with both أ and ا -من -ومن -منها -منه -في -وفي -فيها -فيه -و -ف -ثم -او -أو -ب -بها -به -ا -أ -اى -اي -أي -أى -لا -ولا -الا -ألا -إلا -لكن -ما -وما -كما -فما -عن -مع -اذا -إذا -ان -أن -إن -انها -أنها -إنها -انه -أنه -إنه -بان -بأن -فان -فأن -وان -وأن -وإن -التى -التي -الذى -الذي -الذين -الى -الي -إلى -إلي -على -عليها -عليه -اما -أما -إما -ايضا -أيضا -كل -وكل -لم -ولم -لن -ولن -هى -هي -هو -وهى -وهي -وهو -فهى -فهي -فهو -انت -أنت -لك -لها -له -هذه -هذا -تلك -ذلك -هناك -كانت -كان -يكون -تكون -وكانت -وكان -غير -بعض -قد -نحو -بين -بينما -منذ -ضمن -حيث -الان -الآن -خلال -بعد -قبل -حتى -عند -عندما -لدى -جميع diff --git a/docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml b/docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml deleted file mode 100644 index 0c183aef0..000000000 --- a/docker/solr/conf/solr.languages/cjk/language-fieldtypes.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/de/language-fieldtypes.xml b/docker/solr/conf/solr.languages/de/language-fieldtypes.xml deleted file mode 100644 index e165b0899..000000000 --- a/docker/solr/conf/solr.languages/de/language-fieldtypes.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/de/stopwords_de.txt b/docker/solr/conf/solr.languages/de/stopwords_de.txt deleted file mode 100644 index 86525e7ae..000000000 --- a/docker/solr/conf/solr.languages/de/stopwords_de.txt +++ /dev/null @@ -1,294 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A German stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | The number of forms in this list is reduced significantly by passing it - | through the German stemmer. - - -aber | but - -alle | all -allem -allen -aller -alles - -als | than, as -also | so -am | an + dem -an | at - -ander | other -andere -anderem -anderen -anderer -anderes -anderm -andern -anderr -anders - -auch | also -auf | on -aus | out of -bei | by -bin | am -bis | until -bist | art -da | there -damit | with it -dann | then - -der | the -den -des -dem -die -das - -daß | that - -derselbe | the same -derselben -denselben -desselben -demselben -dieselbe -dieselben -dasselbe - -dazu | to that - -dein | thy -deine -deinem -deinen -deiner -deines - -denn | because - -derer | of those -dessen | of him - -dich | thee -dir | to thee -du | thou - -dies | this -diese -diesem -diesen -dieser -dieses - - -doch | (several meanings) -dort | (over) there - - -durch | through - -ein | a -eine -einem -einen -einer -eines - -einig | some -einige -einigem -einigen -einiger -einiges - -einmal | once - -er | he -ihn | him -ihm | to him - -es | it -etwas | something - -euer | your -eure -eurem -euren -eurer -eures - -für | for -gegen | towards -gewesen | p.p. of sein -hab | have -habe | have -haben | have -hat | has -hatte | had -hatten | had -hier | here -hin | there -hinter | behind - -ich | I -mich | me -mir | to me - - -ihr | you, to her -ihre -ihrem -ihren -ihrer -ihres -euch | to you - -im | in + dem -in | in -indem | while -ins | in + das -ist | is - -jede | each, every -jedem -jeden -jeder -jedes - -jene | that -jenem -jenen -jener -jenes - -jetzt | now -kann | can - -kein | no -keine -keinem -keinen -keiner -keines - -können | can -könnte | could -machen | do -man | one - -manche | some, many a -manchem -manchen -mancher -manches - -mein | my -meine -meinem -meinen -meiner -meines - -mit | with -muss | must -musste | had to -nach | to(wards) -nicht | not -nichts | nothing -noch | still, yet -nun | now -nur | only -ob | whether -oder | or -ohne | without -sehr | very - -sein | his -seine -seinem -seinen -seiner -seines - -selbst | self -sich | herself - -sie | they, she -ihnen | to them - -sind | are -so | so - -solche | such -solchem -solchen -solcher -solches - -soll | shall -sollte | should -sondern | but -sonst | else -über | over -um | about, around -und | and - -uns | us -unse -unsem -unsen -unser -unses - -unter | under -viel | much -vom | von + dem -von | from -vor | before -während | while -war | was -waren | were -warst | wast -was | what -weg | away, off -weil | because -weiter | further - -welche | which -welchem -welchen -welcher -welches - -wenn | when -werde | will -werden | will -wie | how -wieder | again -will | want -wir | we -wird | will -wirst | willst -wo | where -wollen | want -wollte | wanted -würde | would -würden | would -zu | to -zum | zu + dem -zur | zu + der -zwar | indeed -zwischen | between - diff --git a/docker/solr/conf/solr.languages/en/language-fieldtypes.xml b/docker/solr/conf/solr.languages/en/language-fieldtypes.xml deleted file mode 100644 index 9bccba163..000000000 --- a/docker/solr/conf/solr.languages/en/language-fieldtypes.xml +++ /dev/null @@ -1,38 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/en/stopwords_en.txt b/docker/solr/conf/solr.languages/en/stopwords_en.txt deleted file mode 100644 index 2c164c0b2..000000000 --- a/docker/solr/conf/solr.languages/en/stopwords_en.txt +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# a couple of test stopwords to test that the words are really being -# configured from this file: -stopworda -stopwordb - -# Standard english stop words taken from Lucene's StopAnalyzer -a -an -and -are -as -at -be -but -by -for -if -in -into -is -it -no -not -of -on -or -such -that -the -their -then -there -these -they -this -to -was -will -with diff --git a/docker/solr/conf/solr.languages/es/language-fieldtypes.xml b/docker/solr/conf/solr.languages/es/language-fieldtypes.xml deleted file mode 100644 index 03c9d9805..000000000 --- a/docker/solr/conf/solr.languages/es/language-fieldtypes.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/es/stopwords_es.txt b/docker/solr/conf/solr.languages/es/stopwords_es.txt deleted file mode 100644 index 487d78c8d..000000000 --- a/docker/solr/conf/solr.languages/es/stopwords_es.txt +++ /dev/null @@ -1,356 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Spanish stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - - | The following is a ranked list (commonest to rarest) of stopwords - | deriving from a large sample of text. - - | Extra words have been added at the end. - -de | from, of -la | the, her -que | who, that -el | the -en | in -y | and -a | to -los | the, them -del | de + el -se | himself, from him etc -las | the, them -por | for, by, etc -un | a -para | for -con | with -no | no -una | a -su | his, her -al | a + el - | es from SER -lo | him -como | how -más | more -pero | pero -sus | su plural -le | to him, her -ya | already -o | or - | fue from SER -este | this - | ha from HABER -sí | himself etc -porque | because -esta | this - | son from SER -entre | between - | está from ESTAR -cuando | when -muy | very -sin | without -sobre | on - | ser from SER - | tiene from TENER -también | also -me | me -hasta | until -hay | there is/are -donde | where - | han from HABER -quien | whom, that - | están from ESTAR - | estado from ESTAR -desde | from -todo | all -nos | us -durante | during - | estados from ESTAR -todos | all -uno | a -les | to them -ni | nor -contra | against -otros | other - | fueron from SER -ese | that -eso | that - | había from HABER -ante | before -ellos | they -e | and (variant of y) -esto | this -mí | me -antes | before -algunos | some -qué | what? -unos | a -yo | I -otro | other -otras | other -otra | other -él | he -tanto | so much, many -esa | that -estos | these -mucho | much, many -quienes | who -nada | nothing -muchos | many -cual | who - | sea from SER -poco | few -ella | she -estar | to be - | haber from HABER -estas | these - | estaba from ESTAR - | estamos from ESTAR -algunas | some -algo | something -nosotros | we - - | other forms - -mi | me -mis | mi plural -tú | thou -te | thee -ti | thee -tu | thy -tus | tu plural -ellas | they -nosotras | we -vosotros | you -vosotras | you -os | you -mío | mine -mía | -míos | -mías | -tuyo | thine -tuya | -tuyos | -tuyas | -suyo | his, hers, theirs -suya | -suyos | -suyas | -nuestro | ours -nuestra | -nuestros | -nuestras | -vuestro | yours -vuestra | -vuestros | -vuestras | -esos | those -esas | those - - | forms of estar, to be (not including the infinitive): -estoy -estás -está -estamos -estáis -están -esté -estés -estemos -estéis -estén -estaré -estarás -estará -estaremos -estaréis -estarán -estaría -estarías -estaríamos -estaríais -estarían -estaba -estabas -estábamos -estabais -estaban -estuve -estuviste -estuvo -estuvimos -estuvisteis -estuvieron -estuviera -estuvieras -estuviéramos -estuvierais -estuvieran -estuviese -estuvieses -estuviésemos -estuvieseis -estuviesen -estando -estado -estada -estados -estadas -estad - - | forms of haber, to have (not including the infinitive): -he -has -ha -hemos -habéis -han -haya -hayas -hayamos -hayáis -hayan -habré -habrás -habrá -habremos -habréis -habrán -habría -habrías -habríamos -habríais -habrían -había -habías -habíamos -habíais -habían -hube -hubiste -hubo -hubimos -hubisteis -hubieron -hubiera -hubieras -hubiéramos -hubierais -hubieran -hubiese -hubieses -hubiésemos -hubieseis -hubiesen -habiendo -habido -habida -habidos -habidas - - | forms of ser, to be (not including the infinitive): -soy -eres -es -somos -sois -son -sea -seas -seamos -seáis -sean -seré -serás -será -seremos -seréis -serán -sería -serías -seríamos -seríais -serían -era -eras -éramos -erais -eran -fui -fuiste -fue -fuimos -fuisteis -fueron -fuera -fueras -fuéramos -fuerais -fueran -fuese -fueses -fuésemos -fueseis -fuesen -siendo -sido - | sed also means 'thirst' - - | forms of tener, to have (not including the infinitive): -tengo -tienes -tiene -tenemos -tenéis -tienen -tenga -tengas -tengamos -tengáis -tengan -tendré -tendrás -tendrá -tendremos -tendréis -tendrán -tendría -tendrías -tendríamos -tendríais -tendrían -tenía -tenías -teníamos -teníais -tenían -tuve -tuviste -tuvo -tuvimos -tuvisteis -tuvieron -tuviera -tuvieras -tuviéramos -tuvierais -tuvieran -tuviese -tuvieses -tuviésemos -tuvieseis -tuviesen -teniendo -tenido -tenida -tenidos -tenidas -tened - diff --git a/docker/solr/conf/solr.languages/fi/language-fieldtypes.xml b/docker/solr/conf/solr.languages/fi/language-fieldtypes.xml deleted file mode 100644 index f375503be..000000000 --- a/docker/solr/conf/solr.languages/fi/language-fieldtypes.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/fi/stopwords_fi.txt b/docker/solr/conf/solr.languages/fi/stopwords_fi.txt deleted file mode 100644 index 4372c9a05..000000000 --- a/docker/solr/conf/solr.languages/fi/stopwords_fi.txt +++ /dev/null @@ -1,97 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - -| forms of BE - -olla -olen -olet -on -olemme -olette -ovat -ole | negative form - -oli -olisi -olisit -olisin -olisimme -olisitte -olisivat -olit -olin -olimme -olitte -olivat -ollut -olleet - -en | negation -et -ei -emme -ette -eivät - -|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans -minä minun minut minua minussa minusta minuun minulla minulta minulle | I -sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you -hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she -me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we -te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you -he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they - -tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this -tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that -se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it -nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these -nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those -ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they - -kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who -ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) -mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what -mitkä | (pl) - -joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which -jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) - -| conjunctions - -että | that -ja | and -jos | if -koska | because -kuin | than -mutta | but -niin | so -sekä | and -sillä | for -tai | or -vaan | but -vai | or -vaikka | although - - -| prepositions - -kanssa | with -mukaan | according to -noin | about -poikki | across -yli | over, across - -| other - -kun | when -niin | so -nyt | now -itse | self - diff --git a/docker/solr/conf/solr.languages/fr/contractions_fr.txt b/docker/solr/conf/solr.languages/fr/contractions_fr.txt deleted file mode 100644 index f1bba51b2..000000000 --- a/docker/solr/conf/solr.languages/fr/contractions_fr.txt +++ /dev/null @@ -1,15 +0,0 @@ -# Set of French contractions for ElisionFilter -# TODO: load this as a resource from the analyzer and sync it in build.xml -l -m -t -qu -n -s -j -d -c -jusqu -quoiqu -lorsqu -puisqu diff --git a/docker/solr/conf/solr.languages/fr/language-fieldtypes.xml b/docker/solr/conf/solr.languages/fr/language-fieldtypes.xml deleted file mode 100644 index 4336e0544..000000000 --- a/docker/solr/conf/solr.languages/fr/language-fieldtypes.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/fr/stopwords_fr.txt b/docker/solr/conf/solr.languages/fr/stopwords_fr.txt deleted file mode 100644 index 749abae68..000000000 --- a/docker/solr/conf/solr.languages/fr/stopwords_fr.txt +++ /dev/null @@ -1,186 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A French stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - -au | a + le -aux | a + les -avec | with -ce | this -ces | these -dans | with -de | of -des | de + les -du | de + le -elle | she -en | `of them' etc -et | and -eux | them -il | he -je | I -la | the -le | the -leur | their -lui | him -ma | my (fem) -mais | but -me | me -même | same; as in moi-même (myself) etc -mes | me (pl) -moi | me -mon | my (masc) -ne | not -nos | our (pl) -notre | our -nous | we -on | one -ou | where -par | by -pas | not -pour | for -qu | que before vowel -que | that -qui | who -sa | his, her (fem) -se | oneself -ses | his (pl) -son | his, her (masc) -sur | on -ta | thy (fem) -te | thee -tes | thy (pl) -toi | thee -ton | thy (masc) -tu | thou -un | a -une | a -vos | your (pl) -votre | your -vous | you - - | single letter forms - -c | c' -d | d' -j | j' -l | l' -à | to, at -m | m' -n | n' -s | s' -t | t' -y | there - - | forms of être (not including the infinitive): -été -étée -étées -étés -étant -suis -es -est -sommes -êtes -sont -serai -seras -sera -serons -serez -seront -serais -serait -serions -seriez -seraient -étais -était -étions -étiez -étaient -fus -fut -fûmes -fûtes -furent -sois -soit -soyons -soyez -soient -fusse -fusses -fût -fussions -fussiez -fussent - - | forms of avoir (not including the infinitive): -ayant -eu -eue -eues -eus -ai -as -avons -avez -ont -aurai -auras -aura -aurons -aurez -auront -aurais -aurait -aurions -auriez -auraient -avais -avait -avions -aviez -avaient -eut -eûmes -eûtes -eurent -aie -aies -ait -ayons -ayez -aient -eusse -eusses -eût -eussions -eussiez -eussent - - | Later additions (from Jean-Christophe Deschamps) -ceci | this -cela | that -celà | that -cet | this -cette | this -ici | here -ils | they -les | the (pl) -leurs | their (pl) -quel | which -quels | which -quelle | which -quelles | which -sans | without -soi | oneself - diff --git a/docker/solr/conf/solr.languages/hr/language-fieldtypes.xml b/docker/solr/conf/solr.languages/hr/language-fieldtypes.xml deleted file mode 100644 index 784ae0a8d..000000000 --- a/docker/solr/conf/solr.languages/hr/language-fieldtypes.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/hr/stopwords_hr.txt b/docker/solr/conf/solr.languages/hr/stopwords_hr.txt deleted file mode 100644 index 16fc66739..000000000 --- a/docker/solr/conf/solr.languages/hr/stopwords_hr.txt +++ /dev/null @@ -1 +0,0 @@ -# Stub, a list of Croation stopwords is not part of the Solr distribution \ No newline at end of file diff --git a/docker/solr/conf/solr.languages/it/contractions_it.txt b/docker/solr/conf/solr.languages/it/contractions_it.txt deleted file mode 100644 index cac040953..000000000 --- a/docker/solr/conf/solr.languages/it/contractions_it.txt +++ /dev/null @@ -1,23 +0,0 @@ -# Set of Italian contractions for ElisionFilter -# TODO: load this as a resource from the analyzer and sync it in build.xml -c -l -all -dall -dell -nell -sull -coll -pell -gl -agl -dagl -degl -negl -sugl -un -m -t -s -v -d diff --git a/docker/solr/conf/solr.languages/it/language-fieldtypes.xml b/docker/solr/conf/solr.languages/it/language-fieldtypes.xml deleted file mode 100644 index 16b52cfa6..000000000 --- a/docker/solr/conf/solr.languages/it/language-fieldtypes.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/it/stopwords_it.txt b/docker/solr/conf/solr.languages/it/stopwords_it.txt deleted file mode 100644 index 1219cc773..000000000 --- a/docker/solr/conf/solr.languages/it/stopwords_it.txt +++ /dev/null @@ -1,303 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | An Italian stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - -ad | a (to) before vowel -al | a + il -allo | a + lo -ai | a + i -agli | a + gli -all | a + l' -agl | a + gl' -alla | a + la -alle | a + le -con | with -col | con + il -coi | con + i (forms collo, cogli etc are now very rare) -da | from -dal | da + il -dallo | da + lo -dai | da + i -dagli | da + gli -dall | da + l' -dagl | da + gll' -dalla | da + la -dalle | da + le -di | of -del | di + il -dello | di + lo -dei | di + i -degli | di + gli -dell | di + l' -degl | di + gl' -della | di + la -delle | di + le -in | in -nel | in + el -nello | in + lo -nei | in + i -negli | in + gli -nell | in + l' -negl | in + gl' -nella | in + la -nelle | in + le -su | on -sul | su + il -sullo | su + lo -sui | su + i -sugli | su + gli -sull | su + l' -sugl | su + gl' -sulla | su + la -sulle | su + le -per | through, by -tra | among -contro | against -io | I -tu | thou -lui | he -lei | she -noi | we -voi | you -loro | they -mio | my -mia | -miei | -mie | -tuo | -tua | -tuoi | thy -tue | -suo | -sua | -suoi | his, her -sue | -nostro | our -nostra | -nostri | -nostre | -vostro | your -vostra | -vostri | -vostre | -mi | me -ti | thee -ci | us, there -vi | you, there -lo | him, the -la | her, the -li | them -le | them, the -gli | to him, the -ne | from there etc -il | the -un | a -uno | a -una | a -ma | but -ed | and -se | if -perché | why, because -anche | also -come | how -dov | where (as dov') -dove | where -che | who, that -chi | who -cui | whom -non | not -più | more -quale | who, that -quanto | how much -quanti | -quanta | -quante | -quello | that -quelli | -quella | -quelle | -questo | this -questi | -questa | -queste | -si | yes -tutto | all -tutti | all - - | single letter forms: - -a | at -c | as c' for ce or ci -e | and -i | the -l | as l' -o | or - - | forms of avere, to have (not including the infinitive): - -ho -hai -ha -abbiamo -avete -hanno -abbia -abbiate -abbiano -avrò -avrai -avrà -avremo -avrete -avranno -avrei -avresti -avrebbe -avremmo -avreste -avrebbero -avevo -avevi -aveva -avevamo -avevate -avevano -ebbi -avesti -ebbe -avemmo -aveste -ebbero -avessi -avesse -avessimo -avessero -avendo -avuto -avuta -avuti -avute - - | forms of essere, to be (not including the infinitive): -sono -sei -è -siamo -siete -sia -siate -siano -sarò -sarai -sarà -saremo -sarete -saranno -sarei -saresti -sarebbe -saremmo -sareste -sarebbero -ero -eri -era -eravamo -eravate -erano -fui -fosti -fu -fummo -foste -furono -fossi -fosse -fossimo -fossero -essendo - - | forms of fare, to do (not including the infinitive, fa, fat-): -faccio -fai -facciamo -fanno -faccia -facciate -facciano -farò -farai -farà -faremo -farete -faranno -farei -faresti -farebbe -faremmo -fareste -farebbero -facevo -facevi -faceva -facevamo -facevate -facevano -feci -facesti -fece -facemmo -faceste -fecero -facessi -facesse -facessimo -facessero -facendo - - | forms of stare, to be (not including the infinitive): -sto -stai -sta -stiamo -stanno -stia -stiate -stiano -starò -starai -starà -staremo -starete -staranno -starei -staresti -starebbe -staremmo -stareste -starebbero -stavo -stavi -stava -stavamo -stavate -stavano -stetti -stesti -stette -stemmo -steste -stettero -stessi -stesse -stessimo -stessero -stando diff --git a/docker/solr/conf/solr.languages/ja/language-fieldtypes.xml b/docker/solr/conf/solr.languages/ja/language-fieldtypes.xml deleted file mode 100644 index 4a744fa80..000000000 --- a/docker/solr/conf/solr.languages/ja/language-fieldtypes.xml +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/ja/stoptags_ja.txt b/docker/solr/conf/solr.languages/ja/stoptags_ja.txt deleted file mode 100644 index 71b750845..000000000 --- a/docker/solr/conf/solr.languages/ja/stoptags_ja.txt +++ /dev/null @@ -1,420 +0,0 @@ -# -# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter. -# -# Any token with a part-of-speech tag that exactly matches those defined in this -# file are removed from the token stream. -# -# Set your own stoptags by uncommenting the lines below. Note that comments are -# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists, -# etc. that can be useful for building you own stoptag set. -# -# The entire possible tagset is provided below for convenience. -# -##### -# noun: unclassified nouns -#名詞 -# -# noun-common: Common nouns or nouns where the sub-classification is undefined -#名詞-一般 -# -# noun-proper: Proper nouns where the sub-classification is undefined -#名詞-固有名詞 -# -# noun-proper-misc: miscellaneous proper nouns -#名詞-固有名詞-一般 -# -# noun-proper-person: Personal names where the sub-classification is undefined -#名詞-固有名詞-人名 -# -# noun-proper-person-misc: names that cannot be divided into surname and -# given name; foreign names; names where the surname or given name is unknown. -# e.g. お市の方 -#名詞-固有名詞-人名-一般 -# -# noun-proper-person-surname: Mainly Japanese surnames. -# e.g. 山田 -#名詞-固有名詞-人名-姓 -# -# noun-proper-person-given_name: Mainly Japanese given names. -# e.g. 太郎 -#名詞-固有名詞-人名-名 -# -# noun-proper-organization: Names representing organizations. -# e.g. 通産省, NHK -#名詞-固有名詞-組織 -# -# noun-proper-place: Place names where the sub-classification is undefined -#名詞-固有名詞-地域 -# -# noun-proper-place-misc: Place names excluding countries. -# e.g. アジア, バルセロナ, 京都 -#名詞-固有名詞-地域-一般 -# -# noun-proper-place-country: Country names. -# e.g. 日本, オーストラリア -#名詞-固有名詞-地域-国 -# -# noun-pronoun: Pronouns where the sub-classification is undefined -#名詞-代名詞 -# -# noun-pronoun-misc: miscellaneous pronouns: -# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ -#名詞-代名詞-一般 -# -# noun-pronoun-contraction: Spoken language contraction made by combining a -# pronoun and the particle 'wa'. -# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ -#名詞-代名詞-縮約 -# -# noun-adverbial: Temporal nouns such as names of days or months that behave -# like adverbs. Nouns that represent amount or ratios and can be used adverbially, -# e.g. 金曜, 一月, 午後, 少量 -#名詞-副詞可能 -# -# noun-verbal: Nouns that take arguments with case and can appear followed by -# 'suru' and related verbs (する, できる, なさる, くださる) -# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り -#名詞-サ変接続 -# -# noun-adjective-base: The base form of adjectives, words that appear before な ("na") -# e.g. 健康, 安易, 駄目, だめ -#名詞-形容動詞語幹 -# -# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数. -# e.g. 0, 1, 2, 何, 数, 幾 -#名詞-数 -# -# noun-affix: noun affixes where the sub-classification is undefined -#名詞-非自立 -# -# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that -# attach to the base form of inflectional words, words that cannot be classified -# into any of the other categories below. This category includes indefinite nouns. -# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第, -# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み, -# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳, -# わり, 割り, 割, ん-口語/, もん-口語/ -#名詞-非自立-一般 -# -# noun-affix-adverbial: noun affixes that that can behave as adverbs. -# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ, -# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか, -# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所, -# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま, -# 儘, 侭, みぎり, 矢先 -#名詞-非自立-副詞可能 -# -# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars -# with the stem よう(だ) ("you(da)"). -# e.g. よう, やう, 様 (よう) -#名詞-非自立-助動詞語幹 -# -# noun-affix-adjective-base: noun affixes that can connect to the indeclinable -# connection form な (aux "da"). -# e.g. みたい, ふう -#名詞-非自立-形容動詞語幹 -# -# noun-special: special nouns where the sub-classification is undefined. -#名詞-特殊 -# -# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is -# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base -# form of inflectional words. -# e.g. そう -#名詞-特殊-助動詞語幹 -# -# noun-suffix: noun suffixes where the sub-classification is undefined. -#名詞-接尾 -# -# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect -# to ガル or タイ and can combine into compound nouns, words that cannot be classified into -# any of the other categories below. In general, this category is more inclusive than -# 接尾語 ("suffix") and is usually the last element in a compound noun. -# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み, -# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用 -#名詞-接尾-一般 -# -# noun-suffix-person: Suffixes that form nouns and attach to person names more often -# than other nouns. -# e.g. 君, 様, 著 -#名詞-接尾-人名 -# -# noun-suffix-place: Suffixes that form nouns and attach to place names more often -# than other nouns. -# e.g. 町, 市, 県 -#名詞-接尾-地域 -# -# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that -# can appear before スル ("suru"). -# e.g. 化, 視, 分け, 入り, 落ち, 買い -#名詞-接尾-サ変接続 -# -# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions, -# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the -# conjunctive form of inflectional words. -# e.g. そう -#名詞-接尾-助動詞語幹 -# -# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive -# form of inflectional words and appear before the copula だ ("da"). -# e.g. 的, げ, がち -#名詞-接尾-形容動詞語幹 -# -# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs. -# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ) -#名詞-接尾-副詞可能 -# -# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category -# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach -# to numbers. -# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半 -#名詞-接尾-助数詞 -# -# noun-suffix-special: Special suffixes that mainly attach to inflecting words. -# e.g. (楽し) さ, (考え) 方 -#名詞-接尾-特殊 -# -# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words -# together. -# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦) -#名詞-接続詞的 -# -# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are -# semantically verb-like. -# e.g. ごらん, ご覧, 御覧, 頂戴 -#名詞-動詞非自立的 -# -# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry, -# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation") -# is いわく ("iwaku"). -#名詞-引用文字列 -# -# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and -# behave like an adjective. -# e.g. 申し訳, 仕方, とんでも, 違い -#名詞-ナイ形容詞語幹 -# -##### -# prefix: unclassified prefixes -#接頭詞 -# -# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms) -# excluding numerical expressions. -# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派) -#接頭詞-名詞接続 -# -# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb -# in conjunctive form followed by なる/なさる/くださる. -# e.g. お (読みなさい), お (座り) -#接頭詞-動詞接続 -# -# prefix-adjectival: Prefixes that attach to adjectives. -# e.g. お (寒いですねえ), バカ (でかい) -#接頭詞-形容詞接続 -# -# prefix-numerical: Prefixes that attach to numerical expressions. -# e.g. 約, およそ, 毎時 -#接頭詞-数接続 -# -##### -# verb: unclassified verbs -#動詞 -# -# verb-main: -#動詞-自立 -# -# verb-auxiliary: -#動詞-非自立 -# -# verb-suffix: -#動詞-接尾 -# -##### -# adjective: unclassified adjectives -#形容詞 -# -# adjective-main: -#形容詞-自立 -# -# adjective-auxiliary: -#形容詞-非自立 -# -# adjective-suffix: -#形容詞-接尾 -# -##### -# adverb: unclassified adverbs -#副詞 -# -# adverb-misc: Words that can be segmented into one unit and where adnominal -# modification is not possible. -# e.g. あいかわらず, 多分 -#副詞-一般 -# -# adverb-particle_conjunction: Adverbs that can be followed by の, は, に, -# な, する, だ, etc. -# e.g. こんなに, そんなに, あんなに, なにか, なんでも -#副詞-助詞類接続 -# -##### -# adnominal: Words that only have noun-modifying forms. -# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう, -# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした, -# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き -#連体詞 -# -##### -# conjunction: Conjunctions that can occur independently. -# e.g. が, けれども, そして, じゃあ, それどころか -接続詞 -# -##### -# particle: unclassified particles. -助詞 -# -# particle-case: case particles where the subclassification is undefined. -助詞-格助詞 -# -# particle-case-misc: Case particles. -# e.g. から, が, で, と, に, へ, より, を, の, にて -助詞-格助詞-一般 -# -# particle-case-quote: the "to" that appears after nouns, a person’s speech, -# quotation marks, expressions of decisions from a meeting, reasons, judgements, -# conjectures, etc. -# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...) -助詞-格助詞-引用 -# -# particle-case-compound: Compounds of particles and verbs that mainly behave -# like case particles. -# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って, -# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける, -# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し, -# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして, -# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって, -# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る, -# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる, -# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ -助詞-格助詞-連語 -# -# particle-conjunctive: -# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども, -# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/, -# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/ -助詞-接続助詞 -# -# particle-dependency: -# e.g. こそ, さえ, しか, すら, は, も, ぞ -助詞-係助詞 -# -# particle-adverbial: -# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/, -# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/, -# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに, -# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/, -# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」) -助詞-副助詞 -# -# particle-interjective: particles with interjective grammatical roles. -# e.g. (松島) や -助詞-間投助詞 -# -# particle-coordinate: -# e.g. と, たり, だの, だり, とか, なり, や, やら -助詞-並立助詞 -# -# particle-final: -# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ, -# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/ -助詞-終助詞 -# -# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is -# adverbial, conjunctive, or sentence final. For example: -# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」 -# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」 -# 「(祈りが届いたせい) か (, 試験に合格した.)」 -# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」 -# e.g. か -助詞-副助詞/並立助詞/終助詞 -# -# particle-adnominalizer: The "no" that attaches to nouns and modifies -# non-inflectional words. -助詞-連体化 -# -# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs -# that are giongo, giseigo, or gitaigo. -# e.g. に, と -助詞-副詞化 -# -# particle-special: A particle that does not fit into one of the above classifications. -# This includes particles that are used in Tanka, Haiku, and other poetry. -# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家) -助詞-特殊 -# -##### -# auxiliary-verb: -助動詞 -# -##### -# interjection: Greetings and other exclamations. -# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます, -# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい -#感動詞 -# -##### -# symbol: unclassified Symbols. -記号 -# -# symbol-misc: A general symbol not in one of the categories below. -# e.g. [○◎@$〒→+] -記号-一般 -# -# symbol-comma: Commas -# e.g. [,、] -記号-読点 -# -# symbol-period: Periods and full stops. -# e.g. [..。] -記号-句点 -# -# symbol-space: Full-width whitespace. -記号-空白 -# -# symbol-open_bracket: -# e.g. [({‘“『【] -記号-括弧開 -# -# symbol-close_bracket: -# e.g. [)}’”』」】] -記号-括弧閉 -# -# symbol-alphabetic: -#記号-アルファベット -# -##### -# other: unclassified other -#その他 -# -# other-interjection: Words that are hard to classify as noun-suffixes or -# sentence-final particles. -# e.g. (だ)ァ -その他-間投 -# -##### -# filler: Aizuchi that occurs during a conversation or sounds inserted as filler. -# e.g. あの, うんと, えと -フィラー -# -##### -# non-verbal: non-verbal sound. -非言語音 -# -##### -# fragment: -#語断片 -# -##### -# unknown: unknown part of speech. -#未知語 -# -##### End of file diff --git a/docker/solr/conf/solr.languages/ja/stopwords_ja.txt b/docker/solr/conf/solr.languages/ja/stopwords_ja.txt deleted file mode 100644 index d4321be6b..000000000 --- a/docker/solr/conf/solr.languages/ja/stopwords_ja.txt +++ /dev/null @@ -1,127 +0,0 @@ -# -# This file defines a stopword set for Japanese. -# -# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia. -# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745 -# for frequency lists, etc. that can be useful for making your own set (if desired) -# -# Note that there is an overlap between these stopwords and the terms stopped when used -# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note -# that comments are not allowed on the same line as stopwords. -# -# Also note that stopping is done in a case-insensitive manner. Change your StopFilter -# configuration if you need case-sensitive stopping. Lastly, note that stopping is done -# using the same character width as the entries in this file. Since this StopFilter is -# normally done after a CJKWidthFilter in your chain, you would usually want your romaji -# entries to be in half-width and your kana entries to be in full-width. -# -の -に -は -を -た -が -で -て -と -し -れ -さ -ある -いる -も -する -から -な -こと -として -い -や -れる -など -なっ -ない -この -ため -その -あっ -よう -また -もの -という -あり -まで -られ -なる -へ -か -だ -これ -によって -により -おり -より -による -ず -なり -られる -において -ば -なかっ -なく -しかし -について -せ -だっ -その後 -できる -それ -う -ので -なお -のみ -でき -き -つ -における -および -いう -さらに -でも -ら -たり -その他 -に関する -たち -ます -ん -なら -に対して -特に -せる -及び -これら -とき -では -にて -ほか -ながら -うち -そして -とともに -ただし -かつて -それぞれ -または -お -ほど -ものの -に対する -ほとんど -と共に -といった -です -とも -ところ -ここ -##### End of file diff --git a/docker/solr/conf/solr.languages/nl/language-fieldtypes.xml b/docker/solr/conf/solr.languages/nl/language-fieldtypes.xml deleted file mode 100644 index 45e2448f8..000000000 --- a/docker/solr/conf/solr.languages/nl/language-fieldtypes.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/nl/stemdict_nl.txt b/docker/solr/conf/solr.languages/nl/stemdict_nl.txt deleted file mode 100644 index 441072971..000000000 --- a/docker/solr/conf/solr.languages/nl/stemdict_nl.txt +++ /dev/null @@ -1,6 +0,0 @@ -# Set of overrides for the dutch stemmer -# TODO: load this as a resource from the analyzer and sync it in build.xml -fiets fiets -bromfiets bromfiets -ei eier -kind kinder diff --git a/docker/solr/conf/solr.languages/nl/stopwords_nl.txt b/docker/solr/conf/solr.languages/nl/stopwords_nl.txt deleted file mode 100644 index 47a2aeacf..000000000 --- a/docker/solr/conf/solr.languages/nl/stopwords_nl.txt +++ /dev/null @@ -1,119 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Dutch stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | This is a ranked list (commonest to rarest) of stopwords derived from - | a large sample of Dutch text. - - | Dutch stop words frequently exhibit homonym clashes. These are indicated - | clearly below. - -de | the -en | and -van | of, from -ik | I, the ego -te | (1) chez, at etc, (2) to, (3) too -dat | that, which -die | that, those, who, which -in | in, inside -een | a, an, one -hij | he -het | the, it -niet | not, nothing, naught -zijn | (1) to be, being, (2) his, one's, its -is | is -was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river -op | on, upon, at, in, up, used up -aan | on, upon, to (as dative) -met | with, by -als | like, such as, when -voor | (1) before, in front of, (2) furrow -had | had, past tense all persons sing. of 'hebben' (have) -er | there -maar | but, only -om | round, about, for etc -hem | him -dan | then -zou | should/would, past tense all persons sing. of 'zullen' -of | or, whether, if -wat | what, something, anything -mijn | possessive and noun 'mine' -men | people, 'one' -dit | this -zo | so, thus, in this way -door | through by -over | over, across -ze | she, her, they, them -zich | oneself -bij | (1) a bee, (2) by, near, at -ook | also, too -tot | till, until -je | you -mij | me -uit | out of, from -der | Old Dutch form of 'van der' still found in surnames -daar | (1) there, (2) because -haar | (1) her, their, them, (2) hair -naar | (1) unpleasant, unwell etc, (2) towards, (3) as -heb | present first person sing. of 'to have' -hoe | how, why -heeft | present third person sing. of 'to have' -hebben | 'to have' and various parts thereof -deze | this -u | you -want | (1) for, (2) mitten, (3) rigging -nog | yet, still -zal | 'shall', first and third person sing. of verb 'zullen' (will) -me | me -zij | she, they -nu | now -ge | 'thou', still used in Belgium and south Netherlands -geen | none -omdat | because -iets | something, somewhat -worden | to become, grow, get -toch | yet, still -al | all, every, each -waren | (1) 'were' (2) to wander, (3) wares, (3) -veel | much, many -meer | (1) more, (2) lake -doen | to do, to make -toen | then, when -moet | noun 'spot/mote' and present form of 'to must' -ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' -zonder | without -kan | noun 'can' and present form of 'to be able' -hun | their, them -dus | so, consequently -alles | all, everything, anything -onder | under, beneath -ja | yes, of course -eens | once, one day -hier | here -wie | who -werd | imperfect third person sing. of 'become' -altijd | always -doch | yet, but etc -wordt | present third person sing. of 'become' -wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans -kunnen | to be able -ons | us/our -zelf | self -tegen | against, towards, at -na | after, near -reeds | already -wil | (1) present tense of 'want', (2) 'will', noun, (3) fender -kon | could; past tense of 'to be able' -niets | nothing -uw | your -iemand | somebody -geweest | been; past participle of 'be' -andere | other diff --git a/docker/solr/conf/solr.languages/no/language-fieldtypes.xml b/docker/solr/conf/solr.languages/no/language-fieldtypes.xml deleted file mode 100644 index ee6f092bc..000000000 --- a/docker/solr/conf/solr.languages/no/language-fieldtypes.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/no/stopwords_no.txt b/docker/solr/conf/solr.languages/no/stopwords_no.txt deleted file mode 100644 index a7a2c28ba..000000000 --- a/docker/solr/conf/solr.languages/no/stopwords_no.txt +++ /dev/null @@ -1,194 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Norwegian stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - | This stop word list is for the dominant bokmål dialect. Words unique - | to nynorsk are marked *. - - | Revised by Jan Bruusgaard , Jan 2005 - -og | and -i | in -jeg | I -det | it/this/that -at | to (w. inf.) -en | a/an -et | a/an -den | it/this/that -til | to -er | is/am/are -som | who/that -på | on -de | they / you(formal) -med | with -han | he -av | of -ikke | not -ikkje | not * -der | there -så | so -var | was/were -meg | me -seg | you -men | but -ett | one -har | have -om | about -vi | we -min | my -mitt | my -ha | have -hadde | had -hun | she -nå | now -over | over -da | when/as -ved | by/know -fra | from -du | you -ut | out -sin | your -dem | them -oss | us -opp | up -man | you/one -kan | can -hans | his -hvor | where -eller | or -hva | what -skal | shall/must -selv | self (reflective) -sjøl | self (reflective) -her | here -alle | all -vil | will -bli | become -ble | became -blei | became * -blitt | have become -kunne | could -inn | in -når | when -være | be -kom | come -noen | some -noe | some -ville | would -dere | you -som | who/which/that -deres | their/theirs -kun | only/just -ja | yes -etter | after -ned | down -skulle | should -denne | this -for | for/because -deg | you -si | hers/his -sine | hers/his -sitt | hers/his -mot | against -å | to -meget | much -hvorfor | why -dette | this -disse | these/those -uten | without -hvordan | how -ingen | none -din | your -ditt | your -blir | become -samme | same -hvilken | which -hvilke | which (plural) -sånn | such a -inni | inside/within -mellom | between -vår | our -hver | each -hvem | who -vors | us/ours -hvis | whose -både | both -bare | only/just -enn | than -fordi | as/because -før | before -mange | many -også | also -slik | just -vært | been -være | to be -båe | both * -begge | both -siden | since -dykk | your * -dykkar | yours * -dei | they * -deira | them * -deires | theirs * -deim | them * -di | your (fem.) * -då | as/when * -eg | I * -ein | a/an * -eit | a/an * -eitt | a/an * -elles | or * -honom | he * -hjå | at * -ho | she * -hoe | she * -henne | her -hennar | her/hers -hennes | hers -hoss | how * -hossen | how * -ikkje | not * -ingi | noone * -inkje | noone * -korleis | how * -korso | how * -kva | what/which * -kvar | where * -kvarhelst | where * -kven | who/whom * -kvi | why * -kvifor | why * -me | we * -medan | while * -mi | my * -mine | my * -mykje | much * -no | now * -nokon | some (masc./neut.) * -noka | some (fem.) * -nokor | some * -noko | some * -nokre | some * -si | his/hers * -sia | since * -sidan | since * -so | so * -somt | some * -somme | some * -um | about* -upp | up * -vere | be * -vore | was * -verte | become * -vort | become * -varte | became * -vart | became * - diff --git a/docker/solr/conf/solr.languages/pt/language-fieldtypes.xml b/docker/solr/conf/solr.languages/pt/language-fieldtypes.xml deleted file mode 100644 index 82187b7ac..000000000 --- a/docker/solr/conf/solr.languages/pt/language-fieldtypes.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/pt/stopwords_pt.txt b/docker/solr/conf/solr.languages/pt/stopwords_pt.txt deleted file mode 100644 index acfeb01af..000000000 --- a/docker/solr/conf/solr.languages/pt/stopwords_pt.txt +++ /dev/null @@ -1,253 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | A Portuguese stop word list. Comments begin with vertical bar. Each stop - | word is at the start of a line. - - - | The following is a ranked list (commonest to rarest) of stopwords - | deriving from a large sample of text. - - | Extra words have been added at the end. - -de | of, from -a | the; to, at; her -o | the; him -que | who, that -e | and -do | de + o -da | de + a -em | in -um | a -para | for - | é from SER -com | with -não | not, no -uma | a -os | the; them -no | em + o -se | himself etc -na | em + a -por | for -mais | more -as | the; them -dos | de + os -como | as, like -mas | but - | foi from SER -ao | a + o -ele | he -das | de + as - | tem from TER -à | a + a -seu | his -sua | her -ou | or - | ser from SER -quando | when -muito | much - | há from HAV -nos | em + os; us -já | already, now - | está from EST -eu | I -também | also -só | only, just -pelo | per + o -pela | per + a -até | up to -isso | that -ela | he -entre | between - | era from SER -depois | after -sem | without -mesmo | same -aos | a + os - | ter from TER -seus | his -quem | whom -nas | em + as -me | me -esse | that -eles | they - | estão from EST -você | you - | tinha from TER - | foram from SER -essa | that -num | em + um -nem | nor -suas | her -meu | my -às | a + as -minha | my - | têm from TER -numa | em + uma -pelos | per + os -elas | they - | havia from HAV - | seja from SER -qual | which - | será from SER -nós | we - | tenho from TER -lhe | to him, her -deles | of them -essas | those -esses | those -pelas | per + as -este | this - | fosse from SER -dele | of him - - | other words. There are many contractions such as naquele = em+aquele, - | mo = me+o, but they are rare. - | Indefinite article plural forms are also rare. - -tu | thou -te | thee -vocês | you (plural) -vos | you -lhes | to them -meus | my -minhas -teu | thy -tua -teus -tuas -nosso | our -nossa -nossos -nossas - -dela | of her -delas | of them - -esta | this -estes | these -estas | these -aquele | that -aquela | that -aqueles | those -aquelas | those -isto | this -aquilo | that - - | forms of estar, to be (not including the infinitive): -estou -está -estamos -estão -estive -esteve -estivemos -estiveram -estava -estávamos -estavam -estivera -estivéramos -esteja -estejamos -estejam -estivesse -estivéssemos -estivessem -estiver -estivermos -estiverem - - | forms of haver, to have (not including the infinitive): -hei -há -havemos -hão -houve -houvemos -houveram -houvera -houvéramos -haja -hajamos -hajam -houvesse -houvéssemos -houvessem -houver -houvermos -houverem -houverei -houverá -houveremos -houverão -houveria -houveríamos -houveriam - - | forms of ser, to be (not including the infinitive): -sou -somos -são -era -éramos -eram -fui -foi -fomos -foram -fora -fôramos -seja -sejamos -sejam -fosse -fôssemos -fossem -for -formos -forem -serei -será -seremos -serão -seria -seríamos -seriam - - | forms of ter, to have (not including the infinitive): -tenho -tem -temos -tém -tinha -tínhamos -tinham -tive -teve -tivemos -tiveram -tivera -tivéramos -tenha -tenhamos -tenham -tivesse -tivéssemos -tivessem -tiver -tivermos -tiverem -terei -terá -teremos -terão -teria -teríamos -teriam diff --git a/docker/solr/conf/solr.languages/ru/language-fieldtypes.xml b/docker/solr/conf/solr.languages/ru/language-fieldtypes.xml deleted file mode 100644 index 371b1e9f7..000000000 --- a/docker/solr/conf/solr.languages/ru/language-fieldtypes.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - diff --git a/docker/solr/conf/solr.languages/ru/stopwords_ru.txt b/docker/solr/conf/solr.languages/ru/stopwords_ru.txt deleted file mode 100644 index 55271400c..000000000 --- a/docker/solr/conf/solr.languages/ru/stopwords_ru.txt +++ /dev/null @@ -1,243 +0,0 @@ - | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt - | This file is distributed under the BSD License. - | See http://snowball.tartarus.org/license.php - | Also see http://www.opensource.org/licenses/bsd-license.html - | - Encoding was converted to UTF-8. - | - This notice was added. - | - | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - - | a russian stop word list. comments begin with vertical bar. each stop - | word is at the start of a line. - - | this is a ranked list (commonest to rarest) of stopwords derived from - | a large text sample. - - | letter `ё' is translated to `е'. - -и | and -в | in/into -во | alternative form -не | not -что | what/that -он | he -на | on/onto -я | i -с | from -со | alternative form -как | how -а | milder form of `no' (but) -то | conjunction and form of `that' -все | all -она | she -так | so, thus -его | him -но | but -да | yes/and -ты | thou -к | towards, by -у | around, chez -же | intensifier particle -вы | you -за | beyond, behind -бы | conditional/subj. particle -по | up to, along -только | only -ее | her -мне | to me -было | it was -вот | here is/are, particle -от | away from -меня | me -еще | still, yet, more -нет | no, there isnt/arent -о | about -из | out of -ему | to him -теперь | now -когда | when -даже | even -ну | so, well -вдруг | suddenly -ли | interrogative particle -если | if -уже | already, but homonym of `narrower' -или | or -ни | neither -быть | to be -был | he was -него | prepositional form of его -до | up to -вас | you accusative -нибудь | indef. suffix preceded by hyphen -опять | again -уж | already, but homonym of `adder' -вам | to you -сказал | he said -ведь | particle `after all' -там | there -потом | then -себя | oneself -ничего | nothing -ей | to her -может | usually with `быть' as `maybe' -они | they -тут | here -где | where -есть | there is/are -надо | got to, must -ней | prepositional form of ей -для | for -мы | we -тебя | thee -их | them, their -чем | than -была | she was -сам | self -чтоб | in order to -без | without -будто | as if -человек | man, person, one -чего | genitive form of `what' -раз | once -тоже | also -себе | to oneself -под | beneath -жизнь | life -будет | will be -ж | short form of intensifer particle `же' -тогда | then -кто | who -этот | this -говорил | was saying -того | genitive form of `that' -потому | for that reason -этого | genitive form of `this' -какой | which -совсем | altogether -ним | prepositional form of `его', `они' -здесь | here -этом | prepositional form of `этот' -один | one -почти | almost -мой | my -тем | instrumental/dative plural of `тот', `то' -чтобы | full form of `in order that' -нее | her (acc.) -кажется | it seems -сейчас | now -были | they were -куда | where to -зачем | why -сказать | to say -всех | all (acc., gen. preposn. plural) -никогда | never -сегодня | today -можно | possible, one can -при | by -наконец | finally -два | two -об | alternative form of `о', about -другой | another -хоть | even -после | after -над | above -больше | more -тот | that one (masc.) -через | across, in -эти | these -нас | us -про | about -всего | in all, only, of all -них | prepositional form of `они' (they) -какая | which, feminine -много | lots -разве | interrogative particle -сказала | she said -три | three -эту | this, acc. fem. sing. -моя | my, feminine -впрочем | moreover, besides -хорошо | good -свою | ones own, acc. fem. sing. -этой | oblique form of `эта', fem. `this' -перед | in front of -иногда | sometimes -лучше | better -чуть | a little -том | preposn. form of `that one' -нельзя | one must not -такой | such a one -им | to them -более | more -всегда | always -конечно | of course -всю | acc. fem. sing of `all' -между | between - - - | b: some paradigms - | - | personal pronouns - | - | я меня мне мной [мною] - | ты тебя тебе тобой [тобою] - | он его ему им [него, нему, ним] - | она ее эи ею [нее, нэи, нею] - | оно его ему им [него, нему, ним] - | - | мы нас нам нами - | вы вас вам вами - | они их им ими [них, ним, ними] - | - | себя себе собой [собою] - | - | demonstrative pronouns: этот (this), тот (that) - | - | этот эта это эти - | этого эты это эти - | этого этой этого этих - | этому этой этому этим - | этим этой этим [этою] этими - | этом этой этом этих - | - | тот та то те - | того ту то те - | того той того тех - | тому той тому тем - | тем той тем [тою] теми - | том той том тех - | - | determinative pronouns - | - | (a) весь (all) - | - | весь вся все все - | всего всю все все - | всего всей всего всех - | всему всей всему всем - | всем всей всем [всею] всеми - | всем всей всем всех - | - | (b) сам (himself etc) - | - | сам сама само сами - | самого саму само самих - | самого самой самого самих - | самому самой самому самим - | самим самой самим [самою] самими - | самом самой самом самих - | - | stems of verbs `to be', `to have', `to do' and modal - | - | быть бы буд быв есть суть - | име - | дел - | мог мож мочь - | уме - | хоч хот - | долж - | можн - | нужн - | нельзя -