From 3bdab50fb6f2c1bc91e37a9ad7a54bc72eff790a Mon Sep 17 00:00:00 2001 From: Nick Knize Date: Wed, 12 Oct 2022 13:32:27 -0500 Subject: [PATCH] Better plural stemmer than minimal_english (#4738) Drops the trailing "e" in taxes, dresses, watches, dishes etc that otherwise cause mismatches with plural and singular forms. Signed-off-by: Nicholas Walter Knize Co-authored-by: Mark Harwood Co-authored-by: Nicholas Walter Knize (cherry picked from commit c92846d0ae121db54ce740790df75b36a24f804e) --- CHANGELOG.md | 2 + .../common/EnglishPluralStemFilter.java | 182 ++++++++++++++++++ .../common/StemmerTokenFilterFactory.java | 2 + .../StemmerTokenFilterFactoryTests.java | 77 ++++++++ 4 files changed, 263 insertions(+) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java diff --git a/CHANGELOG.md b/CHANGELOG.md index acae8b94973ee..a2d03b2c944a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - [Segment Replication] Fix timeout issue by calculating time needed to process getSegmentFiles ([#4434](https://github.com/opensearch-project/OpenSearch/pull/4434)) - [Segment Replication] Update replicas to commit SegmentInfos instead of relying on segments_N from primary shards ([#4450](https://github.com/opensearch-project/OpenSearch/pull/4450)) - [Segment Replication] Adding check to make sure checkpoint is not processed when a shard's shard routing is primary ([#4716](https://github.com/opensearch-project/OpenSearch/pull/4716)) +- Better plural stemmer than minimal_english ([#4738](https://github.com/opensearch-project/OpenSearch/pull/4738)) + ### Security [Unreleased]: https://github.com/opensearch-project/OpenSearch/compare/2.2.0...HEAD diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java new file mode 100644 index 0000000000000..c30318a31527b --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java @@ -0,0 +1,182 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; + +import java.io.IOException; + +public final class EnglishPluralStemFilter extends TokenFilter { + private final EnglishPluralStemmer stemmer = new EnglishPluralStemmer(); + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class); + + public EnglishPluralStemFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (!keywordAttr.isKeyword()) { + final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length()); + termAtt.setLength(newlen); + } + return true; + } else { + return false; + } + } + + /** + * Plural stemmer for English based on the {@link EnglishMinimalStemFilter} + *

+ * This stemmer removes plurals but beyond EnglishMinimalStemFilter adds + * four new suffix rules to remove dangling e characters: + *

    + *
  • xes - "boxes" becomes "box"
  • + *
  • sses - "dresses" becomes "dress"
  • + *
  • shes - "dishes" becomes "dish"
  • + *
  • tches - "watches" becomes "watch"
  • + *
+ * See https://github.com/elastic/elasticsearch/issues/42892 + *

+ * In addition the s stemmer logic is amended so that + *

    + *
  • ees->ee so that bees matches bee
  • + *
  • ies->y only on longer words to that ties matches tie
  • + *
  • oes->o rule so that tomatoes matches tomato but retains e for some words eg shoes to shoe
  • + *
+ */ + public static class EnglishPluralStemmer { + + // Words ending in oes that retain the e when stemmed + public static final char[][] oesExceptions = { "shoes".toCharArray(), "canoes".toCharArray(), "oboes".toCharArray() }; + // Words ending in ches that retain the e when stemmed + public static final char[][] chesExceptions = { + "cliches".toCharArray(), + "avalanches".toCharArray(), + "mustaches".toCharArray(), + "moustaches".toCharArray(), + "quiches".toCharArray(), + "headaches".toCharArray(), + "heartaches".toCharArray(), + "porsches".toCharArray(), + "tranches".toCharArray(), + "caches".toCharArray() }; + + @SuppressWarnings("fallthrough") + public int stem(char s[], int len) { + if (len < 3 || s[len - 1] != 's') return len; + + switch (s[len - 2]) { + case 'u': + case 's': + return len; + case 'e': + // Modified ies->y logic from original s-stemmer - only work on strings > 4 + // so spies -> spy still but pies->pie. + // The original code also special-cased aies and eies for no good reason as far as I can tell. + // ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies ) + if (len > 4 && s[len - 3] == 'i') { + s[len - 3] = 'y'; + return len - 2; + } + + // Suffix rules to remove any dangling "e" + if (len > 3) { + // xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe") + if (len > 4 && s[len - 3] == 'x') { + return len - 2; + } + // oes + if (len > 3 && s[len - 3] == 'o') { + if (isException(s, len, oesExceptions)) { + // Only remove the S + return len - 1; + } + // Remove the es + return len - 2; + } + if (len > 4) { + // shes/sses + if (s[len - 4] == 's' && (s[len - 3] == 'h' || s[len - 3] == 's')) { + return len - 2; + } + + // ches + if (len > 4) { + if (s[len - 4] == 'c' && s[len - 3] == 'h') { + if (isException(s, len, chesExceptions)) { + // Only remove the S + return len - 1; + } + // Remove the es + return len - 2; + + } + } + } + } + + default: + return len - 1; + } + } + + private boolean isException(char[] s, int len, char[][] exceptionsList) { + for (char[] oesRule : exceptionsList) { + int rulePos = oesRule.length - 1; + int sPos = len - 1; + boolean matched = true; + while (rulePos >= 0 && sPos >= 0) { + if (oesRule[rulePos] != s[sPos]) { + matched = false; + break; + } + rulePos--; + sPos--; + } + if (matched) { + return true; + } + } + return false; + } + } + +} diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java index 5d96f01265cf6..fc045447e159e 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/StemmerTokenFilterFactory.java @@ -154,6 +154,8 @@ public TokenStream create(TokenStream tokenStream) { return new SnowballFilter(tokenStream, new EnglishStemmer()); } else if ("minimal_english".equalsIgnoreCase(language) || "minimalEnglish".equalsIgnoreCase(language)) { return new EnglishMinimalStemFilter(tokenStream); + } else if ("plural_english".equalsIgnoreCase(language) || "pluralEnglish".equalsIgnoreCase(language)) { + return new EnglishPluralStemFilter(tokenStream); } else if ("possessive_english".equalsIgnoreCase(language) || "possessiveEnglish".equalsIgnoreCase(language)) { return new EnglishPossessiveFilter(tokenStream); diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerTokenFilterFactoryTests.java index fca64f4915cbf..faaf6136448f0 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -111,6 +111,83 @@ public void testPorter2FilterFactory() throws IOException { } } + public void testEnglishPluralFilter() throws IOException { + int iters = scaledRandomIntBetween(20, 100); + for (int i = 0; i < iters; i++) { + + Version v = VersionUtils.randomVersion(random()); + Settings settings = Settings.builder() + .put("index.analysis.filter.my_plurals.type", "stemmer") + .put("index.analysis.filter.my_plurals.language", "plural_english") + .put("index.analysis.analyzer.my_plurals.tokenizer", "whitespace") + .put("index.analysis.analyzer.my_plurals.filter", "my_plurals") + .put(SETTING_VERSION_CREATED, v) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + + OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_plurals"); + assertThat(tokenFilter, instanceOf(StemmerTokenFilterFactory.class)); + Tokenizer tokenizer = new WhitespaceTokenizer(); + tokenizer.setReader(new StringReader("dresses")); + TokenStream create = tokenFilter.create(tokenizer); + IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; + NamedAnalyzer analyzer = indexAnalyzers.get("my_plurals"); + assertThat(create, instanceOf(EnglishPluralStemFilter.class)); + + // Check old EnglishMinimalStemmer ("S" stemmer) logic + assertAnalyzesTo(analyzer, "phones", new String[] { "phone" }); + assertAnalyzesTo(analyzer, "horses", new String[] { "horse" }); + assertAnalyzesTo(analyzer, "cameras", new String[] { "camera" }); + + // The orginal s stemmer gives up on stemming oes words because English has no fixed rule for the stem + // (see https://howtospell.co.uk/making-O-words-plural ) + // This stemmer removes the es but retains e for a small number of exceptions + assertAnalyzesTo(analyzer, "mosquitoes", new String[] { "mosquito" }); + assertAnalyzesTo(analyzer, "heroes", new String[] { "hero" }); + // oes exceptions that retain the e. + assertAnalyzesTo(analyzer, "shoes", new String[] { "shoe" }); + assertAnalyzesTo(analyzer, "horseshoes", new String[] { "horseshoe" }); + assertAnalyzesTo(analyzer, "canoes", new String[] { "canoe" }); + assertAnalyzesTo(analyzer, "oboes", new String[] { "oboe" }); + + // Check improved EnglishPluralStemFilter logic + // sses + assertAnalyzesTo(analyzer, "dresses", new String[] { "dress" }); + assertAnalyzesTo(analyzer, "possess", new String[] { "possess" }); + assertAnalyzesTo(analyzer, "possesses", new String[] { "possess" }); + // xes + assertAnalyzesTo(analyzer, "boxes", new String[] { "box" }); + assertAnalyzesTo(analyzer, "axes", new String[] { "axe" }); + // shes + assertAnalyzesTo(analyzer, "dishes", new String[] { "dish" }); + assertAnalyzesTo(analyzer, "washes", new String[] { "wash" }); + // ees + assertAnalyzesTo(analyzer, "employees", new String[] { "employee" }); + assertAnalyzesTo(analyzer, "bees", new String[] { "bee" }); + // tch + assertAnalyzesTo(analyzer, "watches", new String[] { "watch" }); + assertAnalyzesTo(analyzer, "itches", new String[] { "itch" }); + // ies->y but only for length >4 + assertAnalyzesTo(analyzer, "spies", new String[] { "spy" }); + assertAnalyzesTo(analyzer, "ties", new String[] { "tie" }); + assertAnalyzesTo(analyzer, "lies", new String[] { "lie" }); + assertAnalyzesTo(analyzer, "pies", new String[] { "pie" }); + assertAnalyzesTo(analyzer, "dies", new String[] { "die" }); + + assertAnalyzesTo(analyzer, "lunches", new String[] { "lunch" }); + assertAnalyzesTo(analyzer, "avalanches", new String[] { "avalanche" }); + assertAnalyzesTo(analyzer, "headaches", new String[] { "headache" }); + assertAnalyzesTo(analyzer, "caches", new String[] { "cache" }); + assertAnalyzesTo(analyzer, "beaches", new String[] { "beach" }); + assertAnalyzesTo(analyzer, "britches", new String[] { "britch" }); + assertAnalyzesTo(analyzer, "cockroaches", new String[] { "cockroach" }); + assertAnalyzesTo(analyzer, "cliches", new String[] { "cliche" }); + assertAnalyzesTo(analyzer, "quiches", new String[] { "quiche" }); + + } + } + public void testMultipleLanguagesThrowsException() throws IOException { Version v = VersionUtils.randomVersion(random()); Settings settings = Settings.builder()