From 9a4801894e440bd7ecdaa5ccbda6c2325a63d194 Mon Sep 17 00:00:00 2001 From: Ebrahim Byagowi Date: Sat, 20 Jul 2024 11:23:44 +0330 Subject: [PATCH] Use Lucene provided Persian stem Lucene provided Persian stem apparently isn't hooked yet and this change is doing that based on what is done for Arabic stem support. --- CHANGELOG.md | 1 + .../common/CommonAnalysisModulePlugin.java | 2 + .../common/PersianStemTokenFilterFactory.java | 52 +++++++++++++++++++ .../common/CommonAnalysisFactoryTests.java | 2 + .../analysis/AnalysisFactoryTestCase.java | 1 + 5 files changed, 58 insertions(+) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 29e70c5026bb8..e87b92a3f5ccb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add Plugin interface for loading application based configuration templates (([#14659](https://github.com/opensearch-project/OpenSearch/issues/14659))) - Refactor remote-routing-table service inline with remote state interfaces([#14668](https://github.com/opensearch-project/OpenSearch/pull/14668)) - Add prefix mode verification setting for repository verification (([#14790](https://github.com/opensearch-project/OpenSearch/pull/14790))) +- Use Lucene provided Persian stem support (([#14847](https://github.com/opensearch-project/OpenSearch/pull/14847))) ### Dependencies - Bump `org.gradle.test-retry` from 1.5.8 to 1.5.9 ([#13442](https://github.com/opensearch-project/OpenSearch/pull/13442)) diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index cf2736a8583d2..9896e694af2c4 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -315,6 +315,7 @@ public Map> getTokenFilters() { filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); + filters.put("persian_stem", PersianStemTokenFilterFactory::new); filters.put("porter_stem", PorterStemTokenFilterFactory::new); filters.put( "predicate_token_filter", @@ -558,6 +559,7 @@ public List getPreConfiguredTokenFilters() { ); })); filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new)); + filters.add(PreConfiguredTokenFilter.singleton("persian_stem", true, PersianStemTokenFilterFactory::new)); filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian"))); diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java new file mode 100644 index 0000000000000..afe8058343e17 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fa.PersianStemFilter; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenFilterFactory; + +public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory { + + PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new PersianStemFilter(tokenStream); + } +} diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 11713f52f5b18..7e3140f8bcba3 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -158,6 +158,7 @@ protected Map> getTokenFilters() { filters.put("brazilianstem", BrazilianStemTokenFilterFactory.class); filters.put("czechstem", CzechStemTokenFilterFactory.class); filters.put("germanstem", GermanStemTokenFilterFactory.class); + filters.put("persianstem", PersianStemTokenFilterFactory.class); filters.put("telugunormalization", TeluguNormalizationFilterFactory.class); filters.put("telugustem", TeluguStemFilterFactory.class); // this filter is not exposed and should only be used internally @@ -220,6 +221,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("ngram", null); filters.put("nGram", null); filters.put("persian_normalization", null); + filters.put("persian_stem", null); filters.put("porter_stem", null); filters.put("reverse", ReverseStringFilterFactory.class); filters.put("russian_stem", SnowballPorterFilterFactory.class); diff --git a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java index 5231fe095f0f0..e762ae3afcecc 100644 --- a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java @@ -139,6 +139,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("patterncapturegroup", MovedToAnalysisCommon.class) .put("patternreplace", MovedToAnalysisCommon.class) .put("persiannormalization", MovedToAnalysisCommon.class) + .put("persianstem", MovedToAnalysisCommon.class) .put("porterstem", MovedToAnalysisCommon.class) .put("portuguesestem", MovedToAnalysisCommon.class) .put("portugueselightstem", MovedToAnalysisCommon.class)