From f49c74840f41f3be2fa8471938f32256c2bb0656 Mon Sep 17 00:00:00 2001 From: Ebrahim Byagowi Date: Sat, 20 Jul 2024 11:23:44 +0330 Subject: [PATCH] Use Lucene provided Persian stem Lucene provided Persian stem apparently isn't hooked yet and this change is doing that based on what is done for Arabic stem support. --- CHANGELOG-3.0.md | 1 + .../common/CommonAnalysisModulePlugin.java | 1 + .../common/PersianStemTokenFilterFactory.java | 52 +++++++++++++++++++ .../common/CommonAnalysisFactoryTests.java | 2 + .../analysis/AnalysisFactoryTestCase.java | 1 + 5 files changed, 57 insertions(+) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java diff --git a/CHANGELOG-3.0.md b/CHANGELOG-3.0.md index 06b761b1df8bd..0afc8c91b8d45 100644 --- a/CHANGELOG-3.0.md +++ b/CHANGELOG-3.0.md @@ -13,6 +13,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - GHA to verify checklist items completion in PR descriptions ([#10800](https://github.com/opensearch-project/OpenSearch/pull/10800)) - Allow to pass the list settings through environment variables (like [], ["a", "b", "c"], ...) ([#10625](https://github.com/opensearch-project/OpenSearch/pull/10625)) - Views, simplify data access and manipulation by providing a virtual layer over one or more indices ([#11957](https://github.com/opensearch-project/OpenSearch/pull/11957)) +- Use Lucene provided Persian stem support (([#14847](https://github.com/opensearch-project/OpenSearch/pull/14847))) ### Dependencies diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index cf2736a8583d2..9a728eea02108 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -315,6 +315,7 @@ public Map> getTokenFilters() { filters.put("pattern_capture", requiresAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); filters.put("pattern_replace", requiresAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("persian_normalization", PersianNormalizationFilterFactory::new); + filters.put("persian_stem", PersianStemTokenFilterFactory::new); filters.put("porter_stem", PorterStemTokenFilterFactory::new); filters.put( "predicate_token_filter", diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java new file mode 100644 index 0000000000000..afe8058343e17 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/PersianStemTokenFilterFactory.java @@ -0,0 +1,52 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Modifications Copyright OpenSearch Contributors. See + * GitHub history for details. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.fa.PersianStemFilter; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenFilterFactory; + +public class PersianStemTokenFilterFactory extends AbstractTokenFilterFactory { + + PersianStemTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new PersianStemFilter(tokenStream); + } +} diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 11713f52f5b18..7e3140f8bcba3 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -158,6 +158,7 @@ protected Map> getTokenFilters() { filters.put("brazilianstem", BrazilianStemTokenFilterFactory.class); filters.put("czechstem", CzechStemTokenFilterFactory.class); filters.put("germanstem", GermanStemTokenFilterFactory.class); + filters.put("persianstem", PersianStemTokenFilterFactory.class); filters.put("telugunormalization", TeluguNormalizationFilterFactory.class); filters.put("telugustem", TeluguStemFilterFactory.class); // this filter is not exposed and should only be used internally @@ -220,6 +221,7 @@ protected Map> getPreConfiguredTokenFilters() { filters.put("ngram", null); filters.put("nGram", null); filters.put("persian_normalization", null); + filters.put("persian_stem", null); filters.put("porter_stem", null); filters.put("reverse", ReverseStringFilterFactory.class); filters.put("russian_stem", SnowballPorterFilterFactory.class); diff --git a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java index 5231fe095f0f0..e762ae3afcecc 100644 --- a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java @@ -139,6 +139,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("patterncapturegroup", MovedToAnalysisCommon.class) .put("patternreplace", MovedToAnalysisCommon.class) .put("persiannormalization", MovedToAnalysisCommon.class) + .put("persianstem", MovedToAnalysisCommon.class) .put("porterstem", MovedToAnalysisCommon.class) .put("portuguesestem", MovedToAnalysisCommon.class) .put("portugueselightstem", MovedToAnalysisCommon.class)