From 11836d00a84d4343b6bf237ef6952676fdcd92e5 Mon Sep 17 00:00:00 2001 From: Tatsuya Kawakami <43780506+hogesako@users.noreply.github.com> Date: Wed, 6 Mar 2024 01:05:39 +0900 Subject: [PATCH] Add kuromoji_completion analyzer and filter (#4835) (#12287) * Add kuromoji_completion analyzer and filter (#4835) Signed-off-by: Tatsuya Kawakami <43780506+hogesako@users.noreply.github.com> * Use INDEX mode if an invalid value is set for mode in the kuromoji_completion filter Signed-off-by: Tatsuya Kawakami <43780506+hogesako@users.noreply.github.com> --------- Signed-off-by: Tatsuya Kawakami <43780506+hogesako@users.noreply.github.com> --- CHANGELOG.md | 1 + .../KuromojiCompletionAnalyzerProvider.java | 34 +++++++++ .../KuromojiCompletionFilterFactory.java | 42 +++++++++++ .../kuromoji/AnalysisKuromojiPlugin.java | 8 ++- .../AnalysisKuromojiFactoryTests.java | 1 + .../index/analysis/KuromojiAnalysisTests.java | 69 +++++++++++++++++++ .../index/analysis/kuromoji_analysis.json | 16 +++++ .../test/analysis_kuromoji/10_basic.yml | 30 ++++++++ 8 files changed, 200 insertions(+), 1 deletion(-) create mode 100644 plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionAnalyzerProvider.java create mode 100644 plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionFilterFactory.java diff --git a/CHANGELOG.md b/CHANGELOG.md index ea776fe0c9237..e91e766b6ab38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -105,6 +105,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add toString methods to MultiSearchRequest, MultiGetRequest and CreateIndexRequest ([#12163](https://github.com/opensearch-project/OpenSearch/pull/12163)) - Support for returning scores in matched queries ([#11626](https://github.com/opensearch-project/OpenSearch/pull/11626)) - Add shard id property to SearchLookup for use in field types provided by plugins ([#1063](https://github.com/opensearch-project/OpenSearch/pull/1063)) +- Add kuromoji_completion analyzer and filter ([#4835](https://github.com/opensearch-project/OpenSearch/issues/4835)) ### Dependencies - Bump `peter-evans/find-comment` from 2 to 3 ([#12288](https://github.com/opensearch-project/OpenSearch/pull/12288)) diff --git a/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionAnalyzerProvider.java b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionAnalyzerProvider.java new file mode 100644 index 0000000000000..314daab1801a6 --- /dev/null +++ b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionAnalyzerProvider.java @@ -0,0 +1,34 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.analysis; + +import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer; +import org.apache.lucene.analysis.ja.JapaneseCompletionFilter; +import org.apache.lucene.analysis.ja.dict.UserDictionary; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; + +public class KuromojiCompletionAnalyzerProvider extends AbstractIndexAnalyzerProvider { + + private final JapaneseCompletionAnalyzer analyzer; + + public KuromojiCompletionAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + final JapaneseCompletionFilter.Mode mode = KuromojiCompletionFilterFactory.getMode(settings); + final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings); + analyzer = new JapaneseCompletionAnalyzer(userDictionary, mode); + } + + @Override + public JapaneseCompletionAnalyzer get() { + return this.analyzer; + } + +} diff --git a/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionFilterFactory.java new file mode 100644 index 0000000000000..1459c19de46db --- /dev/null +++ b/plugins/analysis-kuromoji/src/main/java/org/opensearch/index/analysis/KuromojiCompletionFilterFactory.java @@ -0,0 +1,42 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ja.JapaneseCompletionFilter; +import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; + +public class KuromojiCompletionFilterFactory extends AbstractTokenFilterFactory { + private final Mode mode; + + public KuromojiCompletionFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + this.mode = getMode(settings); + } + + public static Mode getMode(Settings settings) { + String modeSetting = settings.get("mode", null); + if (modeSetting != null) { + if ("index".equalsIgnoreCase(modeSetting)) { + return Mode.INDEX; + } else if ("query".equalsIgnoreCase(modeSetting)) { + return Mode.QUERY; + } + } + return Mode.INDEX; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new JapaneseCompletionFilter(tokenStream, mode); + } +} diff --git a/plugins/analysis-kuromoji/src/main/java/org/opensearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/plugins/analysis-kuromoji/src/main/java/org/opensearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java index 76d3df8c2e76c..c429e8e4dd830 100644 --- a/plugins/analysis-kuromoji/src/main/java/org/opensearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java +++ b/plugins/analysis-kuromoji/src/main/java/org/opensearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java @@ -38,6 +38,8 @@ import org.opensearch.index.analysis.JapaneseStopTokenFilterFactory; import org.opensearch.index.analysis.KuromojiAnalyzerProvider; import org.opensearch.index.analysis.KuromojiBaseFormFilterFactory; +import org.opensearch.index.analysis.KuromojiCompletionAnalyzerProvider; +import org.opensearch.index.analysis.KuromojiCompletionFilterFactory; import org.opensearch.index.analysis.KuromojiIterationMarkCharFilterFactory; import org.opensearch.index.analysis.KuromojiKatakanaStemmerFactory; import org.opensearch.index.analysis.KuromojiNumberFilterFactory; @@ -70,6 +72,7 @@ public Map> getTokenFilters() { extra.put("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new); extra.put("ja_stop", JapaneseStopTokenFilterFactory::new); extra.put("kuromoji_number", KuromojiNumberFilterFactory::new); + extra.put("kuromoji_completion", KuromojiCompletionFilterFactory::new); return extra; } @@ -80,6 +83,9 @@ public Map> getTokenizers() { @Override public Map>> getAnalyzers() { - return singletonMap("kuromoji", KuromojiAnalyzerProvider::new); + Map>> extra = new HashMap<>(); + extra.put("kuromoji", KuromojiAnalyzerProvider::new); + extra.put("kuromoji_completion", KuromojiCompletionAnalyzerProvider::new); + return extra; } } diff --git a/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/AnalysisKuromojiFactoryTests.java b/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/AnalysisKuromojiFactoryTests.java index a76406d4dc925..b6b953f9ba417 100644 --- a/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/AnalysisKuromojiFactoryTests.java +++ b/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/AnalysisKuromojiFactoryTests.java @@ -59,6 +59,7 @@ protected Map> getTokenFilters() { filters.put("japanesereadingform", KuromojiReadingFormFilterFactory.class); filters.put("japanesekatakanastem", KuromojiKatakanaStemmerFactory.class); filters.put("japanesenumber", KuromojiNumberFilterFactory.class); + filters.put("japanesecompletion", KuromojiCompletionFilterFactory.class); return filters; } diff --git a/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java index ffc2db6672899..ec18041f451fc 100644 --- a/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java +++ b/plugins/analysis-kuromoji/src/test/java/org/opensearch/index/analysis/KuromojiAnalysisTests.java @@ -36,6 +36,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ja.JapaneseAnalyzer; +import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer; import org.apache.lucene.analysis.ja.JapaneseTokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.opensearch.Version; @@ -85,6 +86,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException { filterFactory = analysis.tokenFilter.get("kuromoji_number"); assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class)); + filterFactory = analysis.tokenFilter.get("kuromoji_completion"); + assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class)); + + filterFactory = analysis.tokenFilter.get("kuromoji_completion_index"); + assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class)); + + filterFactory = analysis.tokenFilter.get("kuromoji_completion_query"); + assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class)); + IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji"); assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class)); @@ -93,6 +103,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException { assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class)); assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class)); + analyzer = indexAnalyzers.get("kuromoji_completion"); + assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class)); + + analyzer = indexAnalyzers.get("kuromoji_completion_index"); + assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class)); + + analyzer = indexAnalyzers.get("kuromoji_completion_query"); + assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class)); + CharFilterFactory charFilterFactory = analysis.charFilter.get("kuromoji_iteration_mark"); assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class)); @@ -199,6 +218,32 @@ public void testKatakanaStemFilter() throws IOException { assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana); } + public void testJapaneseCompletionFilter() throws IOException { + TestAnalysis analysis = createTestAnalysis(); + + String source = "寿司がおいしいね"; + String[] expected_tokens = new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" }; + + // mode = INDEX(default) + Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); + tokenizer.setReader(new StringReader(source)); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_completion"); + assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens); + + // mode = INDEX + tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); + tokenizer.setReader(new StringReader(source)); + tokenFilter = analysis.tokenFilter.get("kuromoji_completion_index"); + assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens); + + // mode = QUERY + tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH); + tokenizer.setReader(new StringReader(source)); + tokenFilter = analysis.tokenFilter.get("kuromoji_completion_query"); + expected_tokens = new String[] { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" }; + assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens); + } + public void testIterationMarkCharFilter() throws IOException { TestAnalysis analysis = createTestAnalysis(); // test only kanji @@ -414,6 +459,30 @@ public void testDiscardCompoundToken() throws Exception { assertSimpleTSOutput(tokenizer, expected); } + public void testJapaneseCompletionAnalyzer() throws Exception { + TestAnalysis analysis = createTestAnalysis(); + IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers; + NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji_completion"); + + // mode = INDEX(default) + try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) { + assertTokenStreamContents(stream, new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" }); + } + + // mode = INDEX + analyzer = indexAnalyzers.get("kuromoji_completion_index"); + try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) { + assertTokenStreamContents(stream, new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" }); + } + + // mode = QUERY + analyzer = indexAnalyzers.get("kuromoji_completion_query"); + try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) { + assertTokenStreamContents(stream, new String[] { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" }); + } + + } + private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException { InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt"); Path home = createTempDir(); diff --git a/plugins/analysis-kuromoji/src/test/resources/org/opensearch/index/analysis/kuromoji_analysis.json b/plugins/analysis-kuromoji/src/test/resources/org/opensearch/index/analysis/kuromoji_analysis.json index a55947f53e34b..3e952b51e4ece 100644 --- a/plugins/analysis-kuromoji/src/test/resources/org/opensearch/index/analysis/kuromoji_analysis.json +++ b/plugins/analysis-kuromoji/src/test/resources/org/opensearch/index/analysis/kuromoji_analysis.json @@ -17,6 +17,14 @@ "ja_stop" : { "type": "ja_stop", "stopwords": ["_japanese_", "スピード"] + }, + "kuromoji_completion_index" : { + "type" : "kuromoji_completion", + "mode" : "index" + }, + "kuromoji_completion_query" : { + "type" : "kuromoji_completion", + "mode" : "query" } }, @@ -70,6 +78,14 @@ "my_analyzer" : { "type" : "custom", "tokenizer" : "kuromoji_tokenizer" + }, + "kuromoji_completion_index" : { + "type" : "kuromoji_completion", + "mode" : "index" + }, + "kuromoji_completion_query" : { + "type" : "kuromoji_completion", + "mode" : "query" } } diff --git a/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml b/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml index 1cca2b728e0aa..3363591ded5ca 100644 --- a/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml +++ b/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml @@ -16,6 +16,24 @@ - match: { tokens.5.token: 飲む } - match: { tokens.6.token: 行く } --- +"Completion Analyzer": + - do: + indices.analyze: + body: + text: 寿司がおいしいね + analyzer: kuromoji_completion + - length: { tokens: 10 } + - match: { tokens.0.token: "寿司" } + - match: { tokens.1.token: "susi" } + - match: { tokens.2.token: "sushi" } + - match: { tokens.3.token: "が" } + - match: { tokens.4.token: "ga" } + - match: { tokens.5.token: "おいしい" } + - match: { tokens.6.token: "oisii" } + - match: { tokens.7.token: "oishii" } + - match: { tokens.8.token: "ね" } + - match: { tokens.9.token: "ne" } +--- "Tokenizer": - do: indices.analyze: @@ -57,3 +75,15 @@ filter: [kuromoji_stemmer] - length: { tokens: 1 } - match: { tokens.0.token: サーバ } +--- +"Completion filter": + - do: + indices.analyze: + body: + text: 寿司 + tokenizer: kuromoji_tokenizer + filter: [kuromoji_completion] + - length: { tokens: 3 } + - match: { tokens.0.token: "寿司" } + - match: { tokens.1.token: "susi" } + - match: { tokens.2.token: "sushi" }