From 997a600336a813f39a3ba252b31ad78b9bff0211 Mon Sep 17 00:00:00 2001
From: Tomoko Uchida
Date: Mon, 31 Jan 2022 21:11:11 +0900
Subject: [PATCH] Expose Japanese completion filter to kuromoji analysis plugin (#81858)

This adds analysis factories of JapaneseCompletionFilter and JapaneseCompletionAnalyzer (https://issues.apache.org/jira/browse/LUCENE-10102) to the kuromoji plugin.
---
 .../kuromoji/AnalysisKuromojiPlugin.java      |  6 ++-
 .../KuromojiCompletionAnalyzerProvider.java   | 45 ++++++++++++++++++
 .../KuromojiCompletionFilterFactory.java      | 53 +++++++++++++++++++++
 .../kuromoji/KuromojiAnalysisTests.java       | 43 +++++++++++++++++
 .../analysis/kuromoji/kuromoji_analysis.json  | 16 +++++++
 .../test/analysis_kuromoji/10_basic.yml       | 38 +++++++++++++++
 6 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionAnalyzerProvider.java
 create mode 100644 plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionFilterFactory.java

diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
index 7d7e437e2631..6ded572faa1e 100644
--- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/AnalysisKuromojiPlugin.java
@@ -37,6 +37,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         extra.put("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
         extra.put("ja_stop", JapaneseStopTokenFilterFactory::new);
         extra.put("kuromoji_number", KuromojiNumberFilterFactory::new);
+        extra.put("kuromoji_completion", KuromojiCompletionFilterFactory::new);
         return extra;
     }
 
@@ -47,6 +48,9 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
 
     @Override
     public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
-        return singletonMap("kuromoji", KuromojiAnalyzerProvider::new);
+        Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
+        extra.put("kuromoji", KuromojiAnalyzerProvider::new);
+        extra.put("kuromoji_completion", KuromojiCompletionAnalyzerProvider::new);
+        return extra;
     }
 }
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionAnalyzerProvider.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionAnalyzerProvider.java
new file mode 100644
index 000000000000..82d3d8591fe3
--- /dev/null
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionAnalyzerProvider.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.plugin.analysis.kuromoji;
+
+import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
+import org.apache.lucene.analysis.ja.dict.UserDictionary;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
+
+/**
+ * Provides a {@link JapaneseCompletionAnalyzer} configured from index analysis settings.
+ *
+ * <p>The analyzer is built once in the constructor and the same instance is returned by every
+ * call to {@link #get()}.
+ */
+public class KuromojiCompletionAnalyzerProvider extends AbstractIndexAnalyzerProvider<JapaneseCompletionAnalyzer> {
+
+    private final JapaneseCompletionAnalyzer analyzer;
+
+    /**
+     * Builds the analyzer from the user dictionary resolved by
+     * {@link KuromojiTokenizerFactory#getUserDictionary} and the mode resolved by
+     * {@link KuromojiCompletionFilterFactory#getMode}.
+     */
+    public KuromojiCompletionAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        final UserDictionary userDictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
+        final Mode mode = KuromojiCompletionFilterFactory.getMode(settings);
+        analyzer = new JapaneseCompletionAnalyzer(userDictionary, mode);
+    }
+
+    @Override
+    public JapaneseCompletionAnalyzer get() {
+        return analyzer;
+    }
+}
diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionFilterFactory.java
new file mode 100644
index 000000000000..60e0676cb510
--- /dev/null
+++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiCompletionFilterFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.plugin.analysis.kuromoji;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
+import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+
+/**
+ * Token filter factory that wraps a token stream in a {@link JapaneseCompletionFilter}.
+ *
+ * <p>The filter mode is read once from the {@code mode} setting when the factory is constructed.
+ */
+public class KuromojiCompletionFilterFactory extends AbstractTokenFilterFactory {
+
+    private final Mode mode;
+
+    public KuromojiCompletionFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+        super(indexSettings, name, settings);
+        mode = getMode(settings);
+    }
+
+    /**
+     * Resolves the completion mode from the {@code mode} setting.
+     *
+     * <p>Matching is case-insensitive. Only {@code "query"} selects {@link Mode#QUERY}; {@code "index"},
+     * an absent setting, and any unrecognized value resolve to {@link Mode#INDEX}. Unrecognized values
+     * are not rejected.
+     *
+     * @param settings the analysis component settings to read {@code mode} from
+     * @return the resolved mode, never {@code null}
+     */
+    public static JapaneseCompletionFilter.Mode getMode(Settings settings) {
+        final String modeSetting = settings.get("mode", null);
+        return "query".equalsIgnoreCase(modeSetting) ? Mode.QUERY : Mode.INDEX;
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        return new JapaneseCompletionFilter(tokenStream, mode);
+    }
+
+}
diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiAnalysisTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiAnalysisTests.java
index 1bf12510f136..3f97530d6c43 100644
--- a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiAnalysisTests.java
+++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/plugin/analysis/kuromoji/KuromojiAnalysisTests.java
@@ -12,6 +12,7 @@
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
+import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.elasticsearch.Version;
@@ -67,10 +68,16 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
         filterFactory = analysis.tokenFilter.get("kuromoji_number");
         assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));
 
+        filterFactory = analysis.tokenFilter.get("kuromoji_completion");
+        assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));
+
         IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
         NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji");
         assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
 
+        analyzer = indexAnalyzers.get("kuromoji_completion");
+        assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));
+
         analyzer = indexAnalyzers.get("my_analyzer");
         assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
         assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));
@@ -225,6 +232,42 @@ public void testJapaneseStopFilterFactory() throws IOException {
         assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
     }
 
+    public void testCompletionFilterFactory() throws IOException {
+        // mode=index: each token is expected as its surface form followed by a romanized form
+        TestAnalysis analysis = createTestAnalysis();
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_completion_index");
+        assertThat(tokenFilter, instanceOf(KuromojiCompletionFilterFactory.class));
+        String source = "東京都";
+        String[] expected_tokens = new String[] { "東京", "toukyou", "都", "to" };
+        Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
+
+        // mode=query: input ending in a Latin letter is expected to stay one token, followed by its romanized form
+        tokenFilter = analysis.tokenFilter.get("kuromoji_completion_query");
+        assertThat(tokenFilter, instanceOf(KuromojiCompletionFilterFactory.class));
+        source = "サッk";
+        expected_tokens = new String[] { "サッk", "sakk" };
+        tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
+        tokenizer.setReader(new StringReader(source));
+        assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens);
+    }
+
+    public void testCompletionAnalyzer() throws IOException {
+        // mode=index: the input is expected as two surface tokens, each followed by its romanized form
+        TestAnalysis analysis = createTestAnalysis();
+        Analyzer analyzer = analysis.indexAnalyzers.get("completion_index_analyzer");
+        try (TokenStream stream = analyzer.tokenStream("", "ソースコード")) {
+            assertTokenStreamContents(stream, new String[] { "ソース", "soーsu", "コード", "koーdo" });
+        }
+
+        // mode=query: the whole input is expected as one surface token followed by its romanized form
+        analyzer = analysis.indexAnalyzers.get("completion_query_analyzer");
+        try (TokenStream stream = analyzer.tokenStream("", "ソースコード")) {
+            assertTokenStreamContents(stream, new String[] { "ソースコード", "soーsukoーdo" });
+        }
+    }
+
     private static TestAnalysis createTestAnalysis() throws IOException {
         InputStream empty_dict = KuromojiAnalysisTests.class.getResourceAsStream("empty_user_dict.txt");
         InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
diff --git a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/plugin/analysis/kuromoji/kuromoji_analysis.json b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/plugin/analysis/kuromoji/kuromoji_analysis.json
index a55947f53e34..c4a2706adfa1 100644
--- a/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/plugin/analysis/kuromoji/kuromoji_analysis.json
+++ b/plugins/analysis-kuromoji/src/test/resources/org/elasticsearch/plugin/analysis/kuromoji/kuromoji_analysis.json
@@ -17,6 +17,14 @@
                 "ja_stop" : {
                     "type": "ja_stop",
                     "stopwords": ["_japanese_", "スピード"]
+                },
+                "kuromoji_completion_index" : {
+                    "type": "kuromoji_completion",
+                    "mode": "index"
+                },
+                "kuromoji_completion_query" : {
+                    "type": "kuromoji_completion",
+                    "mode": "query"
                 }
             },
 
@@ -70,6 +78,14 @@
                 "my_analyzer" : {
                     "type" : "custom",
                     "tokenizer" : "kuromoji_tokenizer"
+                },
+                "completion_index_analyzer" : {
+                    "type" : "kuromoji_completion",
+                    "mode" : "index"
+                },
+                "completion_query_analyzer" : {
+                    "type" : "kuromoji_completion",
+                    "mode" : "query"
                 }
             }
 
diff --git a/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml b/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml
index 1cca2b728e0a..bacbd2139c63 100644
--- a/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml
+++ b/plugins/analysis-kuromoji/src/yamlRestTest/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml
@@ -57,3 +57,41 @@
           filter: [kuromoji_stemmer]
   - length: { tokens: 1 }
   - match: { tokens.0.token: サーバ }
+---
+"Completion analyzer":
+  - do:
+      indices.create:
+        index: kuromoji_completion_sample
+        body:
+          settings:
+            index:
+              analysis:
+                analyzer:
+                  completion_index:
+                    type: kuromoji_completion
+                    mode: index
+                  completion_query:
+                    type: kuromoji_completion
+                    mode: query
+
+  - do:
+      indices.analyze:
+        index: kuromoji_completion_sample
+        body:
+          text: ソースコード
+          analyzer: completion_index
+  - length: { tokens: 4 }
+  - match: { tokens.0.token: ソース }
+  - match: { tokens.1.token: soーsu }
+  - match: { tokens.2.token: コード }
+  - match: { tokens.3.token: koーdo }
+
+  - do:
+      indices.analyze:
+        index: kuromoji_completion_sample
+        body:
+          text: ソースコード
+          analyzer: completion_query
+  - length: { tokens: 2 }
+  - match: { tokens.0.token: ソースコード }
+  - match: { tokens.1.token: soーsukoーdo }