Skip to content

Commit

Permalink
Add kuromoji_completion analyzer and filter (#4835)
Browse files Browse the repository at this point in the history
Signed-off-by: Tatsuya Kawakami <[email protected]>
  • Loading branch information
hogesako committed Feb 14, 2024
1 parent 8489294 commit b48d145
Show file tree
Hide file tree
Showing 8 changed files with 197 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add community_id ingest processor ([#12121](https://github.com/opensearch-project/OpenSearch/pull/12121))
- Introduce query level setting `index.query.max_nested_depth` limiting nested queries ([#3268](https://github.com/opensearch-project/OpenSearch/issues/3268))
- Add toString methods to MultiSearchRequest, MultiGetRequest and CreateIndexRequest ([#12163](https://github.com/opensearch-project/OpenSearch/pull/12163))
- Add kuromoji_completion analyzer and filter ([#4835](https://github.com/opensearch-project/OpenSearch/issues/4835))

### Dependencies
- Bump `peter-evans/find-comment` from 2 to 3 ([#12288](https://github.com/opensearch-project/OpenSearch/pull/12288))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.analysis;

import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;

/**
 * Analyzer provider backed by Lucene's {@link JapaneseCompletionAnalyzer}.
 * <p>
 * The analyzer is built once, in the constructor, from the completion mode resolved by
 * {@link KuromojiCompletionFilterFactory#getMode(Settings)} and the user dictionary resolved by
 * {@code KuromojiTokenizerFactory.getUserDictionary(env, settings)}. Every call to {@link #get()}
 * returns that same instance.
 */
public class KuromojiCompletionAnalyzerProvider extends AbstractIndexAnalyzerProvider<JapaneseCompletionAnalyzer> {

    private final JapaneseCompletionAnalyzer analyzer;

    /**
     * Builds the completion analyzer for the given analyzer settings.
     *
     * @param indexSettings settings of the index this analyzer belongs to
     * @param env           environment handed to the user dictionary lookup
     * @param name          name under which the analyzer is configured
     * @param settings      analyzer settings; read for the completion mode and the user dictionary
     */
    public KuromojiCompletionAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        // Resolve the mode before the dictionary, in the same order as before.
        final JapaneseCompletionFilter.Mode completionMode = KuromojiCompletionFilterFactory.getMode(settings);
        final UserDictionary dictionary = KuromojiTokenizerFactory.getUserDictionary(env, settings);
        this.analyzer = new JapaneseCompletionAnalyzer(dictionary, completionMode);
    }

    /**
     * @return the analyzer instance created in the constructor
     */
    @Override
    public JapaneseCompletionAnalyzer get() {
        return analyzer;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;

/**
 * Token filter factory that wraps a token stream in Lucene's {@link JapaneseCompletionFilter}.
 * <p>
 * The filter mode is read from the {@code mode} setting, see {@link #getMode(Settings)}.
 */
public class KuromojiCompletionFilterFactory extends AbstractTokenFilterFactory {
    private final Mode mode;

    /**
     * Creates the factory and resolves the completion mode from {@code settings}.
     *
     * @param indexSettings settings of the index this filter belongs to
     * @param environment   node environment (not used by this factory)
     * @param name          name under which the filter is configured
     * @param settings      filter settings; only {@code mode} is read
     * @throws IllegalArgumentException if {@code mode} is set to an unsupported value
     */
    public KuromojiCompletionFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.mode = getMode(settings);
    }

    /**
     * Resolves the completion mode from the {@code mode} setting.
     * <p>
     * A missing setting or {@code "index"} yields {@link Mode#INDEX}; {@code "query"} yields
     * {@link Mode#QUERY}. Matching is case-insensitive.
     *
     * @param settings settings to read the {@code mode} key from
     * @return the resolved mode, never {@code null}
     * @throws IllegalArgumentException if {@code mode} is neither {@code index} nor {@code query}
     */
    public static Mode getMode(Settings settings) {
        final String modeSetting = settings.get("mode", null);
        if (modeSetting == null || "index".equalsIgnoreCase(modeSetting)) {
            return Mode.INDEX;
        }
        if ("query".equalsIgnoreCase(modeSetting)) {
            return Mode.QUERY;
        }
        // Reject unknown values instead of silently treating them as QUERY, so a typo such as
        // "indx" cannot change how a field is analyzed without anyone noticing.
        throw new IllegalArgumentException(
            "Unknown mode [" + modeSetting + "] for kuromoji_completion, expected one of [index, query]"
        );
    }

    /**
     * Wraps {@code tokenStream} in a {@link JapaneseCompletionFilter} using the configured mode.
     *
     * @param tokenStream upstream token stream
     * @return the filtered token stream
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new JapaneseCompletionFilter(tokenStream, mode);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import org.opensearch.index.analysis.JapaneseStopTokenFilterFactory;
import org.opensearch.index.analysis.KuromojiAnalyzerProvider;
import org.opensearch.index.analysis.KuromojiBaseFormFilterFactory;
import org.opensearch.index.analysis.KuromojiCompletionAnalyzerProvider;
import org.opensearch.index.analysis.KuromojiCompletionFilterFactory;
import org.opensearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
import org.opensearch.index.analysis.KuromojiKatakanaStemmerFactory;
import org.opensearch.index.analysis.KuromojiNumberFilterFactory;
Expand Down Expand Up @@ -70,6 +72,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
extra.put("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
extra.put("ja_stop", JapaneseStopTokenFilterFactory::new);
extra.put("kuromoji_number", KuromojiNumberFilterFactory::new);
extra.put("kuromoji_completion", KuromojiCompletionFilterFactory::new);
return extra;
}

Expand All @@ -80,6 +83,9 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {

/**
 * Returns the analyzers contributed by this plugin, keyed by analyzer type name:
 * {@code kuromoji} and {@code kuromoji_completion}.
 */
@Override
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
    // The former single-entry `return singletonMap("kuromoji", ...)` is gone: left in front of
    // this code it made the kuromoji_completion registration unreachable.
    Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
    extra.put("kuromoji", KuromojiAnalyzerProvider::new);
    extra.put("kuromoji_completion", KuromojiCompletionAnalyzerProvider::new);
    return extra;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("japanesereadingform", KuromojiReadingFormFilterFactory.class);
filters.put("japanesekatakanastem", KuromojiKatakanaStemmerFactory.class);
filters.put("japanesenumber", KuromojiNumberFilterFactory.class);
filters.put("japanesecompletion", KuromojiCompletionFilterFactory.class);
return filters;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.opensearch.Version;
Expand Down Expand Up @@ -85,6 +86,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
filterFactory = analysis.tokenFilter.get("kuromoji_number");
assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion_index");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion_query");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
Expand All @@ -93,6 +103,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));

analyzer = indexAnalyzers.get("kuromoji_completion");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

analyzer = indexAnalyzers.get("kuromoji_completion_index");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

analyzer = indexAnalyzers.get("kuromoji_completion_query");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

CharFilterFactory charFilterFactory = analysis.charFilter.get("kuromoji_iteration_mark");
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));

Expand Down Expand Up @@ -199,6 +218,32 @@ public void testKatakanaStemFilter() throws IOException {
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}

/**
 * Runs the same sentence through the {@code kuromoji_completion} filter (no explicit mode) and
 * through the {@code kuromoji_completion_index} and {@code kuromoji_completion_query} filters
 * of the test analysis, and compares the emitted tokens with the expected ones.
 */
public void testJapaneseCompletionFilter() throws IOException {
    final TestAnalysis analysis = createTestAnalysis();

    final String text = "寿司がおいしいね";
    final String[] indexModeTokens = { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" };
    final String[] queryModeTokens = { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" };

    // No mode configured: the output is expected to match INDEX mode.
    Tokenizer input = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    input.setReader(new StringReader(text));
    TokenFilterFactory completionFilter = analysis.tokenFilter.get("kuromoji_completion");
    assertSimpleTSOutput(completionFilter.create(input), indexModeTokens);

    // Explicit INDEX mode.
    input = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    input.setReader(new StringReader(text));
    completionFilter = analysis.tokenFilter.get("kuromoji_completion_index");
    assertSimpleTSOutput(completionFilter.create(input), indexModeTokens);

    // Explicit QUERY mode.
    input = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    input.setReader(new StringReader(text));
    completionFilter = analysis.tokenFilter.get("kuromoji_completion_query");
    assertSimpleTSOutput(completionFilter.create(input), queryModeTokens);
}

public void testIterationMarkCharFilter() throws IOException {
TestAnalysis analysis = createTestAnalysis();
// test only kanji
Expand Down Expand Up @@ -414,6 +459,30 @@ public void testDiscardCompoundToken() throws Exception {
assertSimpleTSOutput(tokenizer, expected);
}

/**
 * Analyzes the same sentence with the {@code kuromoji_completion} analyzer (no explicit mode)
 * and with the {@code kuromoji_completion_index} and {@code kuromoji_completion_query}
 * analyzers of the test analysis, and checks the produced tokens.
 */
public void testJapaneseCompletionAnalyzer() throws Exception {
    final IndexAnalyzers analyzers = createTestAnalysis().indexAnalyzers;

    final String text = "寿司がおいしいね";
    final String[] indexModeTokens = { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" };
    final String[] queryModeTokens = { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" };

    // No mode configured: the output is expected to match INDEX mode.
    NamedAnalyzer completionAnalyzer = analyzers.get("kuromoji_completion");
    try (TokenStream tokens = completionAnalyzer.tokenStream("", text)) {
        assertTokenStreamContents(tokens, indexModeTokens);
    }

    // Explicit INDEX mode.
    completionAnalyzer = analyzers.get("kuromoji_completion_index");
    try (TokenStream tokens = completionAnalyzer.tokenStream("", text)) {
        assertTokenStreamContents(tokens, indexModeTokens);
    }

    // Explicit QUERY mode.
    completionAnalyzer = analyzers.get("kuromoji_completion_query");
    try (TokenStream tokens = completionAnalyzer.tokenStream("", text)) {
        assertTokenStreamContents(tokens, queryModeTokens);
    }
}

private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
Path home = createTempDir();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@
"ja_stop" : {
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
},
"kuromoji_completion_index" : {
"type" : "kuromoji_completion",
"mode" : "index"
},
"kuromoji_completion_query" : {
"type" : "kuromoji_completion",
"mode" : "query"
}
},

Expand Down Expand Up @@ -70,6 +78,14 @@
"my_analyzer" : {
"type" : "custom",
"tokenizer" : "kuromoji_tokenizer"
},
"kuromoji_completion_index" : {
"type" : "kuromoji_completion",
"mode" : "index"
},
"kuromoji_completion_query" : {
"type" : "kuromoji_completion",
"mode" : "query"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,24 @@
- match: { tokens.5.token: 飲む }
- match: { tokens.6.token: 行く }
---
"Completion Analyzer":
- do:
indices.analyze:
body:
text: 寿司がおいしいね
analyzer: kuromoji_completion
- length: { tokens: 10 }
- match: { tokens.0.token: "寿司" }
- match: { tokens.1.token: "susi" }
- match: { tokens.2.token: "sushi" }
- match: { tokens.3.token: "が" }
- match: { tokens.4.token: "ga" }
- match: { tokens.5.token: "おいしい" }
- match: { tokens.6.token: "oisii" }
- match: { tokens.7.token: "oishii" }
- match: { tokens.8.token: "ね" }
- match: { tokens.9.token: "ne" }
---
"Tokenizer":
- do:
indices.analyze:
Expand Down Expand Up @@ -57,3 +75,15 @@
filter: [kuromoji_stemmer]
- length: { tokens: 1 }
- match: { tokens.0.token: サーバ }
---
"Completion filter":
- do:
indices.analyze:
body:
text: 寿司
tokenizer: kuromoji_tokenizer
filter: [kuromoji_completion]
- length: { tokens: 3 }
- match: { tokens.0.token: "寿司" }
- match: { tokens.1.token: "susi" }
- match: { tokens.2.token: "sushi" }

0 comments on commit b48d145

Please sign in to comment.