From 55526ec46575f0e0245887dfd338b97dfb7451c7 Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Tue, 22 Aug 2023 21:57:12 +1000 Subject: [PATCH] Expose DelimitedTermFrequencyTokenFilter Relates: #9413 This commit exposes Lucene's delimited term frequency token filter to be able to provide term frequencies along with terms. Signed-off-by: Russ Cam --- .../common/CommonAnalysisModulePlugin.java | 12 +++++ ...imitedTermFrequencyTokenFilterFactory.java | 46 +++++++++++++++++++ .../common/CommonAnalysisFactoryTests.java | 1 + .../test/analysis-common/40_token_filters.yml | 37 +++++++++++++++ .../analysis/AnalysisFactoryTestCase.java | 4 +- 5 files changed, 97 insertions(+), 3 deletions(-) create mode 100644 modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java index 46220f5369d16..8e68874a2c049 100644 --- a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java @@ -89,6 +89,7 @@ import org.apache.lucene.analysis.lt.LithuanianAnalyzer; import org.apache.lucene.analysis.lv.LatvianAnalyzer; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; import org.apache.lucene.analysis.miscellaneous.LengthFilter; @@ -265,6 +266,7 @@ public Map> getTokenFilters() { ); filters.put("decimal_digit", DecimalDigitFilterFactory::new); filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new); + filters.put("delimited_termfreq", DelimitedTermFrequencyTokenFilterFactory::new); filters.put("dictionary_decompounder", requiresAnalysisSettings(DictionaryCompoundWordTokenFilterFactory::new)); filters.put("dutch_stem", DutchStemTokenFilterFactory::new); filters.put("edge_ngram", EdgeNGramTokenFilterFactory::new); @@ -500,6 +502,16 @@ public List getPreConfiguredTokenFilters() { ) ) ); + filters.add( + PreConfiguredTokenFilter.singleton( + "delimited_termfreq", + false, + input -> new DelimitedTermFrequencyTokenFilter( + input, + DelimitedTermFrequencyTokenFilterFactory.DEFAULT_DELIMITER + ) + ) + ); filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input -> new EdgeNGramTokenFilter(input, 1))); filters.add(PreConfiguredTokenFilter.openSearchVersion("edgeNGram", false, false, (reader, version) -> { diff --git a/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java new file mode 100644 index 0000000000000..a35515c9694d3 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/opensearch/analysis/common/DelimitedTermFrequencyTokenFilterFactory.java @@ -0,0 +1,46 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.DelimitedTermFrequencyTokenFilter; +import org.opensearch.common.settings.Settings; +import org.opensearch.env.Environment; +import org.opensearch.index.IndexSettings; +import org.opensearch.index.analysis.AbstractTokenFilterFactory; + +public class DelimitedTermFrequencyTokenFilterFactory extends AbstractTokenFilterFactory { + public static final char DEFAULT_DELIMITER = '|'; + private static final String DELIMITER = "delimiter"; + private final char delimiter; + + DelimitedTermFrequencyTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + delimiter = parseDelimiter(settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new DelimitedTermFrequencyTokenFilter(tokenStream, delimiter); + } + + private static char parseDelimiter(Settings settings) throws IllegalArgumentException { + String delimiter = settings.get(DELIMITER); + if (delimiter == null) { + return DEFAULT_DELIMITER; + } else if (delimiter.length() == 1) { + return delimiter.charAt(0); + } + + throw new IllegalArgumentException( + "Setting [" + DELIMITER + "] must be a single, non-null character. [" + delimiter + "] was provided." + ); + } +} + diff --git a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java index 1c4db089565ff..b9c47b9a24acc 100644 --- a/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/opensearch/analysis/common/CommonAnalysisFactoryTests.java @@ -145,6 +145,7 @@ protected Map> getTokenFilters() { filters.put("cjkwidth", CJKWidthFilterFactory.class); filters.put("cjkbigram", CJKBigramFilterFactory.class); filters.put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class); + filters.put("delimitedtermfreq", DelimitedTermFrequencyTokenFilterFactory.class); filters.put("keepword", KeepWordFilterFactory.class); filters.put("type", KeepTypesFilterFactory.class); filters.put("classic", ClassicFilterFactory.class); diff --git a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 40c82ff185661..836e9b550f52a 100644 --- a/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -1198,6 +1198,43 @@ - match: { tokens.0.token: foo } --- +"delimited_termfreq": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_delimited_termfreq: + type: delimited_termfreq + delimiter: ^ + - do: + indices.analyze: + index: test + body: + text: foo^3 + tokenizer: keyword + filter: [my_delimited_termfreq] + attributes: termFrequency + explain: true + - length: { detail.tokenfilters: 1 } + - match: { detail.tokenfilters.0.tokens.0.token: foo } + - match: { detail.tokenfilters.0.tokens.0.termFrequency: 3 } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: foo|100 + tokenizer: keyword + filter: [delimited_termfreq] + attributes: termFrequency + explain: true + - length: { detail.tokenfilters: 1 } + - match: { detail.tokenfilters.0.tokens.0.token: foo } + - match: { detail.tokenfilters.0.tokens.0.termFrequency: 100 } +--- "keep_filter": - do: indices.create: diff --git a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java index b93cb64e32cfe..c412ae8317f24 100644 --- a/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/opensearch/indices/analysis/AnalysisFactoryTestCase.java @@ -98,6 +98,7 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("czechstem", MovedToAnalysisCommon.class) .put("decimaldigit", MovedToAnalysisCommon.class) .put("delimitedpayload", MovedToAnalysisCommon.class) + .put("delimitedtermfrequency", MovedToAnalysisCommon.class) .put("dictionarycompoundword", MovedToAnalysisCommon.class) .put("edgengram", MovedToAnalysisCommon.class) .put("elision", MovedToAnalysisCommon.class) @@ -201,9 +202,6 @@ public abstract class AnalysisFactoryTestCase extends OpenSearchTestCase { .put("daterecognizer", Void.class) // for token filters that generate bad offsets, which are now rejected since Lucene 7 .put("fixbrokenoffsets", Void.class) - // should we expose it, or maybe think about higher level integration of the - // fake term frequency feature (LUCENE-7854) - .put("delimitedtermfrequency", Void.class) // LUCENE-8273: ProtectedTermFilterFactory allows analysis chains to skip // particular token filters based on the attributes of the current token. .put("protectedterm", Void.class)