Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a new configuration setting synonym_analyzer for synonym_graph and synonym. #16488

Merged
merged 5 commits into from
Nov 12, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Increase segrep pressure checkpoint default limit to 30 ([#16577](https://github.com/opensearch-project/OpenSearch/pull/16577/files))
- Add dynamic setting allowing size > 0 requests to be cached in the request cache ([#16483](https://github.com/opensearch-project/OpenSearch/pull/16483))
- Make IndexStoreListener a pluggable interface ([#16583](https://github.com/opensearch-project/OpenSearch/pull/16583))
- Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)).

### Dependencies
- Bump `com.azure:azure-storage-common` from 12.25.1 to 12.27.1 ([#16521](https://github.com/opensearch-project/OpenSearch/pull/16521))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.opensearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.opensearch.plugins.AnalysisPlugin;
Expand Down Expand Up @@ -247,7 +248,7 @@ public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAn
}

@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
filters.put("apostrophe", ApostropheFilterFactory::new);
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
Expand Down Expand Up @@ -332,14 +333,36 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
filters.put("stemmer_override", requiresAnalysisSettings(StemmerOverrideTokenFilterFactory::new));
filters.put("stemmer", StemmerTokenFilterFactory::new);
filters.put("synonym", requiresAnalysisSettings(SynonymTokenFilterFactory::new));
filters.put("synonym_graph", requiresAnalysisSettings(SynonymGraphTokenFilterFactory::new));
filters.put("trim", TrimTokenFilterFactory::new);
filters.put("truncate", requiresAnalysisSettings(TruncateTokenFilterFactory::new));
filters.put("unique", UniqueTokenFilterFactory::new);
filters.put("uppercase", UpperCaseTokenFilterFactory::new);
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
filters.put(
"synonym",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
filters.put(
"synonym_graph",
requiresAnalysisSettings(
(indexSettings, environment, name, settings) -> new SynonymGraphTokenFilterFactory(
indexSettings,
environment,
name,
settings,
analysisModule.getAnalysisRegistry()
)
)
);
return filters;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
Expand All @@ -49,8 +50,14 @@

public class SynonymGraphTokenFilterFactory extends SynonymTokenFilterFactory {

SynonymGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);
SynonymGraphTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, env, name, settings, analysisRegistry);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@
import org.opensearch.index.analysis.AbstractTokenFilterFactory;
import org.opensearch.index.analysis.Analysis;
import org.opensearch.index.analysis.AnalysisMode;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.CustomAnalyzer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
Expand All @@ -64,8 +66,16 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
protected final Settings settings;
protected final Environment environment;
protected final AnalysisMode analysisMode;

SynonymTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
private final String synonymAnalyzerName;
private final AnalysisRegistry analysisRegistry;

SynonymTokenFilterFactory(
IndexSettings indexSettings,
Environment env,
String name,
Settings settings,
AnalysisRegistry analysisRegistry
) {
super(indexSettings, name, settings);
this.settings = settings;

Expand All @@ -83,6 +93,8 @@ public class SynonymTokenFilterFactory extends AbstractTokenFilterFactory {
boolean updateable = settings.getAsBoolean("updateable", false);
this.analysisMode = updateable ? AnalysisMode.SEARCH_TIME : AnalysisMode.ALL;
this.environment = env;
this.synonymAnalyzerName = settings.get("synonym_analyzer", null);
this.analysisRegistry = analysisRegistry;
}

@Override
Expand Down Expand Up @@ -137,6 +149,17 @@ Analyzer buildSynonymAnalyzer(
List<TokenFilterFactory> tokenFilters,
Function<String, TokenFilterFactory> allFilters
) {
if (synonymAnalyzerName != null) {
Analyzer customSynonymAnalyzer;
try {
customSynonymAnalyzer = analysisRegistry.getAnalyzer(synonymAnalyzerName);
} catch (IOException e) {
throw new RuntimeException(e);
}
if (customSynonymAnalyzer != null) {
return customSynonymAnalyzer;
}
}
return new CustomAnalyzer(
tokenizer,
charFilters.toArray(new CharFilterFactory[0]),
Expand Down Expand Up @@ -177,5 +200,4 @@ Reader getRulesFromSettings(Environment env) {
}
return rulesReader;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,16 @@
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.te.TeluguNormalizationFilterFactory;
import org.apache.lucene.analysis.te.TeluguStemFilterFactory;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.indices.analysis.AnalysisFactoryTestCase;
import org.opensearch.indices.analysis.AnalysisModule;

import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.mockito.Mock;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;

Expand All @@ -53,6 +57,9 @@ public CommonAnalysisFactoryTests() {
super(new CommonAnalysisModulePlugin());
}

@Mock
private AnalysisModule analysisModule;

@Override
protected Map<String, Class<?>> getTokenizers() {
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
Expand Down Expand Up @@ -302,4 +309,19 @@ private void markedTestCase(String name, Map<String, Class<?>> map) {
unmarked
);
}

/**
* Tests the getTokenFilters(AnalysisModule) method to verify:
* 1. All token filters are properly loaded
* 2. Basic filters remain available
* 3. Synonym filters remain available when AnalysisModule is provided
*/
public void testGetTokenFiltersWithAnalysisModule() {
CommonAnalysisModulePlugin plugin = (CommonAnalysisModulePlugin) getAnalysisPlugin();
Map<String, AnalysisModule.AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(analysisModule);
assertNotNull("Token filters should not be null", filters);
assertTrue("Should contain basic filters", filters.containsKey("lowercase"));
assertTrue("Should contain synonym filter", filters.containsKey("synonym"));
assertTrue("Should contain synonym_graph filter", filters.containsKey("synonym_graph"));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,14 @@
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.env.TestEnvironment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalysisRegistry;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.PreConfiguredTokenFilter;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.test.IndexSettingsModule;
import org.opensearch.test.OpenSearchTestCase;
import org.opensearch.test.VersionUtils;
Expand All @@ -63,6 +66,7 @@
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.startsWith;
import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertTokenStreamContents;

public class SynonymsAnalysisTests extends OpenSearchTestCase {
private IndexAnalyzers indexAnalyzers;
Expand Down Expand Up @@ -255,14 +259,16 @@ public void testTokenFiltersBypassSynonymAnalysis() throws IOException {
.put("hyphenation_patterns_path", "foo")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

Environment environment = TestEnvironment.newEnvironment(settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();
String[] bypassingFactories = new String[] { "dictionary_decompounder" };

CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin();
for (String factory : bypassingFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);
Analyzer analyzer = stff.buildSynonymAnalyzer(tok, Collections.emptyList(), Collections.singletonList(tff), null);

try (TokenStream ts = analyzer.tokenStream("field", "text")) {
Expand Down Expand Up @@ -319,7 +325,11 @@ public void testDisallowedTokenFilters() throws IOException {
.putList("common_words", "a", "b")
.put("output_unigrams", "true")
.build();

Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
AnalysisModule analysisModule = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin()));
AnalysisRegistry analysisRegistry = analysisModule.getAnalysisRegistry();
CommonAnalysisModulePlugin plugin = new CommonAnalysisModulePlugin();

String[] disallowedFactories = new String[] {
Expand All @@ -333,9 +343,9 @@ public void testDisallowedTokenFilters() throws IOException {
"fingerprint" };

for (String factory : disallowedFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, null, "synonym", settings);
TokenFilterFactory tff = plugin.getTokenFilters(analysisModule).get(factory).get(idxSettings, environment, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, environment, "keyword", settings);
SynonymTokenFilterFactory stff = new SynonymTokenFilterFactory(idxSettings, environment, "synonym", settings, analysisRegistry);

IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
Expand All @@ -362,4 +372,75 @@ private void match(String analyzerName, String source, String target) throws IOE
MatcherAssert.assertThat(target, equalTo(sb.toString().trim()));
}

/**
* Tests the integration of word delimiter and synonym graph filters with synonym_analyzer based on issue #16263.
* This test verifies the correct handling of:
* 1. Hyphenated words with word delimiter (e.g., "note-book" → ["notebook", "note", "book"])
* 2. Multi-word synonyms (e.g., "mobile phone" → ["smartphone"])
* 3. Single word synonyms (e.g., "laptop" → ["notebook"])
*
* @see <a href="https://github.com/opensearch-project/OpenSearch/issues/16263">Issue #16263</a>
*/
public void testSynonymAnalyzerWithWordDelimiter() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.custom_word_delimiter.type", "word_delimiter_graph")
.put("index.analysis.filter.custom_word_delimiter.generate_word_parts", true)
.put("index.analysis.filter.custom_word_delimiter.catenate_all", true)
.put("index.analysis.filter.custom_word_delimiter.split_on_numerics", false)
.put("index.analysis.filter.custom_word_delimiter.split_on_case_change", false)
.put("index.analysis.filter.custom_pattern_replace_filter.type", "pattern_replace")
.put("index.analysis.filter.custom_pattern_replace_filter.pattern", "(-)")
.put("index.analysis.filter.custom_pattern_replace_filter.replacement", " ")
.put("index.analysis.filter.custom_pattern_replace_filter.all", true)
.put("index.analysis.filter.custom_synonym_graph_filter.type", "synonym_graph")
.putList(
"index.analysis.filter.custom_synonym_graph_filter.synonyms",
"laptop => notebook",
"smartphone, mobile phone, cell phone => smartphone",
"tv, television => television"
)
.put("index.analysis.filter.custom_synonym_graph_filter.synonym_analyzer", "standard")
.put("index.analysis.analyzer.text_en_index.type", "custom")
.put("index.analysis.analyzer.text_en_index.tokenizer", "whitespace")
.putList(
"index.analysis.analyzer.text_en_index.filter",
"lowercase",
"custom_word_delimiter",
"custom_synonym_graph_filter",
"custom_pattern_replace_filter",
"flatten_graph"
)
.build();
Environment environment = TestEnvironment.newEnvironment(settings);
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule module = new AnalysisModule(environment, Collections.singletonList(new CommonAnalysisModulePlugin()));
IndexAnalyzers analyzers = module.getAnalysisRegistry().build(indexSettings);
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "note-book")) {
assertTokenStreamContents(
ts,
new String[] { "notebook", "note", "book" },
new int[] { 0, 0, 5 },
new int[] { 9, 4, 9 },
new String[] { "word", "word", "word" },
new int[] { 1, 0, 1 },
new int[] { 2, 1, 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "mobile phone")) {
assertTokenStreamContents(
ts,
new String[] { "smartphone" },
new int[] { 0 },
new int[] { 12 },
new String[] { "SYNONYM" },
new int[] { 1 },
new int[] { 1 }
);
}
try (TokenStream ts = analyzers.get("text_en_index").tokenStream("", "laptop")) {
assertTokenStreamContents(ts, new String[] { "notebook" }, new int[] { 0 }, new int[] { 6 });
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,12 @@ public boolean requiresAnalysisSettings() {
)
);

tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
for (AnalysisPlugin plugin : plugins) {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = plugin.getTokenFilters(this);
for (Map.Entry<String, AnalysisProvider<TokenFilterFactory>> entry : filters.entrySet()) {
tokenFilters.register(entry.getKey(), entry.getValue());
}
}
return tokenFilters;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import org.opensearch.index.analysis.PreConfiguredTokenizer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;
import org.opensearch.indices.analysis.AnalysisModule;
import org.opensearch.indices.analysis.AnalysisModule.AnalysisProvider;

import java.io.IOException;
Expand Down Expand Up @@ -84,6 +85,14 @@ default Map<String, AnalysisProvider<CharFilterFactory>> getCharFilters() {
return emptyMap();
}

/**
* Override to add additional {@link TokenFilter}s that need access to the AnalysisModule.
* The default implementation for plugins that don't need AnalysisModule calls the existing getTokenFilters() method.
*/
default Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters(AnalysisModule analysisModule) {
return getTokenFilters();
}

/**
* Override to add additional {@link TokenFilter}s. See {@link #requiresAnalysisSettings(AnalysisProvider)}
* how to on get the configuration from the index.
Expand Down
Loading
Loading