Skip to content

Commit

Permalink
Provide access to new settings for HyphenationCompoundWordTokenFilter (
Browse files Browse the repository at this point in the history
…elastic#115585)

Allow the new flags added in Lucene in the HyphenationCompoundWordTokenFilter

Adds access to the two new flags no_sub_matches and no_overlapping_matches.

Lucene issue: apache/lucene#9231
  • Loading branch information
pstrsr authored Nov 18, 2024
1 parent 9968928 commit c804953
Show file tree
Hide file tree
Showing 7 changed files with 1,295 additions and 11 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/115585.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 115459
summary: Adds access to flags no_sub_matches and no_overlapping_matches to hyphenation-decompounder-tokenfilter
area: Search
type: enhancement
issues:
- 97849
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,18 @@ output. Defaults to `5`.
(Optional, Boolean)
If `true`, only include the longest matching subword. Defaults to `false`.

`no_sub_matches`::
(Optional, Boolean)
If `true`, do not match sub tokens in tokens that are in the word list.
Defaults to `false`.

`no_overlapping_matches`::
(Optional, Boolean)
If `true`, do not allow overlapping tokens.
Defaults to `false`.

Typically users will only want to include one of the three flags as enabling `no_overlapping_matches` is the most restrictive and `no_sub_matches` is more restrictive than `only_longest_match`. When enabling a more restrictive option the state of the less restrictive does not have any effect.

[[analysis-hyp-decomp-tokenfilter-customize]]
==== Customize and add to an analyzer

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
*/
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

private final boolean noSubMatches;
private final boolean noOverlappingMatches;
private final HyphenationTree hyphenationTree;

HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
Expand All @@ -46,6 +48,9 @@ public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundW
} catch (Exception e) {
throw new IllegalArgumentException("Exception while reading hyphenation_patterns_path.", e);
}

noSubMatches = settings.getAsBoolean("no_sub_matches", false);
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
}

@Override
Expand All @@ -57,7 +62,9 @@ public TokenStream create(TokenStream tokenStream) {
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch
onlyLongestMatch,
noSubMatches,
noOverlappingMatches
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
import org.hamcrest.MatcherAssert;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
Expand All @@ -42,6 +45,7 @@
import static org.hamcrest.Matchers.instanceOf;

public class CompoundAnalysisTests extends ESTestCase {

public void testDefaultsCompoundAnalysis() throws Exception {
Settings settings = getJsonSettings();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
Expand All @@ -63,6 +67,44 @@ public void testDictionaryDecompounder() throws Exception {
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

public void testHyphenationDecompoundingAnalyzerOnlyLongestMatch() throws Exception {
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
for (Settings settings : settingsArr) {
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerOnlyLongestMatch", "kaffeemaschine fussballpumpe");
MatcherAssert.assertThat(
terms,
hasItems("kaffeemaschine", "kaffee", "fee", "maschine", "fussballpumpe", "fussball", "ballpumpe", "pumpe")
);
}
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

/**
* For example given a word list of: ["kaffee", "fee", "maschine"]
* no_sub_matches should prevent the token "fee" as a token in "kaffeemaschine".
*/
public void testHyphenationDecompoundingAnalyzerNoSubMatches() throws Exception {
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
for (Settings settings : settingsArr) {
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoSubMatches", "kaffeemaschine fussballpumpe");
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "ballpumpe"));
}
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

/**
* For example given a word list of: ["fuss", "fussball", "ballpumpe", "ball", "pumpe"]
* no_overlapping_matches should prevent the token "ballpumpe" as a token in "fussballpumpe.
*/
public void testHyphenationDecompoundingAnalyzerNoOverlappingMatches() throws Exception {
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
for (Settings settings : settingsArr) {
List<String> terms = analyze(settings, "hyphenationDecompoundingAnalyzerNoOverlappingMatches", "kaffeemaschine fussballpumpe");
MatcherAssert.assertThat(terms, hasItems("kaffeemaschine", "kaffee", "maschine", "fussballpumpe", "fussball", "pumpe"));
}
assertWarnings("Setting [version] on analysis component [custom7] has no effect and is deprecated");
}

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
Expand Down Expand Up @@ -92,20 +134,25 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
}

private Settings getJsonSettings() throws IOException {
String json = "/org/elasticsearch/analysis/common/test1.json";
return Settings.builder()
.loadFromStream(json, getClass().getResourceAsStream(json), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
return getSettings("/org/elasticsearch/analysis/common/test1.json");
}

private Settings getYamlSettings() throws IOException {
String yaml = "/org/elasticsearch/analysis/common/test1.yml";
return getSettings("/org/elasticsearch/analysis/common/test1.yml");
}

private Settings getSettings(String filePath) throws IOException {
String hypenationRulesFileName = "de_DR.xml";
InputStream hypenationRules = getClass().getResourceAsStream(hypenationRulesFileName);
Path home = createTempDir();
Path config = home.resolve("config");
Files.createDirectory(config);
Files.copy(hypenationRules, config.resolve(hypenationRulesFileName));

return Settings.builder()
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
.loadFromStream(filePath, getClass().getResourceAsStream(filePath), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, IndexVersion.current())
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
.build();
}
}
Loading

0 comments on commit c804953

Please sign in to comment.