Skip to content

Commit

Permalink
fix escaped wildcard query on wildcard field (#15737)
Browse files Browse the repository at this point in the history
* fix escaped wildcard query on wildcard field

Signed-off-by: gesong.samuel <[email protected]>

* fix format error

Signed-off-by: gesong.samuel <[email protected]>

* add change log

Signed-off-by: gesong.samuel <[email protected]>

---------

Signed-off-by: gesong.samuel <[email protected]>
Signed-off-by: Michael Froh <[email protected]>
Co-authored-by: gesong.samuel <[email protected]>
Co-authored-by: Michael Froh <[email protected]>
  • Loading branch information
3 people authored Sep 10, 2024
1 parent f8515c7 commit 320611a
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 13 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Removed

### Fixed
- Fix wildcard query containing escaped character ([#15737](https://github.com/opensearch-project/OpenSearch/pull/15737))

### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import org.apache.lucene.util.automaton.RegExp;
import org.opensearch.common.lucene.BytesRefs;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.common.regex.Regex;
import org.opensearch.common.unit.Fuzziness;
import org.opensearch.core.xcontent.XContentParser;
import org.opensearch.index.analysis.IndexAnalyzers;
Expand Down Expand Up @@ -430,22 +429,27 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
finalValue = value;
}
Predicate<String> matchPredicate;
if (value.contains("?")) {
Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), finalValue));
CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton);
Automaton automaton = WildcardQuery.toAutomaton(new Term(name(), finalValue));
CompiledAutomaton compiledAutomaton = new CompiledAutomaton(automaton);
if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.SINGLE) {
// When the type is SINGLE, compiledAutomaton.runAutomaton is null.
matchPredicate = s -> {
if (caseInsensitive) {
s = s.toLowerCase(Locale.ROOT);
}
BytesRef valueBytes = BytesRefs.toBytesRef(s);
return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length);
return s.equals(finalValue);
};
} else if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.ALL) {
return existsQuery(context);
} else if (compiledAutomaton.type == CompiledAutomaton.AUTOMATON_TYPE.NONE) {
return new MatchNoDocsQuery("Wildcard expression matches nothing");
} else {
matchPredicate = s -> {
if (caseInsensitive) {
s = s.toLowerCase(Locale.ROOT);
}
return Regex.simpleMatch(finalValue, s);
BytesRef valueBytes = BytesRefs.toBytesRef(s);
return compiledAutomaton.runAutomaton.run(valueBytes.bytes, valueBytes.offset, valueBytes.length);
};
}

Expand All @@ -468,22 +472,30 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
// Package-private for testing
static Set<String> getRequiredNGrams(String value) {
Set<String> terms = new HashSet<>();

if (value.isEmpty()) {
return terms;
}

int pos = 0;
String rawSequence = null;
String currentSequence = null;
if (!value.startsWith("?") && !value.startsWith("*")) {
// Can add prefix term
currentSequence = getNonWildcardSequence(value, 0);
rawSequence = getNonWildcardSequence(value, 0);
currentSequence = performEscape(rawSequence);
if (currentSequence.length() == 1) {
terms.add(new String(new char[] { 0, currentSequence.charAt(0) }));
} else {
terms.add(new String(new char[] { 0, currentSequence.charAt(0), currentSequence.charAt(1) }));
}
} else {
pos = findNonWildcardSequence(value, pos);
currentSequence = getNonWildcardSequence(value, pos);
rawSequence = getNonWildcardSequence(value, pos);
}
while (pos < value.length()) {
boolean isEndOfValue = pos + currentSequence.length() == value.length();
boolean isEndOfValue = pos + rawSequence.length() == value.length();
currentSequence = performEscape(rawSequence);
if (!currentSequence.isEmpty() && currentSequence.length() < 3 && !isEndOfValue && pos > 0) {
// If this is a prefix or suffix of length < 3, then we already have a longer token including the anchor.
terms.add(currentSequence);
Expand All @@ -502,16 +514,16 @@ static Set<String> getRequiredNGrams(String value) {
terms.add(new String(new char[] { a, b, 0 }));
}
}
pos = findNonWildcardSequence(value, pos + currentSequence.length());
currentSequence = getNonWildcardSequence(value, pos);
pos = findNonWildcardSequence(value, pos + rawSequence.length());
rawSequence = getNonWildcardSequence(value, pos);
}
return terms;
}

private static String getNonWildcardSequence(String value, int startFrom) {
for (int i = startFrom; i < value.length(); i++) {
char c = value.charAt(i);
if (c == '?' || c == '*') {
if ((c == '?' || c == '*') && (i == 0 || value.charAt(i - 1) != '\\')) {
return value.substring(startFrom, i);
}
}
Expand All @@ -529,6 +541,22 @@ private static int findNonWildcardSequence(String value, int startFrom) {
return value.length();
}

/**
 * Resolves backslash escapes in a fragment of a wildcard pattern and returns the literal text the
 * fragment stands for.
 *
 * <p>A backslash makes the character that follows it literal and is itself dropped, so {@code \*}
 * becomes {@code *}, {@code \?} becomes {@code ?} and {@code \\} becomes a single backslash. A
 * backslash that is the last character of the fragment has nothing to escape and is kept unchanged.
 * This is intended to match how Lucene's {@code WildcardQuery} interprets its escape character, so
 * that the literal text derived here agrees with the automaton used for final matching.
 *
 * <p>The result may legitimately contain a backslash followed by {@code *} or {@code ?} (for example
 * the input {@code \\\*} denotes a literal backslash followed by a literal star), so no assertion is
 * made about the absence of such sequences in the output.
 *
 * @param str the pattern fragment to unescape; must not be {@code null}
 * @return the fragment with every escape sequence replaced by the character it denotes
 */
private static String performEscape(String str) {
    StringBuilder literal = new StringBuilder(str.length());
    for (int i = 0; i < str.length(); i++) {
        char c = str.charAt(i);
        if (c == '\\' && i + 1 < str.length()) {
            // Drop the escape character and take the following character verbatim.
            c = str.charAt(++i);
        }
        literal.append(c);
    }
    return literal.toString();
}

@Override
public Query regexpQuery(
String value,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,32 @@ public void testWildcardQuery() {
);
}

/**
 * Checks the queries built for patterns whose {@code *} characters are escaped with a backslash.
 * The patterns {@code \**\*} and {@code \*} are expected to require both the prefix-anchored and
 * the suffix-anchored {@code *} term, while {@code \**} is expected to require only the
 * prefix-anchored one.
 */
public void testEscapedWildcardQuery() {
    MappedFieldType fieldType = new WildcardFieldMapper.WildcardFieldType("field");

    Set<String> requiredTerms = new HashSet<>();
    requiredTerms.add(prefixAnchored("*"));
    requiredTerms.add(suffixAnchored("*"));

    BooleanQuery.Builder bothAnchors = new BooleanQuery.Builder();
    requiredTerms.forEach(t -> bothAnchors.add(new TermQuery(new Term("field", t)), BooleanClause.Occur.FILTER));

    // Both patterns share the same expected first-phase filter; only the pattern text differs.
    for (String pattern : new String[] { "\\**\\*", "\\*" }) {
        assertEquals(
            new WildcardFieldMapper.WildcardMatchingQuery("field", bothAnchors.build(), pattern),
            fieldType.wildcardQuery(pattern, null, null)
        );
    }

    // With a trailing unescaped wildcard, only the prefix-anchored term is expected.
    requiredTerms.remove(suffixAnchored("*"));
    BooleanQuery.Builder prefixOnly = new BooleanQuery.Builder();
    requiredTerms.forEach(t -> prefixOnly.add(new TermQuery(new Term("field", t)), BooleanClause.Occur.FILTER));

    assertEquals(
        new WildcardFieldMapper.WildcardMatchingQuery("field", prefixOnly.build(), "\\**"),
        fieldType.wildcardQuery("\\**", null, null)
    );
}

public void testMultipleWildcardsInQuery() {
final String pattern = "a?cd*efg?h";
MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field");
Expand Down

0 comments on commit 320611a

Please sign in to comment.