Skip to content

Commit

Permalink
[Backport 2.x] Adds various Query overrides to Keyword Field (#10425) (
Browse files Browse the repository at this point in the history
…#11031)

* Adds various Query overrides to Keyword Field (#10425)

The keywordfield mapper provides access to various query types, e.g. the termsQuery, fuzzyQuery. These are inherited as is from the StringType. But we do not take into account the fact that keyword fields can have doc_values enabled. This PR adds the ability for various queries to first check if doc_values are enabled and if so out-source the work to lucene to decide if it's better to use index values or doc_values when running queries.

Signed-off-by: Harsha Vamsi Kalluri <[email protected]>
(cherry picked from commit 63aff16)

* Update CHANGELOG.md

---------

Signed-off-by: Harsha Vamsi Kalluri <[email protected]>
Co-authored-by: Michael Froh <[email protected]>
  • Loading branch information
harshavamsi and msfroh authored Nov 1, 2023
1 parent e297b21 commit 0d30590
Show file tree
Hide file tree
Showing 7 changed files with 446 additions and 18 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add telemetry tracer/metric enable flag and integ test. ([#10395](https://github.com/opensearch-project/OpenSearch/pull/10395))
- Performance improvement for Datetime field caching ([#4558](https://github.com/opensearch-project/OpenSearch/issues/4558))
- Add instrumentation for indexing in transport bulk action and transport shard bulk action. ([#10273](https://github.com/opensearch-project/OpenSearch/pull/10273))
- Performance improvement for MultiTerm Queries on Keyword fields ([#7057](https://github.com/opensearch-project/OpenSearch/issues/7057))
- [BUG] Disable sort optimization for HALF_FLOAT ([#10999](https://github.com/opensearch-project/OpenSearch/pull/10999))

### Deprecated
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
"search on keyword fields with doc_values enabled":
- do:
indices.create:
index: test
body:
mappings:
properties:
"some_keyword":
type: "keyword"
index: true
doc_values: true

- do:
bulk:
index: test
refresh: true
body:
- '{"index": {"_index": "test", "_id": "1" }}'
- '{ "some_keyword": "ingesting some random keyword data" }'
- '{ "index": { "_index": "test", "_id": "2" }}'
- '{ "some_keyword": "400" }'
- '{ "index": { "_index": "test", "_id": "3" } }'
- '{ "some_keyword": "5" }'

- do:
search:
index: test
body:
query:
prefix:
some_keyword: "ing"

- match: { hits.hits.0._source.some_keyword: "ingesting some random keyword data" }

- do:
search:
index: test
body:
query:
range: {
"some_keyword": {
"lt": 500
} }

- match: { hits.total.value: 2 }
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,24 @@
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Operations;
import org.opensearch.OpenSearchException;
import org.opensearch.common.Nullable;
import org.opensearch.common.lucene.BytesRefs;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.common.lucene.search.AutomatonQueries;
import org.opensearch.common.unit.Fuzziness;
import org.opensearch.core.xcontent.XContentParser;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.NamedAnalyzer;
Expand All @@ -62,6 +75,8 @@
import java.util.Objects;
import java.util.function.Supplier;

import static org.opensearch.search.SearchService.ALLOW_EXPENSIVE_QUERIES;

/**
* A field mapper for keywords. This mapper accepts strings and indexes them as-is.
*
Expand Down Expand Up @@ -317,7 +332,7 @@ public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, S
@Override
public ValueFetcher valueFetcher(QueryShardContext context, SearchLookup searchLookup, String format) {
if (format != null) {
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats.");
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't " + "support formats.");
}

return new SourceValueFetcher(name(), context, nullValue) {
Expand Down Expand Up @@ -372,17 +387,226 @@ protected BytesRef indexedValueForSearch(Object value) {
return getTextSearchInfo().getSearchAnalyzer().normalize(name(), value.toString());
}

@Override
public Query termsQuery(List<?> values, QueryShardContext context) {
failIfNotIndexedAndNoDocValues();
// has index and doc_values enabled
if (isSearchable() && hasDocValues()) {
BytesRef[] bytesRefs = new BytesRef[values.size()];
for (int i = 0; i < bytesRefs.length; i++) {
bytesRefs[i] = indexedValueForSearch(values.get(i));
}
Query indexQuery = new TermInSetQuery(name(), bytesRefs);
Query dvQuery = new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), bytesRefs);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}
// if we only have doc_values enabled, we construct a new query with doc_values re-written
if (hasDocValues()) {
BytesRef[] bytesRefs = new BytesRef[values.size()];
for (int i = 0; i < bytesRefs.length; i++) {
bytesRefs[i] = indexedValueForSearch(values.get(i));
}
return new TermInSetQuery(MultiTermQuery.DOC_VALUES_REWRITE, name(), bytesRefs);
}
// has index enabled, we're going to return the query as is
return super.termsQuery(values, context);
}

@Override
public Query prefixQuery(
String value,
@Nullable MultiTermQuery.RewriteMethod method,
boolean caseInsensitive,
QueryShardContext context
) {
if (context.allowExpensiveQueries() == false) {
throw new OpenSearchException(
"[prefix] queries cannot be executed when '"
+ ALLOW_EXPENSIVE_QUERIES.getKey()
+ "' is set to false. For optimised prefix queries on text "
+ "fields please enable [index_prefixes]."
);
}
failIfNotIndexedAndNoDocValues();
if (isSearchable() && hasDocValues()) {
Query indexQuery = super.prefixQuery(value, method, caseInsensitive, context);
Query dvQuery = super.prefixQuery(value, MultiTermQuery.DOC_VALUES_REWRITE, caseInsensitive, context);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}
if (hasDocValues()) {
if (caseInsensitive) {
return AutomatonQueries.caseInsensitivePrefixQuery(
(new Term(name(), indexedValueForSearch(value))),
MultiTermQuery.DOC_VALUES_REWRITE
);
}
return new PrefixQuery(new Term(name(), indexedValueForSearch(value)), MultiTermQuery.DOC_VALUES_REWRITE);
}
return super.prefixQuery(value, method, caseInsensitive, context);
}

@Override
public Query regexpQuery(
String value,
int syntaxFlags,
int matchFlags,
int maxDeterminizedStates,
@Nullable MultiTermQuery.RewriteMethod method,
QueryShardContext context
) {
if (context.allowExpensiveQueries() == false) {
throw new OpenSearchException(
"[regexp] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to " + "false."
);
}
failIfNotIndexedAndNoDocValues();
if (isSearchable() && hasDocValues()) {
Query indexQuery = super.regexpQuery(value, syntaxFlags, matchFlags, maxDeterminizedStates, method, context);
Query dvQuery = super.regexpQuery(
value,
syntaxFlags,
matchFlags,
maxDeterminizedStates,
MultiTermQuery.DOC_VALUES_REWRITE,
context
);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}
if (hasDocValues()) {
return new RegexpQuery(
new Term(name(), indexedValueForSearch(value)),
syntaxFlags,
matchFlags,
RegexpQuery.DEFAULT_PROVIDER,
maxDeterminizedStates,
MultiTermQuery.DOC_VALUES_REWRITE
);
}
return super.regexpQuery(value, syntaxFlags, matchFlags, maxDeterminizedStates, method, context);
}

@Override
public Query rangeQuery(Object lowerTerm, Object upperTerm, boolean includeLower, boolean includeUpper, QueryShardContext context) {
if (context.allowExpensiveQueries() == false) {
throw new OpenSearchException(
"[range] queries on [text] or [keyword] fields cannot be executed when '"
+ ALLOW_EXPENSIVE_QUERIES.getKey()
+ "' is set to false."
);
}
failIfNotIndexedAndNoDocValues();
if (isSearchable() && hasDocValues()) {
Query indexQuery = new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
Query dvQuery = new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper,
MultiTermQuery.DOC_VALUES_REWRITE
);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}
if (hasDocValues()) {
return new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper,
MultiTermQuery.DOC_VALUES_REWRITE
);
}
return new TermRangeQuery(
name(),
lowerTerm == null ? null : indexedValueForSearch(lowerTerm),
upperTerm == null ? null : indexedValueForSearch(upperTerm),
includeLower,
includeUpper
);
}

@Override
public Query fuzzyQuery(
Object value,
Fuzziness fuzziness,
int prefixLength,
int maxExpansions,
boolean transpositions,
@Nullable MultiTermQuery.RewriteMethod method,
QueryShardContext context
) {
failIfNotIndexedAndNoDocValues();
if (context.allowExpensiveQueries() == false) {
throw new OpenSearchException(
"[fuzzy] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to " + "false."
);
}
if (isSearchable() && hasDocValues()) {
Query indexQuery = super.fuzzyQuery(value, fuzziness, prefixLength, maxExpansions, transpositions, context);
Query dvQuery = super.fuzzyQuery(
value,
fuzziness,
prefixLength,
maxExpansions,
transpositions,
MultiTermQuery.DOC_VALUES_REWRITE,
context
);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}
if (hasDocValues()) {
return new FuzzyQuery(
new Term(name(), indexedValueForSearch(value)),
fuzziness.asDistance(BytesRefs.toString(value)),
prefixLength,
maxExpansions,
transpositions,
MultiTermQuery.DOC_VALUES_REWRITE
);
}
return super.fuzzyQuery(value, fuzziness, prefixLength, maxExpansions, transpositions, context);
}

@Override
public Query wildcardQuery(
String value,
@Nullable MultiTermQuery.RewriteMethod method,
boolean caseInsensitve,
boolean caseInsensitive,
QueryShardContext context
) {
// keyword field types are always normalized, so ignore case sensitivity and force normalize the wildcard
if (context.allowExpensiveQueries() == false) {
throw new OpenSearchException(
"[wildcard] queries cannot be executed when '" + ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to " + "false."
);
}
failIfNotIndexedAndNoDocValues();
// keyword field types are always normalized, so ignore case sensitivity and force normalize the
// wildcard
// query text
return super.wildcardQuery(value, method, caseInsensitve, true, context);
if (isSearchable() && hasDocValues()) {
Query indexQuery = super.wildcardQuery(value, method, caseInsensitive, true, context);
Query dvQuery = super.wildcardQuery(value, MultiTermQuery.DOC_VALUES_REWRITE, caseInsensitive, true, context);
return new IndexOrDocValuesQuery(indexQuery, dvQuery);
}
if (hasDocValues()) {
Term term;
value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer());
term = new Term(name(), value);
if (caseInsensitive) {
return AutomatonQueries.caseInsensitiveWildcardQuery(term, method);
}
return new WildcardQuery(term, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT, MultiTermQuery.DOC_VALUES_REWRITE);
}
return super.wildcardQuery(value, method, caseInsensitive, true, context);
}

}

private final boolean indexed;
Expand Down Expand Up @@ -422,8 +646,10 @@ protected KeywordFieldMapper(
this.indexAnalyzers = builder.indexAnalyzers;
}

/** Values that have more chars than the return value of this method will
* be skipped at parsing time. */
/**
* Values that have more chars than the return value of this method will
* be skipped at parsing time.
*/
public int ignoreAbove() {
return ignoreAbove;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,21 @@ public Query fuzzyQuery(
);
}

// Fuzzy Query with re-write method
public Query fuzzyQuery(
Object value,
Fuzziness fuzziness,
int prefixLength,
int maxExpansions,
boolean transpositions,
@Nullable MultiTermQuery.RewriteMethod method,
QueryShardContext context
) {
throw new IllegalArgumentException(
"Can only use fuzzy queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"
);
}

// Case sensitive form of prefix query
public final Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
return prefixQuery(value, method, false, context);
Expand Down Expand Up @@ -433,6 +448,15 @@ protected final void failIfNotIndexed() {
}
}

protected final void failIfNotIndexedAndNoDocValues() {
// we fail if a field is both not indexed and does not have doc_values enabled
if (isIndexed == false && hasDocValues() == false) {
throw new IllegalArgumentException(
"Cannot search on field [" + name() + "] since it is both not indexed," + " and does not have doc_values enabled."
);
}
}

public boolean eagerGlobalOrdinals() {
return eagerGlobalOrdinals;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
package org.opensearch.search.fetch.subphase.highlight;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexOrDocValuesQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
Expand Down Expand Up @@ -104,6 +105,8 @@ protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> t
super.extract(((FunctionScoreQuery) query).getSubQuery(), boost, terms);
} else if (query instanceof OpenSearchToParentBlockJoinQuery) {
super.extract(((OpenSearchToParentBlockJoinQuery) query).getChildQuery(), boost, terms);
} else if (query instanceof IndexOrDocValuesQuery) {
super.extract(((IndexOrDocValuesQuery) query).getIndexQuery(), boost, terms);
} else {
super.extract(query, boost, terms);
}
Expand Down
Loading

0 comments on commit 0d30590

Please sign in to comment.