Skip to content

Commit

Permalink
Optimize global ordinal includes/excludes for prefix matching
Browse files Browse the repository at this point in the history
If an aggregation specifies includes or excludes based on a regular
expression, and the regular expression has a finite expansion followed
by .*, then we can optimize the global ordinal filter.

Specifically, in this case, we can expand the matching prefixes, then
include/exclude the range of global ordinals that start with each
prefix.

Signed-off-by: Michael Froh <[email protected]>
  • Loading branch information
msfroh committed Jun 15, 2024
1 parent cf2c31f commit 05c8e3c
Showing 1 changed file with 169 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@
import org.opensearch.search.DocValueFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
Expand Down Expand Up @@ -86,6 +90,10 @@ public class IncludeExclude implements Writeable, ToXContentFragment {
* https://github.com/opensearch-project/OpenSearch/issues/2858
*/
private static final int DEFAULT_MAX_REGEX_LENGTH = 1000;
/**
 * The maximum number of prefixes to extract from a regex in tryCreatePrefixOrdinalsFilter.
 * It is passed to expandPrefixes as the expansion limit: when a pattern would expand to more
 * literal prefixes than this, expandPrefixes returns null and no prefix-based filter is created.
 */
private static final int MAX_PREFIXES = 1000;

// for parsing purposes only
// TODO: move all aggs to the same package so that this stuff could be pkg-private
Expand Down Expand Up @@ -393,6 +401,78 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro

}

/**
 * An ordinals filter that includes/excludes all ordinals corresponding to terms starting with the given prefixes.
 * <p>
 * Because the terms behind the global ordinals are sorted, all terms sharing a prefix occupy one contiguous
 * range of ordinals. For each prefix the filter locates the first and one-past-last ordinal of that range and
 * sets (include) or clears (exclude) the whole range in the accepted-ordinals bitset.
 */
static class PrefixBackedOrdinalsFilter extends OrdinalsFilter {

    private final SortedSet<BytesRef> includePrefixes, excludePrefixes;

    /**
     * @param includePrefixes prefixes of terms to accept; an empty set means "accept every term"
     * @param excludePrefixes prefixes of terms to reject; an empty set means "reject nothing"
     */
    private PrefixBackedOrdinalsFilter(SortedSet<BytesRef> includePrefixes, SortedSet<BytesRef> excludePrefixes) {
        this.includePrefixes = includePrefixes;
        this.excludePrefixes = excludePrefixes;
    }

    /**
     * Computes the smallest byte sequence that sorts after every sequence starting with {@code bytesRef}.
     *
     * @param bytesRef the prefix; it is not modified
     * @return the exclusive upper bound for terms starting with the prefix, or {@code null} when no such
     *         bound exists (the prefix is empty or consists solely of 0xFF bytes), in which case every
     *         term from the start of the range onwards matches
     */
    private static BytesRef nextBytesRef(BytesRef bytesRef) {
        BytesRef next = BytesRef.deepCopyOf(bytesRef);
        // Index into the copy using the copy's own offset; the source offset is irrelevant here.
        int pos = next.offset + next.length - 1;
        // A trailing 0xFF byte cannot be incremented, so skip it; it is truncated below.
        while (pos >= next.offset && next.bytes[pos] == -1) {
            pos--;
        }
        if (pos < next.offset) {
            // Every byte in our prefix had value 0xFF. We must match all subsequent ordinals.
            return null;
        }
        next.bytes[pos]++;
        // Drop the skipped 0xFF bytes: e.g. the bound for [a, 0xFF] is [b], not [b, 0x00],
        // which would wrongly admit the term "b" itself.
        next.length = pos - next.offset + 1;
        return next;
    }

    /**
     * Callback receiving a half-open range of ordinals: {@code a} inclusive, {@code b} exclusive.
     */
    @FunctionalInterface
    private interface LongLongBiconsumer {
        void accept(long a, long b);
    }

    /**
     * Invokes {@code consumer} with the ordinal range of terms starting with each of the given prefixes.
     * Prefixes that match no term are skipped.
     *
     * @param globalOrdinals the doc values used to translate between terms and ordinals
     * @param length         the number of ordinals; reported ranges never extend past it
     * @param prefixes       the prefixes to resolve
     * @param consumer       receives {@code (startOrd, endOrd)} with {@code startOrd < endOrd <= length}
     * @throws IOException if looking up a term or an ordinal fails
     */
    private static void process(
        SortedSetDocValues globalOrdinals,
        long length,
        SortedSet<BytesRef> prefixes,
        LongLongBiconsumer consumer
    ) throws IOException {
        for (BytesRef prefix : prefixes) {
            long startOrd = globalOrdinals.lookupTerm(prefix);
            boolean exactMatch = startOrd >= 0;
            if (!exactMatch) {
                // A negative result encodes the insertion point as (-insertionPoint - 1).
                startOrd = -1 - startOrd;
            }
            if (startOrd >= length) {
                // The prefix sorts after every term. This must be checked before lookupOrd,
                // because startOrd is not a valid ordinal here.
                continue;
            }
            if (!exactMatch) {
                // Make sure that the term at startOrd starts with prefix. The prefix itself is not a term,
                // so a matching term must be strictly longer than the prefix.
                BytesRef startTerm = globalOrdinals.lookupOrd(startOrd);
                if (startTerm.length <= prefix.length
                    || !Arrays.equals(
                        startTerm.bytes,
                        startTerm.offset,
                        startTerm.offset + prefix.length,
                        prefix.bytes,
                        prefix.offset,
                        prefix.offset + prefix.length
                    )) {
                    continue;
                }
            }
            BytesRef next = nextBytesRef(prefix);
            if (next == null) {
                // No upper bound exists: everything from startOrd to the end matches.
                consumer.accept(startOrd, length);
            } else {
                long endOrd = globalOrdinals.lookupTerm(next);
                if (endOrd < 0) {
                    endOrd = -1 - endOrd;
                }
                if (startOrd < endOrd) {
                    consumer.accept(startOrd, endOrd);
                }
            }
        }
    }

    /**
     * Builds the bitset of accepted global ordinals: the included prefix ranges (or every ordinal
     * when there are no include prefixes) minus the excluded prefix ranges.
     *
     * @param globalOrdinals the doc values whose ordinals are being filtered
     * @return a bitset with one bit per ordinal, set for each accepted ordinal
     * @throws IOException if looking up a term or an ordinal fails
     */
    @Override
    public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
        LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount());
        if (!includePrefixes.isEmpty()) {
            process(globalOrdinals, accept.length(), includePrefixes, accept::set);
        } else if (accept.length() > 0) {
            // Exclude-only filter: start from "everything accepted" so the excludes have something to remove.
            accept.set(0, accept.length());
        }
        process(globalOrdinals, accept.length(), excludePrefixes, accept::clear);
        return accept;
    }
}

private final String include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
private final int incZeroBasedPartition;
Expand Down Expand Up @@ -709,8 +789,13 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {
}

public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRegexLength) {

if (isRegexBased()) {
if ((include == null || include.endsWith(".*")) && (exclude == null || exclude.endsWith(".*"))) {
PrefixBackedOrdinalsFilter prefixBackedOrdinalsFilter = tryCreatePrefixOrdinalsFilter(maxRegexLength);
if (prefixBackedOrdinalsFilter != null) {
return prefixBackedOrdinalsFilter;
}
}
return new AutomatonBackedOrdinalsFilter(toAutomaton(maxRegexLength));
}
if (isPartitionBased()) {
Expand All @@ -720,6 +805,89 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRege
return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
}


/**
 * Attempts to expand a regular expression into the finite list of literal strings it matches.
 * Only unions, concatenations, single characters and literal strings are supported.
 *
 * @param regExp      the parsed expression to expand
 * @param maxPrefixes the maximum number of strings the expansion may produce
 * @return the expanded strings, or {@code null} if the expression contains an unsupported construct
 *         or would expand to more than {@code maxPrefixes} strings
 */
private static List<String> expandPrefixes(RegExp regExp, int maxPrefixes) {
    switch (regExp.kind) {
        case REGEXP_UNION: {
            // Flatten a chain of unions nested on the left-hand side iteratively;
            // unions nested elsewhere are handled by the recursive call below.
            List<RegExp> leaves = new ArrayList<>();
            RegExp current = regExp;
            while (current.exp1.kind == RegExp.Kind.REGEXP_UNION) {
                leaves.add(current.exp2);
                current = current.exp1;
            }
            leaves.add(current.exp2);
            leaves.add(current.exp1);
            List<String> output = new ArrayList<>();
            for (RegExp leaf : leaves) {
                List<String> expanded = expandPrefixes(leaf, maxPrefixes);
                // Widen to long so the size check cannot overflow for very large limits.
                if (expanded == null || (long) output.size() + expanded.size() > maxPrefixes) {
                    return null;
                }
                output.addAll(expanded);
            }
            return output;
        }
        case REGEXP_CONCATENATION: {
            List<String> heads = expandPrefixes(regExp.exp1, maxPrefixes);
            if (heads == null) {
                // No point expanding the right-hand side if the left-hand side is not expandable.
                return null;
            }
            List<String> tails = expandPrefixes(regExp.exp2, maxPrefixes);
            if (tails == null) {
                return null;
            }
            // Widen to long so the cross-product size check cannot overflow.
            if ((long) heads.size() * tails.size() > maxPrefixes) {
                return null;
            }
            List<String> combined = new ArrayList<>(heads.size() * tails.size());
            for (String head : heads) {
                for (String tail : tails) {
                    combined.add(head + tail);
                }
            }
            return combined;
        }
        case REGEXP_CHAR:
            return List.of(Character.toString(regExp.c));
        case REGEXP_STRING:
            return List.of(regExp.s);
        default:
            // Any other construct is treated as not expandable.
            return null;
    }
}

/**
 * Extracts the literal prefixes from a pattern of the form "finite expansion followed by a repeated wildcard".
 *
 * @param pattern        the include/exclude regular expression, may be {@code null}
 * @param maxRegexLength the maximum pattern length, enforced via validateRegExpStringLength
 * @return an empty set when {@code pattern} is {@code null}; the sorted prefixes when the pattern has the
 *         supported shape; {@code null} when the pattern cannot be reduced to a list of prefixes
 */
private static SortedSet<BytesRef> extractPrefixes(String pattern, int maxRegexLength) {
    if (pattern == null) {
        return Collections.emptySortedSet();
    }
    validateRegExpStringLength(pattern, maxRegexLength);
    RegExp parsed = new RegExp(pattern);

    // The top level must be "<head><something repeated>".
    if (parsed.kind != RegExp.Kind.REGEXP_CONCATENATION || parsed.exp2.kind != RegExp.Kind.REGEXP_REPEAT) {
        return null;
    }

    // The repeated element must be any-char or any-string.
    RegExp.Kind repeatedKind = parsed.exp2.exp1.kind;
    if (repeatedKind != RegExp.Kind.REGEXP_ANYCHAR && repeatedKind != RegExp.Kind.REGEXP_ANYSTRING) {
        return null;
    }

    List<String> expanded = expandPrefixes(parsed.exp1, MAX_PREFIXES);
    if (expanded == null) {
        return null;
    }

    SortedSet<BytesRef> sortedPrefixes = new TreeSet<>();
    for (String literal : expanded) {
        sortedPrefixes.add(new BytesRef(literal));
    }
    return sortedPrefixes;
}

/**
 * Tries to build a prefix-based ordinals filter from the include and exclude patterns.
 *
 * @param maxRegexLength the maximum pattern length, forwarded to extractPrefixes
 * @return the filter, or {@code null} if either pattern cannot be reduced to a set of prefixes
 */
private PrefixBackedOrdinalsFilter tryCreatePrefixOrdinalsFilter(int maxRegexLength) {
    final SortedSet<BytesRef> included = extractPrefixes(include, maxRegexLength);
    if (included != null) {
        // The exclude pattern is only examined once the include pattern is known to be usable.
        final SortedSet<BytesRef> excluded = extractPrefixes(exclude, maxRegexLength);
        if (excluded != null) {
            return new PrefixBackedOrdinalsFilter(included, excluded);
        }
    }
    return null;
}

public LongFilter convertToLongFilter(DocValueFormat format) {

if (isPartitionBased()) {
Expand Down

0 comments on commit 05c8e3c

Please sign in to comment.