Skip to content

Commit

Permalink
Adding new SortedUnsignedLongDocValuesSetQuery to allow for BigIntege…
Browse files Browse the repository at this point in the history
…r Terms query

Signed-off-by: Harsha Vamsi Kalluri <[email protected]>
  • Loading branch information
harshavamsi committed Dec 7, 2023
1 parent 670afb4 commit 2cdeb08
Show file tree
Hide file tree
Showing 6 changed files with 324 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ setup:
integer: 1
long: 1
short: 1
unsigned_long: 1
unsigned_long: 10223372036854775807

- do:
headers:
Expand All @@ -74,7 +74,7 @@ setup:
integer: 1
long: 1
short: 1
unsigned_long: 1
unsigned_long: 10223372036854775807


- do:
Expand All @@ -91,7 +91,7 @@ setup:
integer: 1
long: 1
short: 1
unsigned_long: 1
unsigned_long: 10223372036854775807


- do:
Expand Down
138 changes: 138 additions & 0 deletions server/src/main/java/org/apache/lucene/document/LongHashSet.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.apache.lucene.document;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;

import java.util.Arrays;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

/** Set of longs, optimized for docvalues usage */
/**
 * Set of longs, optimized for docvalues usage.
 *
 * <p>Open-addressing hash set with linear probing, forked from Lucene so it can serve
 * doc-values set queries with either signed or unsigned long semantics.
 */
public final class LongHashSet implements Accountable {
    private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(LongHashSet.class);

    // Sentinel marking an empty table slot. A genuine Long.MIN_VALUE element cannot be
    // stored in the table and is tracked separately via hasMissingValue.
    private static final long MISSING = Long.MIN_VALUE;

    final long[] table;
    final int mask;
    final boolean hasMissingValue;
    final int size;
    /** first value of the sorted input — the minimum under the caller's sort order — or Long.MAX_VALUE for an empty set */
    public final long minValue;
    /** last value of the sorted input — the maximum under the caller's sort order — or Long.MIN_VALUE for an empty set */
    public final long maxValue;

    /**
     * Construct a set. Values must be provided in sorted order: either signed order, or
     * unsigned order ({@link Long#compareUnsigned}). The latter is what unsigned-long
     * queries pass, where values above {@code Long.MAX_VALUE} appear as negative longs.
     */
    public LongHashSet(long[] values) {
        // The previous signed-only assertion spuriously failed (under -ea) for
        // unsigned-sorted input crossing the sign bit; accept either ordering.
        assert assertSorted(values);
        int tableSize = Math.toIntExact(values.length * 3L / 2);
        tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
        assert tableSize >= values.length * 3L / 2;
        table = new long[tableSize];
        Arrays.fill(table, MISSING);
        mask = tableSize - 1;
        boolean hasMissingValue = false;
        int size = 0;
        for (long value : values) {
            if (value == MISSING) {
                // Long.MIN_VALUE collides with the empty-slot sentinel; track it aside.
                size += hasMissingValue ? 0 : 1;
                hasMissingValue = true;
            } else if (add(value)) {
                ++size;
            }
        }
        this.hasMissingValue = hasMissingValue;
        this.size = size;
        this.minValue = values.length == 0 ? Long.MAX_VALUE : values[0];
        this.maxValue = values.length == 0 ? Long.MIN_VALUE : values[values.length - 1];
    }

    /**
     * Assertion helper: verifies the input is sorted in signed <em>or</em> unsigned order.
     * Always returns {@code true} so it can be used inside an {@code assert} and costs
     * nothing when assertions are disabled.
     */
    private static boolean assertSorted(long[] values) {
        boolean signedSorted = true;
        boolean unsignedSorted = true;
        for (int i = 1; i < values.length; i++) {
            signedSorted &= values[i - 1] <= values[i];
            unsignedSorted &= Long.compareUnsigned(values[i - 1], values[i]) <= 0;
        }
        assert signedSorted || unsignedSorted : "values must be provided in sorted order";
        return true;
    }

    /** Linear-probing insert; returns true iff the value was not already present. */
    private boolean add(long l) {
        assert l != MISSING;
        final int slot = Long.hashCode(l) & mask;
        for (int i = slot;; i = (i + 1) & mask) {
            if (table[i] == MISSING) {
                table[i] = l;
                return true;
            } else if (table[i] == l) {
                // already added
                return false;
            }
        }
    }

    /**
     * check for membership in the set.
     *
     * <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
     * calling this.
     */
    public boolean contains(long l) {
        if (l == MISSING) {
            return hasMissingValue;
        }
        final int slot = Long.hashCode(l) & mask;
        for (int i = slot;; i = (i + 1) & mask) {
            if (table[i] == MISSING) {
                // probe chain ended without a match
                return false;
            } else if (table[i] == l) {
                return true;
            }
        }
    }

    /** returns a stream of all values contained in this set */
    LongStream stream() {
        LongStream stream = Arrays.stream(table).filter(v -> v != MISSING);
        if (hasMissingValue) {
            // Long.MIN_VALUE is kept out of the table; re-add it to the stream.
            stream = LongStream.concat(LongStream.of(MISSING), stream);
        }
        return stream;
    }

    @Override
    public int hashCode() {
        return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
    }

    @Override
    public boolean equals(Object obj) {
        if (obj != null && obj instanceof LongHashSet) {
            LongHashSet that = (LongHashSet) obj;
            return size == that.size
                && minValue == that.minValue
                && maxValue == that.maxValue
                && mask == that.mask
                && hasMissingValue == that.hasMissingValue
                && Arrays.equals(table, that.table);
        }
        return false;
    }

    @Override
    public String toString() {
        return stream().mapToObj(String::valueOf).collect(Collectors.joining(", ", "[", "]"));
    }

    /** number of elements in the set */
    public int size() {
        return size;
    }

    @Override
    public long ramBytesUsed() {
        return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.document;

import org.apache.lucene.document.LongHashSet;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Objects;

/**
* The {@link org.apache.lucene.document.SortedNumericDocValuesSetQuery} implementation for unsigned long numeric data type.
*
* @opensearch.internal
*/
public abstract class SortedUnsignedLongDocValuesSetQuery extends Query {

private final String field;
private final LongHashSet numbers;

SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) {
this.field = Objects.requireNonNull(field);
Arrays.sort(numbers);
this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
}

@Override
public String toString(String field) {
return new StringBuilder().append(field).append(": ").append(numbers.toString()).toString();
}

@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(field)) {
visitor.visitLeaf(this);
}
}

@Override
public Query rewrite(IndexSearcher indexSearcher) throws IOException {
if (numbers.size() == 0) {
return new MatchNoDocsQuery();
}
return super.rewrite(indexSearcher);
}

@Override
public boolean equals(Object other) {
if (sameClassAs(other) == false) {
return false;
}
SortedUnsignedLongDocValuesSetQuery that = (SortedUnsignedLongDocValuesSetQuery) other;
return field.equals(that.field) && numbers.equals(that.numbers);
}

@Override
public int hashCode() {
return Objects.hash(classHash(), field, numbers);
}

abstract SortedNumericDocValues getValues(LeafReader reader, String field) throws IOException;

@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
return new ConstantScoreWeight(this, boost) {

@Override
public boolean isCacheable(LeafReaderContext ctx) {
return DocValues.isCacheable(ctx, field);
}

@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
SortedNumericDocValues values = getValues(context.reader(), field);
if (values == null) {
return null;
}
final NumericDocValues singleton = DocValues.unwrapSingleton(values);
final TwoPhaseIterator iterator;
if (singleton != null) {
iterator = new TwoPhaseIterator(singleton) {
@Override
public boolean matches() throws IOException {
long value = singleton.longValue();
return Long.compareUnsigned(value, numbers.minValue) >= 0
&& Long.compareUnsigned(value, numbers.maxValue) <= 0
&& numbers.contains(value);
}

@Override
public float matchCost() {
return 5; // 2 comparisions, possible lookup in the set
}
};
} else {
iterator = new TwoPhaseIterator(values) {
@Override
public boolean matches() throws IOException {
int count = values.docValueCount();
for (int i = 0; i < count; i++) {
final long value = values.nextValue();
if (Long.compareUnsigned(value, numbers.minValue) < 0) {
continue;
} else if (Long.compareUnsigned(value, numbers.maxValue) > 0) {
return false; // values are sorted, terminate
} else if (numbers.contains(value)) {
return true;
}
}
return false;
}

@Override
public float matchCost() {
return 5; // 2 comparisons, possible lookup in the set
}
};
}
return new ConstantScoreScorer(this, score(), scoreMode, iterator);
}
};
}

public static Query newSlowSetQuery(String field, BigInteger... values) {
return new SortedUnsignedLongDocValuesSetQuery(field, values) {
@Override
SortedNumericDocValues getValues(LeafReader reader, String field) throws IOException {
FieldInfo info = reader.getFieldInfos().fieldInfo(field);
if (info == null) {
// Queries have some optimizations when one sub scorer returns null rather
// than a scorer that does not match any documents
return null;
}
return DocValues.getSortedNumeric(reader, field);
}
};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import org.opensearch.core.xcontent.XContentParser;
import org.opensearch.core.xcontent.XContentParser.Token;
import org.opensearch.index.document.SortedUnsignedLongDocValuesRangeQuery;
import org.opensearch.index.document.SortedUnsignedLongDocValuesSetQuery;
import org.opensearch.index.fielddata.IndexFieldData;
import org.opensearch.index.fielddata.IndexNumericFieldData.NumericType;
import org.opensearch.index.fielddata.plain.SortedNumericIndexFieldData;
Expand Down Expand Up @@ -1018,11 +1019,11 @@ public Query termsQuery(String field, List<Object> values, boolean hasDocvalues,

if (isSearchable && hasDocvalues) {
Query query = BigIntegerPoint.newSetQuery(field, v);
Query dvQuery = SortedNumericDocValuesField.newSlowSetQuery(field, points);
Query dvQuery = SortedUnsignedLongDocValuesSetQuery.newSlowSetQuery(field, v);
return new IndexOrDocValuesQuery(query, dvQuery);
}
if (hasDocvalues) {
return SortedNumericDocValuesField.newSlowSetQuery(field, points);
return SortedUnsignedLongDocValuesSetQuery.newSlowSetQuery(field, v);
}
return BigIntegerPoint.newSetQuery(field, v);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.document.SortedUnsignedLongDocValuesRangeQuery;
import org.opensearch.index.document.SortedUnsignedLongDocValuesSetQuery;
import org.opensearch.index.fielddata.IndexNumericFieldData;
import org.opensearch.index.mapper.MappedFieldType.Relation;
import org.opensearch.index.mapper.NumberFieldMapper.NumberFieldType;
Expand Down Expand Up @@ -413,6 +414,22 @@ public void testUnsignedLongRangeQuery() {
assertEquals("Cannot search on field [field] since it is both not indexed, and does not have doc_values enabled.", e.getMessage());
}

// Verifies that an unsigned_long terms query builds an IndexOrDocValuesQuery combining the
// points-based set query with the unsigned doc-values set query, and that string terms are
// parsed into BigInteger values.
public void testUnsignedLongTermsQuery() {
MappedFieldType ft = new NumberFieldMapper.NumberFieldType("field", NumberFieldMapper.NumberType.UNSIGNED_LONG);
Query expected = new IndexOrDocValuesQuery(
BigIntegerPoint.newSetQuery("field", BigInteger.valueOf(1), BigInteger.valueOf(3)),
SortedUnsignedLongDocValuesSetQuery.newSlowSetQuery("field", BigInteger.valueOf(1), BigInteger.valueOf(3))
);
assertEquals(expected, ft.termsQuery(List.of("1", "3"), MOCK_QSC));

// A field with neither index nor doc values must reject the query with a clear error.
MappedFieldType unsearchable = unsearchable();
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> unsearchable.termsQuery(List.of("1", "3"), MOCK_QSC)
);
assertEquals("Cannot search on field [field] since it is both not indexed, and does not have doc_values enabled.", e.getMessage());
}

public void testDoubleRangeQuery() {
MappedFieldType ft = new NumberFieldMapper.NumberFieldType("field", NumberFieldMapper.NumberType.DOUBLE);
Query expected = new IndexOrDocValuesQuery(
Expand Down

0 comments on commit 2cdeb08

Please sign in to comment.