Skip to content

Commit

Permalink
Adding new SortedUnsignedLongDocValuesSetQuery to allow for BigIntege…
Browse files Browse the repository at this point in the history
…r Terms query

Signed-off-by: Harsha Vamsi Kalluri <[email protected]>
  • Loading branch information
harshavamsi committed Dec 7, 2023
1 parent 670afb4 commit 2cdeb08
Show file tree
Hide file tree
Showing 6 changed files with 324 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ setup:
integer: 1
long: 1
short: 1
unsigned_long: 1
unsigned_long: 10223372036854775807

- do:
headers:
Expand All @@ -74,7 +74,7 @@ setup:
integer: 1
long: 1
short: 1
unsigned_long: 1
unsigned_long: 10223372036854775807


- do:
Expand All @@ -91,7 +91,7 @@ setup:
integer: 1
long: 1
short: 1
unsigned_long: 1
unsigned_long: 10223372036854775807


- do:
Expand Down
138 changes: 138 additions & 0 deletions server/src/main/java/org/apache/lucene/document/LongHashSet.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.apache.lucene.document;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;

import java.util.Arrays;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.LongStream;

/** Set of longs, optimized for docvalues usage */
/**
 * Set of longs, optimized for docvalues usage.
 *
 * <p>Open-addressing hash set with linear probing, forked from Lucene so it can serve
 * doc-values set queries with either signed or unsigned long semantics.
 */
public final class LongHashSet implements Accountable {
    private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(LongHashSet.class);

    // Sentinel marking an empty table slot. A genuine Long.MIN_VALUE element cannot be
    // stored in the table and is tracked separately via hasMissingValue.
    private static final long MISSING = Long.MIN_VALUE;

    final long[] table;
    final int mask;
    final boolean hasMissingValue;
    final int size;
    /** first value of the sorted input — the minimum under the caller's sort order — or Long.MAX_VALUE for an empty set */
    public final long minValue;
    /** last value of the sorted input — the maximum under the caller's sort order — or Long.MIN_VALUE for an empty set */
    public final long maxValue;

    /**
     * Construct a set. Values must be provided in sorted order: either signed order, or
     * unsigned order ({@link Long#compareUnsigned}). The latter is what unsigned-long
     * queries pass, where values above {@code Long.MAX_VALUE} appear as negative longs.
     */
    public LongHashSet(long[] values) {
        // The previous signed-only assertion spuriously failed (under -ea) for
        // unsigned-sorted input crossing the sign bit; accept either ordering.
        assert assertSorted(values);
        int tableSize = Math.toIntExact(values.length * 3L / 2);
        tableSize = 1 << PackedInts.bitsRequired(tableSize); // make it a power of 2
        assert tableSize >= values.length * 3L / 2;
        table = new long[tableSize];
        Arrays.fill(table, MISSING);
        mask = tableSize - 1;
        boolean hasMissingValue = false;
        int size = 0;
        for (long value : values) {
            if (value == MISSING) {
                // Long.MIN_VALUE collides with the empty-slot sentinel; track it aside.
                size += hasMissingValue ? 0 : 1;
                hasMissingValue = true;
            } else if (add(value)) {
                ++size;
            }
        }
        this.hasMissingValue = hasMissingValue;
        this.size = size;
        this.minValue = values.length == 0 ? Long.MAX_VALUE : values[0];
        this.maxValue = values.length == 0 ? Long.MIN_VALUE : values[values.length - 1];
    }

    /**
     * Assertion helper: verifies the input is sorted in signed <em>or</em> unsigned order.
     * Always returns {@code true} so it can be used inside an {@code assert} and costs
     * nothing when assertions are disabled.
     */
    private static boolean assertSorted(long[] values) {
        boolean signedSorted = true;
        boolean unsignedSorted = true;
        for (int i = 1; i < values.length; i++) {
            signedSorted &= values[i - 1] <= values[i];
            unsignedSorted &= Long.compareUnsigned(values[i - 1], values[i]) <= 0;
        }
        assert signedSorted || unsignedSorted : "values must be provided in sorted order";
        return true;
    }

    /** Linear-probing insert; returns true iff the value was not already present. */
    private boolean add(long l) {
        assert l != MISSING;
        final int slot = Long.hashCode(l) & mask;
        for (int i = slot;; i = (i + 1) & mask) {
            if (table[i] == MISSING) {
                table[i] = l;
                return true;
            } else if (table[i] == l) {
                // already added
                return false;
            }
        }
    }

    /**
     * check for membership in the set.
     *
     * <p>You should use {@link #minValue} and {@link #maxValue} to guide/terminate iteration before
     * calling this.
     */
    public boolean contains(long l) {
        if (l == MISSING) {
            return hasMissingValue;
        }
        final int slot = Long.hashCode(l) & mask;
        for (int i = slot;; i = (i + 1) & mask) {
            if (table[i] == MISSING) {
                // probe chain ended without a match
                return false;
            } else if (table[i] == l) {
                return true;
            }
        }
    }

    /** returns a stream of all values contained in this set */
    LongStream stream() {
        LongStream stream = Arrays.stream(table).filter(v -> v != MISSING);
        if (hasMissingValue) {
            // Long.MIN_VALUE is kept out of the table; re-add it to the stream.
            stream = LongStream.concat(LongStream.of(MISSING), stream);
        }
        return stream;
    }

    @Override
    public int hashCode() {
        return Objects.hash(size, minValue, maxValue, mask, hasMissingValue, Arrays.hashCode(table));
    }

    @Override
    public boolean equals(Object obj) {
        if (obj != null && obj instanceof LongHashSet) {
            LongHashSet that = (LongHashSet) obj;
            return size == that.size
                && minValue == that.minValue
                && maxValue == that.maxValue
                && mask == that.mask
                && hasMissingValue == that.hasMissingValue
                && Arrays.equals(table, that.table);
        }
        return false;
    }

    @Override
    public String toString() {
        return stream().mapToObj(String::valueOf).collect(Collectors.joining(", ", "[", "]"));
    }

    /** number of elements in the set */
    public int size() {
        return size;
    }

    @Override
    public long ramBytesUsed() {
        return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(table);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.document;

import org.apache.lucene.document.LongHashSet;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;

import java.io.IOException;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Objects;

/**
* The {@link org.apache.lucene.document.SortedNumericDocValuesSetQuery} implementation for unsigned long numeric data type.
*
* @opensearch.internal
*/
public abstract class SortedUnsignedLongDocValuesSetQuery extends Query {

private final String field;
private final LongHashSet numbers;

SortedUnsignedLongDocValuesSetQuery(String field, BigInteger[] numbers) {
this.field = Objects.requireNonNull(field);
Arrays.sort(numbers);
this.numbers = new LongHashSet(Arrays.stream(numbers).mapToLong(n -> n.longValue()).toArray());
}

@Override
public String toString(String field) {
return new StringBuilder().append(field).append(": ").append(numbers.toString()).toString();
}

@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(field)) {
visitor.visitLeaf(this);
}
}

@Override
public Query rewrite(IndexSearcher indexSearcher) throws IOException {
if (numbers.size() == 0) {
return new MatchNoDocsQuery();
}
return super.rewrite(indexSearcher);
}

@Override
public boolean equals(Object other) {
if (sameClassAs(other) == false) {
return false;
}
SortedUnsignedLongDocValuesSetQuery that = (SortedUnsignedLongDocValuesSetQuery) other;
return field.equals(that.field) && numbers.equals(that.numbers);
}

@Override
public int hashCode() {
return Objects.hash(classHash(), field, numbers);
}

abstract SortedNumericDocValues getValues(LeafReader reader, String field) throws IOException;

@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
return new ConstantScoreWeight(this, boost) {

@Override
public boolean isCacheable(LeafReaderContext ctx) {
return DocValues.isCacheable(ctx, field);
}

@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
SortedNumericDocValues values = getValues(context.reader(), field);
if (values == null) {
return null;
}
final NumericDocValues singleton = DocValues.unwrapSingleton(values);
final TwoPhaseIterator iterator;
if (singleton != null) {
iterator = new TwoPhaseIterator(singleton) {
@Override
public boolean matches() throws IOException {
long value = singleton.longValue();
return Long.compareUnsigned(value, numbers.minValue) >= 0
&& Long.compareUnsigned(value, numbers.maxValue) <= 0
&& numbers.contains(value);
}

@Override
public float matchCost() {
return 5; // 2 comparisions, possible lookup in the set
}
};
} else {
iterator = new TwoPhaseIterator(values) {
@Override
public boolean matches() throws IOException {
int count = values.docValueCount();
for (int i = 0; i < count; i++) {
final long value = values.nextValue();
if (Long.compareUnsigned(value, numbers.minValue) < 0) {
continue;
} else if (Long.compareUnsigned(value, numbers.maxValue) > 0) {
return false; // values are sorted, terminate
} else if (numbers.contains(value)) {
return true;
}
}
return false;
}

@Override
public float matchCost() {
return 5; // 2 comparisons, possible lookup in the set
}
};
}
return new ConstantScoreScorer(this, score(), scoreMode, iterator);
}
};
}

public static Query newSlowSetQuery(String field, BigInteger... values) {
return new SortedUnsignedLongDocValuesSetQuery(field, values) {
@Override
SortedNumericDocValues getValues(LeafReader reader, String field) throws IOException {
FieldInfo info = reader.getFieldInfos().fieldInfo(field);
if (info == null) {
// Queries have some optimizations when one sub scorer returns null rather
// than a scorer that does not match any documents
return null;
}
return DocValues.getSortedNumeric(reader, field);
}
};
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
import org.opensearch.core.xcontent.XContentParser;
import org.opensearch.core.xcontent.XContentParser.Token;
import org.opensearch.index.document.SortedUnsignedLongDocValuesRangeQuery;
import org.opensearch.index.document.SortedUnsignedLongDocValuesSetQuery;
import org.opensearch.index.fielddata.IndexFieldData;
import org.opensearch.index.fielddata.IndexNumericFieldData.NumericType;
import org.opensearch.index.fielddata.plain.SortedNumericIndexFieldData;
Expand Down Expand Up @@ -1018,11 +1019,11 @@ public Query termsQuery(String field, List<Object> values, boolean hasDocvalues,

if (isSearchable && hasDocvalues) {
Query query = BigIntegerPoint.newSetQuery(field, v);
Query dvQuery = SortedNumericDocValuesField.newSlowSetQuery(field, points);
Query dvQuery = SortedUnsignedLongDocValuesSetQuery.newSlowSetQuery(field, v);
return new IndexOrDocValuesQuery(query, dvQuery);
}
if (hasDocvalues) {
return SortedNumericDocValuesField.newSlowSetQuery(field, points);
return SortedUnsignedLongDocValuesSetQuery.newSlowSetQuery(field, v);
}
return BigIntegerPoint.newSetQuery(field, v);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.document.SortedUnsignedLongDocValuesRangeQuery;
import org.opensearch.index.document.SortedUnsignedLongDocValuesSetQuery;
import org.opensearch.index.fielddata.IndexNumericFieldData;
import org.opensearch.index.mapper.MappedFieldType.Relation;
import org.opensearch.index.mapper.NumberFieldMapper.NumberFieldType;
Expand Down Expand Up @@ -413,6 +414,22 @@ public void testUnsignedLongRangeQuery() {
assertEquals("Cannot search on field [field] since it is both not indexed, and does not have doc_values enabled.", e.getMessage());
}

// Verifies that an unsigned_long terms query builds an IndexOrDocValuesQuery combining the
// points-based set query with the unsigned doc-values set query, and that string terms are
// parsed into BigInteger values.
public void testUnsignedLongTermsQuery() {
MappedFieldType ft = new NumberFieldMapper.NumberFieldType("field", NumberFieldMapper.NumberType.UNSIGNED_LONG);
Query expected = new IndexOrDocValuesQuery(
BigIntegerPoint.newSetQuery("field", BigInteger.valueOf(1), BigInteger.valueOf(3)),
SortedUnsignedLongDocValuesSetQuery.newSlowSetQuery("field", BigInteger.valueOf(1), BigInteger.valueOf(3))
);
assertEquals(expected, ft.termsQuery(List.of("1", "3"), MOCK_QSC));

// A field with neither index nor doc values must reject the query with a clear error.
MappedFieldType unsearchable = unsearchable();
IllegalArgumentException e = expectThrows(
IllegalArgumentException.class,
() -> unsearchable.termsQuery(List.of("1", "3"), MOCK_QSC)
);
assertEquals("Cannot search on field [field] since it is both not indexed, and does not have doc_values enabled.", e.getMessage());
}

public void testDoubleRangeQuery() {
MappedFieldType ft = new NumberFieldMapper.NumberFieldType("field", NumberFieldMapper.NumberType.DOUBLE);
Query expected = new IndexOrDocValuesQuery(
Expand Down

0 comments on commit 2cdeb08

Please sign in to comment.