Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce duplication in taxonomy facets; always do counts #12966

Merged
merged 2 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,9 @@ Improvements
* GITHUB#13202: Early terminate graph and exact searches of AbstractKnnVectorQuery to follow timeout set from
IndexSearcher#setTimeout(QueryTimeout). (Kaival Parikh)

* GITHUB#12966: Move most of the responsibility from TaxonomyFacets implementations to TaxonomyFacets itself.
This reduces code duplication and enables future development. (Stefan Vodita)

Optimizations
---------------------

Expand Down Expand Up @@ -285,6 +288,9 @@ Optimizations
* GITHUB#13149: Made PointRangeQuery faster, for some segment sizes, by reducing the amount of virtual calls to
IntersectVisitor::visit(int). (Anton Hägerstrand)

* GITHUB#12966: FloatTaxonomyFacets can now collect values into a sparse structure, like IntTaxonomyFacets already
could. (Stefan Vodita)

Bug Fixes
---------------------

Expand All @@ -305,6 +311,8 @@ Bug Fixes

* GITHUB#13206: Subtract deleted file size from the cache size of NRTCachingDirectory. (Jean-François Boeuf)

* GITHUB#12966: Aggregation facets no longer assume that aggregation values are positive. (Stefan Vodita)

Build
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I

topN = Math.min(topN, cardinality);
TopOrdAndIntQueue q = null;
TopOrdAndIntQueue.OrdAndValue reuse = null;
TopOrdAndIntQueue.OrdAndInt reuse = null;
int bottomCount = 0;
int bottomOrd = Integer.MAX_VALUE;
int childCount = 0; // total number of labels with non-zero count
Expand All @@ -191,18 +191,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
int ord = cursor.key;
int count = cursor.value;
if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
if (q == null) {
// Lazy init for sparse case:
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value;
bottomOrd = q.top().ord;
}
}
Expand All @@ -213,18 +213,18 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
if (count != 0) {
childCount++;
if (count > bottomCount || (count == bottomCount && i < bottomOrd)) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = i;
reuse.value = count;
if (q == null) {
// Lazy init for sparse case:
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = i;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value;
bottomOrd = q.top().ord;
}
}
Expand All @@ -235,7 +235,7 @@ public FacetResult getTopChildren(int topN, String dim, String... path) throws I
int resultCount = q == null ? 0 : q.size();
LabelAndValue[] labelValues = new LabelAndValue[resultCount];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop();
final BytesRef term = docValues.lookupOrd(ordAndValue.ord);
labelValues[i] = new LabelAndValue(term.utf8ToString(), ordAndValue.value);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,42 @@
*/
package org.apache.lucene.facet;

import org.apache.lucene.util.PriorityQueue;
/** Keeps highest results, first by largest float value, then tie-break by smallest ord. */
public class TopOrdAndFloatQueue extends TopOrdAndNumberQueue {

/** Keeps highest results, first by largest float value, then tie break by smallest ord. */
public class TopOrdAndFloatQueue extends PriorityQueue<TopOrdAndFloatQueue.OrdAndValue> {

/** Holds a single entry. */
public static final class OrdAndValue {

/** Ordinal of the entry. */
public int ord;
/** Sole constructor. */
public TopOrdAndFloatQueue(int topN) {
super(topN);
}

/** Value associated with the ordinal. */
/** Holds an ordinal and a float value. */
public static final class OrdAndFloat extends OrdAndValue {
/** The value corresponding to the ordinal is a float. */
public float value;

/** Default constructor. */
public OrdAndValue() {}
}
public OrdAndFloat() {}

@Override
public boolean lessThan(OrdAndValue other) {
OrdAndFloat otherOrdAndFloat = (OrdAndFloat) other;
if (value < otherOrdAndFloat.value) {
return true;
}
if (value > otherOrdAndFloat.value) {
return false;
}
return ord > otherOrdAndFloat.ord;
}

/** Sole constructor. */
public TopOrdAndFloatQueue(int topN) {
super(topN);
@Override
public Number getValue() {
return value;
}
}

@Override
protected boolean lessThan(OrdAndValue a, OrdAndValue b) {
if (a.value < b.value) {
return true;
} else if (a.value > b.value) {
return false;
} else {
return a.ord > b.ord;
}
public OrdAndValue newOrdAndValue() {
return new OrdAndFloat();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,37 +16,42 @@
*/
package org.apache.lucene.facet;

import org.apache.lucene.util.PriorityQueue;
/** Keeps highest results, first by largest int value, then tie-break by smallest ord. */
public class TopOrdAndIntQueue extends TopOrdAndNumberQueue {

/** Keeps highest results, first by largest int value, then tie break by smallest ord. */
public class TopOrdAndIntQueue extends PriorityQueue<TopOrdAndIntQueue.OrdAndValue> {

/** Holds a single entry. */
public static final class OrdAndValue {

/** Ordinal of the entry. */
public int ord;
/** Sole constructor. */
public TopOrdAndIntQueue(int topN) {
super(topN);
}

/** Value associated with the ordinal. */
/** Holds an ordinal and an int value. */
public static final class OrdAndInt extends OrdAndValue {
/** The value corresponding to the ordinal is an int. */
public int value;

/** Default constructor. */
public OrdAndValue() {}
}
public OrdAndInt() {}

@Override
public boolean lessThan(OrdAndValue other) {
OrdAndInt otherOrdAndInt = (OrdAndInt) other;
if (value < otherOrdAndInt.value) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might use Integer.compare here -- not sure if it's actually faster. You'd still need to get the result and check if it's != 0 for the tiebreak (which could also be Integer.compare).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is how Integer.compare is implemented:

     public static int compare(int x, int y) {
        return (x < y) ? -1 : ((x == y) ? 0 : 1);
     }

And lessThan would become:

    public boolean lessThan(OrdAndValue other) {
      OrdAndInt otherOrdAndInt = (OrdAndInt) other;
      int cmp = Integer.compare(value, otherOrdAndInt.value);
      if (cmp == 0) {
        cmp = Integer.compare(otherOrdAndInt.value, ord);
      }
      return cmp < 0;
    }

I think we end up doing more comparisons overall? I might be missing something though.

return true;
}
if (value > otherOrdAndInt.value) {
return false;
}
return ord > otherOrdAndInt.ord;
}

/** Sole constructor. */
public TopOrdAndIntQueue(int topN) {
super(topN);
@Override
public Number getValue() {
return value;
}
}

@Override
protected boolean lessThan(OrdAndValue a, OrdAndValue b) {
if (a.value < b.value) {
return true;
} else if (a.value > b.value) {
return false;
} else {
return a.ord > b.ord;
}
public OrdAndValue newOrdAndValue() {
return new OrdAndInt();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;

import org.apache.lucene.util.PriorityQueue;

/** Keeps highest results, first by largest value, then tie-break by smallest ord. */
public abstract class TopOrdAndNumberQueue extends PriorityQueue<TopOrdAndNumberQueue.OrdAndValue> {

/** Holds a single entry. */
public abstract static class OrdAndValue {

/** Ordinal of the entry. */
public int ord;

/** Default constructor. */
public OrdAndValue() {}

/** Compare with another {@link OrdAndValue}. */
public abstract boolean lessThan(OrdAndValue other);

/** Get the value stored in this {@link OrdAndValue}. */
public abstract Number getValue();
}

/** Sole constructor. */
public TopOrdAndNumberQueue(int topN) {
super(topN);
}

@Override
public boolean lessThan(TopOrdAndNumberQueue.OrdAndValue a, TopOrdAndNumberQueue.OrdAndValue b) {
return a.lessThan(b);
}

/**
* Create a new {@link org.apache.lucene.facet.TopOrdAndNumberQueue.OrdAndValue} of the
* appropriate type.
*/
public abstract OrdAndValue newOrdAndValue();
}
Original file line number Diff line number Diff line change
Expand Up @@ -327,28 +327,28 @@ private TopChildrenForPath computeTopChildren(
int pathCount = 0;
int childCount = 0;

TopOrdAndIntQueue.OrdAndValue reuse = null;
TopOrdAndIntQueue.OrdAndInt reuse = null;
while (childOrds.hasNext()) {
int ord = childOrds.next();
int count = getCount(ord);
if (count > 0) {
pathCount += count;
childCount++;
if (count > bottomCount || (count == bottomCount && ord < bottomOrd)) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
if (q == null) {
// Lazy init, so we don't create this for the
// sparse case unnecessarily
q = new TopOrdAndIntQueue(topN);
}
reuse = q.insertWithOverflow(reuse);
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
bottomOrd = q.top().value;
bottomCount = ((TopOrdAndIntQueue.OrdAndInt) q.top()).value;
bottomOrd = q.top().ord;
}
}
}
Expand Down Expand Up @@ -396,7 +396,7 @@ private FacetResult createFacetResult(

LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop();
assert ordAndValue != null;
final BytesRef term = dv.lookupOrd(ordAndValue.ord);
String[] parts = FacetsConfig.stringToPath(term.utf8ToString());
Expand Down
Loading