Skip to content

Commit

Permalink
Prototype bitmap frequency aggs
Browse files Browse the repository at this point in the history
  • Loading branch information
mkavanagh committed Jul 29, 2020
1 parent bd643fe commit d9f9739
Show file tree
Hide file tree
Showing 7 changed files with 478 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,16 @@
import org.apache.solr.search.facet.AggValueSource;
import org.apache.solr.search.facet.AvgAgg;
import org.apache.solr.search.facet.BitmapCollectorAgg;
import org.apache.solr.search.facet.BitmapFrequencyAgg;
import org.apache.solr.search.facet.CountAgg;
import org.apache.solr.search.facet.FrequencyOfFrequenciesAgg;
import org.apache.solr.search.facet.HLLAgg;
import org.apache.solr.search.facet.MinMaxAgg;
import org.apache.solr.search.facet.PercentileAgg;
import org.apache.solr.search.facet.RelatednessAgg;
import org.apache.solr.search.facet.StddevAgg;
import org.apache.solr.search.facet.SumAgg;
import org.apache.solr.search.facet.SumsqAgg;
import org.apache.solr.search.facet.RelatednessAgg;
import org.apache.solr.search.facet.TopDocsAgg;
import org.apache.solr.search.facet.UniqueAgg;
import org.apache.solr.search.facet.UniqueBlockAgg;
Expand Down Expand Up @@ -1059,6 +1061,10 @@ public ValueSource parse(FunctionQParser fp) throws SyntaxError {

addParser("agg_bitmapcollector", new BitmapCollectorAgg.Parser());

addParser("agg_bitmapfreq", new BitmapFrequencyAgg.Parser());

addParser("agg_bitmapfreqfreq", new FrequencyOfFrequenciesAgg.Parser());

addParser("childfield", new ChildFieldValueSourceParser());
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package org.apache.solr.search.facet;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
Expand Down Expand Up @@ -73,7 +71,7 @@ public Object getValue(int slotNum) {
byte[] serialised;
if (result[slotNum] != null) {
result[slotNum].runOptimize();
serialised = bitmapToBytes(result[slotNum]);
serialised = BitmapUtil.bitmapToBytes(result[slotNum]);
} else {
serialised = new byte[0];
}
Expand Down Expand Up @@ -116,20 +114,9 @@ public void finish(Context mcontext) {
public Object getMergedResult() {
combined.runOptimize();
SimpleOrderedMap map = new SimpleOrderedMap();
map.add(KEY, bitmapToBytes(combined));
map.add(KEY, BitmapUtil.bitmapToBytes(combined));
return map;
}
}

private static byte[] bitmapToBytes(MutableRoaringBitmap bitmap) {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(bos);
try {
bitmap.serialize(dos);
dos.close();
return bos.toByteArray();
} catch (IOException ioe) {
throw new RuntimeException("Failed to serialise RoaringBitmap to bytes", ioe);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package org.apache.solr.search.facet;

import org.apache.lucene.queries.function.ValueSource;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.FunctionQParser;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.search.ValueSourceParser;

/**
 * Calculates the frequency of ordinal values using Roaring Bitmaps.
 *
 * <p>The response is a map with the following fields:
 * <ul>
 *   <li>{@code bitmaps}: an array of bitmaps, where the frequency of a value x is given by the sum of {@code 2^i}
 *       for all values of {@code i} where {@code bitmaps[i].contains(x)}</li>
 *   <li>{@code overflow}: a bitmap of ordinal values with {@code frequency >= 2^(bitmaps.length)}</li>
 * </ul>
 *
 * <p>Lacking a coherent definition of magnitude other than the raw count, this aggregate cannot be used for sorting.
 */
public class BitmapFrequencyAgg extends SimpleAggValueSource {
  /** Number of frequency bitmaps used when the function call does not supply a size argument. */
  private static final int DEFAULT_SIZE = 16;

  /** Number of frequency bitmaps (i.e. bits per counted value) handed to every counter this aggregate creates. */
  private final int size;

  /**
   * Creates the aggregate.
   *
   * @param vs   the value source whose values are counted
   * @param size the number of frequency bitmaps to maintain before values spill into the overflow set
   */
  public BitmapFrequencyAgg(ValueSource vs, int size) {
    super("bitmapfreq", vs);

    this.size = size;
  }

  /**
   * Creates the per-slot accumulator for this aggregate.
   *
   * <p>{@code numDocs} is not used; the accumulator is sized by {@code numSlots} and the configured bitmap count.
   */
  @Override
  public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) {
    return new BitmapFrequencySlotAcc(getArg(), fcontext, numSlots, size);
  }

  /**
   * Creates the merger that combines serialized counters.
   *
   * <p>The {@code prototype} argument is ignored; the merger is sized from this aggregate's configuration.
   */
  @Override
  public FacetMerger createFacetMerger(Object prototype) {
    return new Merger(size);
  }

  /**
   * Parses {@code agg_bitmapfreq(<value source>[, <size>])}; the size defaults to {@value #DEFAULT_SIZE}.
   */
  public static class Parser extends ValueSourceParser {
    /**
     * @throws SyntaxError if the arguments cannot be parsed, or if a negative size is given
     */
    @Override
    public ValueSource parse(FunctionQParser fp) throws SyntaxError {
      ValueSource valueSource = fp.parseValueSource();

      int size = DEFAULT_SIZE;
      if (fp.hasMoreArguments()) {
        size = fp.parseInt();
        // The size is used as an array length; reject bad input here as a syntax error rather than letting it
        // surface later as a NegativeArraySizeException when the counter is allocated.
        if (size < 0) {
          throw new SyntaxError("bitmapfreq size must not be negative, but was " + size);
        }
      }

      return new BitmapFrequencyAgg(valueSource, size);
    }
  }

  /**
   * Merges serialized {@link BitmapFrequencyCounter}s by adding their frequencies together.
   */
  private static class Merger extends FacetMerger {
    private final int size;
    private BitmapFrequencyCounter result;

    public Merger(int size) {
      this.size = size;
      this.result = new BitmapFrequencyCounter(size);
    }

    /**
     * Folds one serialized counter into the running result. Results that are not a {@link SimpleOrderedMap} are
     * silently ignored.
     */
    @Override
    public void merge(Object facetResult, Context mcontext) {
      if (facetResult instanceof SimpleOrderedMap) {
        // Safe for our purposes: the counter only reads its own keys and casts each value individually.
        @SuppressWarnings("unchecked")
        SimpleOrderedMap<Object> serialized = (SimpleOrderedMap<Object>) facetResult;

        BitmapFrequencyCounter deserialized = new BitmapFrequencyCounter(size);
        deserialized.deserialize(serialized);

        // merge() may return either counter, so always keep the one it hands back.
        result = result.merge(deserialized);
      }
    }

    @Override
    public void finish(Context mcontext) {
      // never called
    }

    @Override
    public Object getMergedResult() {
      return result.serialize();
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
package org.apache.solr.search.facet;

import org.apache.solr.common.util.SimpleOrderedMap;
import org.roaringbitmap.RoaringBitmap;

/**
 * Counts frequencies of ordinal values using Roaring Bitmaps.
 *
 * <p>Each value's count is stored in binary across an array of bitmaps: bit {@code i} of the count of {@code x} is
 * set iff {@code bitmaps[i].contains(x)}. Bitmaps are allocated lazily from index 0 upwards, so the non-null entries
 * always form a prefix of the array. Values whose count no longer fits are recorded in a separate overflow set.
 */
public class BitmapFrequencyCounter {
  /** Bit-sliced counts; entry {@code i} holds the values whose count has bit {@code i} set. May contain nulls. */
  private final RoaringBitmap[] bitmaps;

  /** Values whose count has reached {@code 2^(bitmaps.length)}; null until the first overflow occurs. */
  private RoaringBitmap overflow;

  /**
   * Constructs a new frequency counter. The maximum countable frequency will be given by {@code (2^size)-1}.
   *
   * @param size The maximum size of the frequencies list
   */
  public BitmapFrequencyCounter(int size) {
    this.bitmaps = new RoaringBitmap[size];
  }

  /**
   * An array of bitmaps encoding frequencies of values: the frequency of a value x is given by the sum of {@code 2^i}
   * for all values of {@code i} where {@code bitmaps[i].contains(x)}.
   *
   * <p>The returned array is the counter's internal storage (not a copy) and may contain null entries for bit
   * positions that have not been needed yet.
   *
   * @return The encoded frequencies
   */
  public RoaringBitmap[] getBitmaps() {
    return this.bitmaps;
  }

  /**
   * The overflow set of all values with {@code frequency >= 2^(bitmaps.length)}.
   *
   * @return The overflow set, or null if no value has overflowed
   */
  public RoaringBitmap getOverflow() {
    return this.overflow;
  }

  /**
   * Adds one occurrence of the given value to the counter.
   *
   * @param value The value to add
   */
  public void add(int value) {
    // This is just binary addition x+1=y - we carry the value till we find an empty column
    for (int i = 0; i < bitmaps.length; i++) {
      RoaringBitmap bitmap = bitmaps[i];
      if (bitmap == null) {
        bitmap = bitmaps[i] = new RoaringBitmap();
      }

      if (!bitmap.contains(value)) {
        // Bit i was 0: set it and the addition is complete.
        bitmap.add(value);
        return;
      }

      // Bit i was 1: clear it and carry into the next column.
      bitmap.remove(value);
    }

    // If we reach this point, the frequency of this value is >= 2^(bitmaps.length)

    if (overflow == null) {
      overflow = new RoaringBitmap();
    }

    overflow.add(value);
  }

  /**
   * Serializes the counter.
   *
   * <p>The {@code "bitmaps"} entry is only present when at least one bitmap has been allocated; it is an array of
   * length {@code bitmaps.length} whose entries after the last allocated bitmap are null. The {@code "overflow"}
   * entry is only present when an overflow set exists.
   *
   * @return The serialized data
   */
  public SimpleOrderedMap<Object> serialize() {
    SimpleOrderedMap<Object> serialized = new SimpleOrderedMap<>();

    byte[][] serializedBitmaps = new byte[bitmaps.length][];

    int i = 0;
    while (i < bitmaps.length) {
      RoaringBitmap bitmap = bitmaps[i];
      if (bitmap == null) {
        // Bitmaps are allocated as a prefix, so nothing further has been allocated either.
        break;
      }

      bitmap.runOptimize();
      serializedBitmaps[i] = BitmapUtil.bitmapToBytes(bitmap);

      i++;
    }

    if (i > 0) {
      serialized.add("bitmaps", serializedBitmaps);
    }

    if (overflow != null) {
      overflow.runOptimize();
      serialized.add("overflow", BitmapUtil.bitmapToBytes(overflow));
    }

    return serialized;
  }

  /**
   * Populates the counter from the given serialized data.
   *
   * <p>The counter must be fresh (with no values previously added), and have the same size as the counter from
   * which the serialized data was generated.
   *
   * @param serialized The serialized data
   * @throws IllegalArgumentException if the data contains more bitmaps than this counter can hold
   */
  public void deserialize(SimpleOrderedMap<Object> serialized) {
    // NOTE(review): assumes the value is still a byte[][] on arrival; confirm the transport preserves that type.
    byte[][] serializedBitmaps = (byte[][]) serialized.get("bitmaps");
    if (serializedBitmaps != null) {
      for (int i = 0; i < serializedBitmaps.length; i++) {
        byte[] bytes = serializedBitmaps[i];
        if (bytes == null) {
          // serialize() leaves every slot after the last allocated bitmap null, so there is nothing more to read.
          break;
        }
        if (i >= bitmaps.length) {
          throw new IllegalArgumentException("Serialized counter holds more than " + bitmaps.length + " bitmaps");
        }

        bitmaps[i] = BitmapUtil.bytesToBitmap(bytes);
      }
    }

    byte[] serializedOverflow = (byte[]) serialized.get("overflow");
    if (serializedOverflow != null) {
      this.overflow = BitmapUtil.bytesToBitmap(serializedOverflow);
    } else {
      this.overflow = null;
    }
  }

  /**
   * Merges this counter with another (in-place).
   *
   * <p>The other counter must have the same size as this counter. After this operation, the returned counter will
   * contain the values from both counters with their frequencies added together, and references to either of the
   * original counters should be discarded (since either may now be invalid, and one will have been modified and
   * returned). The returned counter may share bitmap instances with the discarded one.
   *
   * @param other The counter to merge in
   * @return The merged counter
   * @throws IllegalArgumentException if the two counters have different sizes
   */
  public BitmapFrequencyCounter merge(BitmapFrequencyCounter other) {
    if (other.bitmaps.length != bitmaps.length) {
      throw new IllegalArgumentException(
          "Cannot merge counters of different sizes: " + bitmaps.length + " and " + other.bitmaps.length);
    }

    // If one side has no allocated bitmaps there is nothing to add up: the other side already is the sum, apart
    // from the overflow sets, which always have to be unioned. This also covers zero-sized counters.
    if (bitmaps.length == 0 || bitmaps[0] == null) {
      other.overflow = union(other.overflow, overflow);
      return other;
    }
    if (other.bitmaps[0] == null) {
      overflow = union(overflow, other.overflow);
      return this;
    }

    // The algorithm here is a ripple-carry adder in two dimensions, built from half-adders that are adapted from the
    // standard (where s is the sum, and c the carried value):
    //
    //   s = x xor y
    //   c = x and y
    //
    // to:
    //
    //   s = x xor y
    //   c = y andnot s
    //
    // which allows in-place modification of bitmaps (x modified into s, y modified into c).

    RoaringBitmap x = bitmaps[0];
    RoaringBitmap y = other.bitmaps[0];

    x.xor(y);    // x2 = x1 xor y1
    y.andNot(x); // y2 = y1 andnot x2

    RoaringBitmap c = y; // c1 = y2

    int i = 1;

    // Columns present in both counters: full adder (two half-adders plus an OR of their carries).
    while (i < bitmaps.length) {
      x = bitmaps[i];
      y = other.bitmaps[i];
      if (x == null || y == null) {
        break;
      }

      x.xor(y);    // x2 = x1 xor y1
      y.andNot(x); // y2 = y1 andnot x2
      x.xor(c);    // x3 = x2 xor c1

      c.andNot(x); // c2 = c1 andnot x3
      c.or(y);     // c3 = c2 or y2

      i++;
    }

    // Columns present only in this counter: propagate the carry through them.
    while (i < bitmaps.length) {
      x = bitmaps[i];
      if (x == null) {
        break;
      }

      x.xor(c);    // x2 = x1 xor c1
      c.andNot(x); // c2 = c1 andnot x2

      i++;
    }

    // Columns present only in the other counter: propagate the carry and adopt the resulting bitmaps.
    while (i < bitmaps.length) {
      x = other.bitmaps[i];
      if (x == null) {
        break;
      }

      x.xor(c);    // x2 = x1 xor c1
      c.andNot(x); // c2 = c1 andnot x2

      bitmaps[i] = x;

      i++;
    }

    if (i == bitmaps.length) {
      // The carry fell off the top column: those values have overflowed.
      overflow = union(overflow, c);
    } else if (!c.isEmpty()) {
      // Neither counter had a bitmap for column i, so the outstanding carry simply becomes that column. Dropping
      // it here would lose counts (e.g. 1 + 1 would come out as 0 instead of 2).
      bitmaps[i] = c;
    }

    // Values that had already overflowed in the other counter remain overflowed in the merged result.
    overflow = union(overflow, other.overflow);

    return this;
  }

  /**
   * Unions {@code addition} into {@code target} in place, treating null as the empty set.
   *
   * @return {@code target} after the union, or {@code addition} itself when {@code target} is null
   */
  private static RoaringBitmap union(RoaringBitmap target, RoaringBitmap addition) {
    if (addition == null) {
      return target;
    }
    if (target == null) {
      return addition;
    }

    target.or(addition);
    return target;
  }
}
Loading

0 comments on commit d9f9739

Please sign in to comment.