Skip to content

Commit

Permalink
Prototype bitmap frequency aggs
Browse files Browse the repository at this point in the history
  • Loading branch information
mkavanagh committed Jul 28, 2020
1 parent bd643fe commit 606a995
Show file tree
Hide file tree
Showing 7 changed files with 427 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,16 @@
import org.apache.solr.search.facet.AggValueSource;
import org.apache.solr.search.facet.AvgAgg;
import org.apache.solr.search.facet.BitmapCollectorAgg;
import org.apache.solr.search.facet.BitmapFrequencyAgg;
import org.apache.solr.search.facet.CountAgg;
import org.apache.solr.search.facet.FrequencyOfFrequencyAgg;
import org.apache.solr.search.facet.HLLAgg;
import org.apache.solr.search.facet.MinMaxAgg;
import org.apache.solr.search.facet.PercentileAgg;
import org.apache.solr.search.facet.RelatednessAgg;
import org.apache.solr.search.facet.StddevAgg;
import org.apache.solr.search.facet.SumAgg;
import org.apache.solr.search.facet.SumsqAgg;
import org.apache.solr.search.facet.RelatednessAgg;
import org.apache.solr.search.facet.TopDocsAgg;
import org.apache.solr.search.facet.UniqueAgg;
import org.apache.solr.search.facet.UniqueBlockAgg;
Expand Down Expand Up @@ -1059,6 +1061,10 @@ public ValueSource parse(FunctionQParser fp) throws SyntaxError {

addParser("agg_bitmapcollector", new BitmapCollectorAgg.Parser());

addParser("agg_bitmapfreq", new BitmapFrequencyAgg.Parser());

addParser("agg_bitmapfreqfreq", new FrequencyOfFrequencyAgg.Parser());

addParser("childfield", new ChildFieldValueSourceParser());
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
package org.apache.solr.search.facet;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
Expand Down Expand Up @@ -73,7 +71,7 @@ public Object getValue(int slotNum) {
byte[] serialised;
if (result[slotNum] != null) {
result[slotNum].runOptimize();
serialised = bitmapToBytes(result[slotNum]);
serialised = BitmapUtil.bitmapToBytes(result[slotNum]);
} else {
serialised = new byte[0];
}
Expand Down Expand Up @@ -116,20 +114,9 @@ public void finish(Context mcontext) {
public Object getMergedResult() {
combined.runOptimize();
SimpleOrderedMap map = new SimpleOrderedMap();
map.add(KEY, bitmapToBytes(combined));
map.add(KEY, BitmapUtil.bitmapToBytes(combined));
return map;
}
}

/**
 * Serialises the given bitmap into a freshly allocated byte array.
 *
 * @param bitmap the bitmap to write out via its own {@code serialize} method
 * @return the bytes produced by the bitmap's serialisation
 * @throws RuntimeException wrapping any {@link IOException} raised while writing
 */
private static byte[] bitmapToBytes(MutableRoaringBitmap bitmap) {
  final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  try {
    final DataOutputStream out = new DataOutputStream(buffer);
    bitmap.serialize(out);
    out.close();
  } catch (IOException ioe) {
    throw new RuntimeException("Failed to serialise RoaringBitmap to bytes", ioe);
  }
  // Reached only when serialisation and close both succeeded.
  return buffer.toByteArray();
}
}
159 changes: 159 additions & 0 deletions solr/core/src/java/org/apache/solr/search/facet/BitmapFrequencies.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
package org.apache.solr.search.facet;

import java.util.ArrayList;
import java.util.List;

import org.apache.solr.common.util.SimpleOrderedMap;
import org.roaringbitmap.RoaringBitmap;

/**
 * Tracks how many times each int value has been added, using one Roaring bitmap per occurrence count.
 *
 * <p>Invariant: every value that has been added lives in <em>exactly one</em> place - either in a single
 * bucket of {@link #getFrequencies()} or in {@link #getOverflow()}:
 * <ul>
 *   <li>{@code getFrequencies().get(i)} holds the values that have occurred exactly {@code i + 1} times
 *       (the first occurrence of a value puts it in bucket 0);</li>
 *   <li>the overflow bitmap holds the values that have occurred more than {@code maxFrequency} times. It is
 *       only ever populated when a maximum applies, or when it was received from a serialized/merged
 *       instance.</li>
 * </ul>
 *
 * <p>Both {@link #add(int)} and {@link #merge(BitmapFrequencies)} maintain this invariant; the previous
 * implementation let a value seen three or more times appear in several buckets at once and mis-added
 * counts when merging.
 */
public class BitmapFrequencies {
  // Bucket i = values seen exactly (i + 1) times. Buckets may be empty but are kept to preserve positions.
  private final List<RoaringBitmap> frequencies;
  // Highest count tracked in its own bucket, or null when counts are unbounded.
  private final Integer maxFrequency;
  // Values seen more than maxFrequency times; stays null until something overflows.
  private RoaringBitmap overflow;

  /** Creates an empty instance with no upper bound on the tracked count. */
  public BitmapFrequencies() {
    this.frequencies = new ArrayList<>();
    this.maxFrequency = null;
  }

  /**
   * Creates an empty instance which tracks counts {@code 1..maxFrequency} individually and lumps every
   * value seen more often than that into the overflow bitmap.
   *
   * @param maxFrequency the highest count that gets its own bucket
   * @throws IllegalArgumentException if {@code maxFrequency} is negative (raised by {@link ArrayList})
   */
  public BitmapFrequencies(int maxFrequency) {
    this.frequencies = new ArrayList<>(maxFrequency);
    this.maxFrequency = maxFrequency;
  }

  /**
   * Rebuilds an instance from the map produced by {@link #serialize()}. The result has no maximum
   * frequency of its own; it is intended to be merged into another instance.
   *
   * @param serialized map with an optional {@code "frequencies"} entry (iterable of serialized bitmaps) and
   *                   an optional {@code "overflow"} entry (one serialized bitmap)
   */
  public BitmapFrequencies(SimpleOrderedMap<Object> serialized) {
    this();

    // The map is untyped on the wire; serialize() always stores an iterable of byte[] under this key.
    @SuppressWarnings("unchecked")
    Iterable<byte[]> serializedFrequencies = (Iterable<byte[]>) serialized.get("frequencies");
    if (serializedFrequencies != null) {
      for (byte[] bytes : serializedFrequencies) {
        this.frequencies.add(BitmapUtil.bytesToBitmap(bytes));
      }
    }

    byte[] serializedOverflow = (byte[]) serialized.get("overflow");
    if (serializedOverflow != null) {
      this.overflow = BitmapUtil.bytesToBitmap(serializedOverflow);
    }
  }

  /**
   * @return the live list of per-count buckets (bucket {@code i} = values seen {@code i + 1} times).
   *         The list is not copied, so callers must treat it as read-only.
   */
  public List<RoaringBitmap> getFrequencies() {
    return this.frequencies;
  }

  /** @return the bitmap of values whose count exceeded the maximum, or {@code null} if there are none. */
  public RoaringBitmap getOverflow() {
    return this.overflow;
  }

  /**
   * Records one more occurrence of {@code value}, moving it from its current bucket to the next one
   * (or into the overflow bitmap once the maximum count is exceeded).
   *
   * @param value the value that occurred
   */
  public void add(int value) {
    // Once a value has overflowed its exact count is no longer tracked; it must not re-enter bucket 0.
    if (overflow != null && overflow.contains(value)) {
      return;
    }

    for (int i = 0; i < frequencies.size(); i++) {
      RoaringBitmap bucket = frequencies.get(i);
      if (bucket.contains(value)) {
        // Promote: the value leaves the bucket for count (i + 1) and joins the one for count (i + 2).
        bucket.remove(value);
        place(value, i + 1);
        return;
      }
    }

    // Not present anywhere: this is the first occurrence.
    place(value, 0);
  }

  /**
   * Serializes the current state.
   *
   * @return a map containing {@code "frequencies"} (list of serialized buckets, omitted when there are no
   *         buckets) and {@code "overflow"} (serialized overflow bitmap, omitted when absent)
   */
  public SimpleOrderedMap<Object> serialize() {
    SimpleOrderedMap<Object> map = new SimpleOrderedMap<>();

    if (!frequencies.isEmpty()) {
      List<byte[]> serialized = new ArrayList<>(frequencies.size());
      for (RoaringBitmap bitmap : frequencies) {
        bitmap.runOptimize();
        serialized.add(BitmapUtil.bitmapToBytes(bitmap));
      }
      map.add("frequencies", serialized);
    }

    if (overflow != null) {
      overflow.runOptimize();
      map.add("overflow", BitmapUtil.bitmapToBytes(overflow));
    }

    return map;
  }

  /**
   * Merges (in place) the counts from another sample into this one, adding the two counts of every value
   * that occurs on both sides.
   *
   * <p>A value seen {@code a} times here and {@code b} times in {@code other} ends up in the bucket for
   * {@code a + b}; a value present on one side only keeps its count; anything whose combined count exceeds
   * this instance's maximum, or which had already overflowed on either side, ends up in the overflow
   * bitmap. {@code other} is only read, never modified.
   *
   * <p>Cost is proportional to the product of the two bucket counts when the samples share values.
   *
   * @param other the sample to fold into this one
   */
  public void merge(BitmapFrequencies other) {
    RoaringBitmap allMine = everything(this);
    RoaringBitmap allTheirs = everything(other);

    List<RoaringBitmap> merged = new ArrayList<>(Math.max(frequencies.size(), other.frequencies.size()));
    RoaringBitmap spill = new RoaringBitmap();
    if (overflow != null) {
      spill.or(overflow);
    }
    if (other.overflow != null) {
      spill.or(other.overflow);
    }

    // Values seen on one side only keep their count. Values that overflowed on the opposite side are
    // excluded here because allMine/allTheirs include the overflow bitmaps.
    for (int i = 0; i < frequencies.size(); i++) {
      route(merged, spill, i, RoaringBitmap.andNot(frequencies.get(i), allTheirs));
    }
    for (int j = 0; j < other.frequencies.size(); j++) {
      route(merged, spill, j, RoaringBitmap.andNot(other.frequencies.get(j), allMine));
    }

    // Values bucketed on both sides: counts (i + 1) and (j + 1) add up to (i + j + 2), i.e. bucket i + j + 1.
    RoaringBitmap shared = RoaringBitmap.and(allMine, allTheirs);
    if (!shared.isEmpty()) {
      for (int i = 0; i < frequencies.size(); i++) {
        RoaringBitmap mineShared = RoaringBitmap.and(frequencies.get(i), shared);
        if (mineShared.isEmpty()) {
          continue;
        }
        for (int j = 0; j < other.frequencies.size(); j++) {
          route(merged, spill, i + j + 1, RoaringBitmap.and(mineShared, other.frequencies.get(j)));
        }
      }
    }

    // Swap the contents rather than the list so references handed out by getFrequencies() stay valid.
    frequencies.clear();
    frequencies.addAll(merged);
    overflow = spill.isEmpty() ? null : spill;
  }

  // Puts a single value into the bucket at the given index, or into overflow when the index is past the maximum.
  private void place(int value, int index) {
    if (maxFrequency != null && index >= maxFrequency) {
      if (overflow == null) {
        overflow = RoaringBitmap.bitmapOf(value);
      } else {
        overflow.add(value);
      }
    } else if (index < frequencies.size()) {
      frequencies.get(index).add(value);
    } else {
      // add() only ever asks for the bucket directly after the last existing one.
      frequencies.add(RoaringBitmap.bitmapOf(value));
    }
  }

  // Adds a set of values to the target bucket (creating empty buckets up to it), or to spill when past the maximum.
  private void route(List<RoaringBitmap> buckets, RoaringBitmap spill, int index, RoaringBitmap values) {
    if (values.isEmpty()) {
      return;
    }
    if (maxFrequency != null && index >= maxFrequency) {
      spill.or(values);
      return;
    }
    while (buckets.size() <= index) {
      buckets.add(new RoaringBitmap());
    }
    buckets.get(index).or(values);
  }

  // Returns a new bitmap holding every value known to the given instance, buckets and overflow alike.
  private static RoaringBitmap everything(BitmapFrequencies source) {
    RoaringBitmap union = new RoaringBitmap();
    for (RoaringBitmap bucket : source.frequencies) {
      union.or(bucket);
    }
    if (source.overflow != null) {
      union.or(source.overflow);
    }
    return union;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package org.apache.solr.search.facet;

import org.apache.lucene.queries.function.ValueSource;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.FunctionQParser;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.search.ValueSourceParser;

/**
 * Calculates frequencies of ordinal values using bitmaps (up to an optional maximum frequency).
 *
 * <p>Response:
 * <ul>
 *   <li>{@code frequencies}: an array (omitted if empty) where frequencies[i] is a Roaring Bitmap of the
 *       ordinal values which occurred with frequency i</li>
 *   <li>{@code overflow}: a Roaring Bitmap (omitted if empty) of ordinal values with frequency greater than
 *       the supplied maximum</li>
 * </ul>
 */
public class BitmapFrequencyAgg extends SimpleAggValueSource {
  // Optional cap handed through to the accumulator and the merger; null means "no cap supplied".
  private final Integer maxFrequency;

  /**
   * @param vs           the value source whose values are counted
   * @param maxFrequency optional maximum frequency, or {@code null} when none was given
   */
  public BitmapFrequencyAgg(ValueSource vs, Integer maxFrequency) {
    super("bitmapfrequency", vs);

    this.maxFrequency = maxFrequency;
  }

  /** Builds the per-slot accumulator, forwarding this aggregation's argument and maximum frequency. */
  @Override
  public SlotAcc createSlotAcc(FacetContext fcontext, int numDocs, int numSlots) {
    return new BitmapFrequencySlotAcc(getArg(), fcontext, numSlots, maxFrequency);
  }

  /** Builds the merger that combines serialized results, bounded by the maximum frequency when present. */
  @Override
  public FacetMerger createFacetMerger(Object prototype) {
    return (maxFrequency != null)
        ? new BitmapFrequencyFacetMerger(maxFrequency)
        : new BitmapFrequencyFacetMerger();
  }

  /** Parses {@code (valueSource[, maxFrequency])} into a {@link BitmapFrequencyAgg}. */
  public static class Parser extends ValueSourceParser {
    @Override
    public ValueSource parse(FunctionQParser fp) throws SyntaxError {
      final ValueSource source = fp.parseValueSource();
      // The second argument is optional; leave the limit unset when it is missing.
      final Integer limit = fp.hasMoreArguments() ? Integer.valueOf(fp.parseInt()) : null;
      return new BitmapFrequencyAgg(source, limit);
    }
  }

  /** Accumulates serialized {@link BitmapFrequencies} results into a single combined instance. */
  private static class BitmapFrequencyFacetMerger extends FacetMerger {
    private final BitmapFrequencies result;

    public BitmapFrequencyFacetMerger() {
      this.result = new BitmapFrequencies();
    }

    public BitmapFrequencyFacetMerger(int maxFrequency) {
      this.result = new BitmapFrequencies(maxFrequency);
    }

    @Override
    @SuppressWarnings("unchecked")
    public void merge(Object facetResult, Context mcontext) {
      // Anything that is not a map (for instance the empty list emitted for an untouched slot) is ignored.
      if (!(facetResult instanceof SimpleOrderedMap)) {
        return;
      }
      final SimpleOrderedMap<Object> incoming = (SimpleOrderedMap<Object>) facetResult;
      result.merge(new BitmapFrequencies(incoming));
    }

    @Override
    public void finish(Context mcontext) {
      // never called
    }

    @Override
    public Object getMergedResult() {
      return result.serialize();
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package org.apache.solr.search.facet;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.function.IntFunction;

import org.apache.lucene.queries.function.ValueSource;

/**
 * Slot accumulator that keeps one {@link BitmapFrequencies} per slot and feeds it the int value of every
 * collected document.
 */
public class BitmapFrequencySlotAcc extends FuncSlotAcc {
  // One lazily created accumulator per slot; null means nothing has been collected for that slot.
  private BitmapFrequencies[] result;
  // Optional cap passed to each new BitmapFrequencies; null means no cap.
  private final Integer maxFrequency;

  /**
   * @param values       value source handed to the superclass
   * @param fcontext     facet context handed to the superclass
   * @param numSlots     initial number of slots
   * @param maxFrequency optional maximum frequency, or {@code null}
   */
  public BitmapFrequencySlotAcc(ValueSource values, FacetContext fcontext, int numSlots, Integer maxFrequency) {
    super(values, fcontext, numSlots);

    this.maxFrequency = maxFrequency;
    this.result = new BitmapFrequencies[numSlots];
  }

  /** Adds the document's int value to the slot's accumulator, creating the accumulator on first use. */
  @Override
  public void collect(int doc, int slot, IntFunction<SlotContext> slotContext) throws IOException {
    BitmapFrequencies slotFrequencies = result[slot];
    if (slotFrequencies == null) {
      slotFrequencies = (this.maxFrequency == null)
          ? new BitmapFrequencies()
          : new BitmapFrequencies(this.maxFrequency);
      result[slot] = slotFrequencies;
    }
    // `values` is not declared in this class - presumably the per-segment function values from FuncSlotAcc.
    slotFrequencies.add(values.intVal(doc));
  }

  /** Sorting by this aggregation is not supported. */
  @Override
  public int compare(int slotA, int slotB) {
    throw new UnsupportedOperationException();
  }

  /** Returns the slot's serialized frequencies, or an empty list when nothing was collected for it. */
  @Override
  public Object getValue(int slotNum) {
    final BitmapFrequencies slotFrequencies = result[slotNum];
    if (slotFrequencies == null) {
      return Collections.emptyList();
    }
    return slotFrequencies.serialize();
  }

  /** Drops every per-slot accumulator. */
  @Override
  public void reset() {
    Arrays.fill(result, null);
  }

  /** Lets the resizer rebuild the slot array, using null for slots that have no accumulator. */
  @Override
  public void resize(Resizer resizer) {
    result = resizer.resize(result, null);
  }
}
32 changes: 32 additions & 0 deletions solr/core/src/java/org/apache/solr/search/facet/BitmapUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package org.apache.solr.search.facet;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import org.roaringbitmap.ImmutableBitmapDataProvider;
import org.roaringbitmap.RoaringBitmap;

/** Helpers for converting Roaring bitmaps to and from byte arrays. */
public class BitmapUtil {
  /**
   * Writes the bitmap to an in-memory buffer through its own {@code serialize} method.
   *
   * @param bitmap the bitmap to serialise
   * @return the serialised bytes
   * @throws RuntimeException wrapping any {@link IOException} raised while writing
   */
  public static byte[] bitmapToBytes(ImmutableBitmapDataProvider bitmap) {
    try (ByteArrayOutputStream buffer = new ByteArrayOutputStream();
         DataOutputStream out = new DataOutputStream(buffer)) {
      bitmap.serialize(out);
      // Everything has been handed to the buffer at this point; try-with-resources performs the close.
      out.flush();
      return buffer.toByteArray();
    } catch (IOException ioe) {
      throw new RuntimeException("Failed to serialise RoaringBitmap to bytes", ioe);
    }
  }

  /**
   * Reads a bitmap back from bytes using the bitmap's own {@code deserialize} method.
   *
   * @param bytes the serialised form of a bitmap
   * @return a new bitmap populated from {@code bytes}
   * @throws RuntimeException wrapping any {@link IOException} raised while reading
   */
  public static RoaringBitmap bytesToBitmap(byte[] bytes) {
    final RoaringBitmap decoded = new RoaringBitmap();
    try (ByteArrayInputStream source = new ByteArrayInputStream(bytes);
         DataInputStream in = new DataInputStream(source)) {
      decoded.deserialize(in);
    } catch (IOException ioe) {
      throw new RuntimeException("Failed to deserialise RoaringBitmap from bytes", ioe);
    }
    return decoded;
  }
}
Loading

0 comments on commit 606a995

Please sign in to comment.