Skip to content

Commit

Permalink
Bitmap freq aggs overflow to map, maintaining counts
Browse files Browse the repository at this point in the history
  • Loading branch information
mkavanagh committed Aug 5, 2020
1 parent 60499a5 commit 035db87
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* The response is a map with the following fields:
* - bitmaps: an array of bitmaps, where the frequency of a value x is given by the sum of {@code 2^i} for all values
* of {@code i} where {@code bitmaps[i].contains(x)}
* - overflow: a bitmap of ordinal values with {@code frequency >= 2^(bitmaps.length)}
* - overflow: a map of ordinal values to frequencies, for values with {@code frequency >= 2^(bitmaps.length)}
*
* Lacking a coherent definition of magnitude other than the raw count, this aggregate cannot be used for sorting.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,29 @@
package org.apache.solr.search.facet;

import java.util.HashMap;
import java.util.Map;

import org.apache.solr.common.util.SimpleOrderedMap;
import org.roaringbitmap.RoaringBatchIterator;
import org.roaringbitmap.RoaringBitmap;

/**
* Counts frequencies of ordinal values using Roaring Bitmaps.
*/
public class BitmapFrequencyCounter {
private final RoaringBitmap[] bitmaps;
private RoaringBitmap overflow;
private final Map<Integer, Integer> overflow;

/**
* Constructs a new frequency counter. The maximum countable frequency will be given by {@code (2^size)-1}.
* Constructs a new frequency counter. Frequencies greater than {@code (2^size)-1} will be represented as a HashMap
* (rather than a compact bitmap encoding), and for efficiency should not represent a large fraction of the distinct
* values to be counted.
*
* @param size The maximum size of the frequencies list
*/
public BitmapFrequencyCounter(int size) {
// One bitmap slot per binary digit of a frequency; the slots themselves are left null here.
this.bitmaps = new RoaringBitmap[size];
// Overflow starts out as an empty map rather than null.
this.overflow = new HashMap<>();
}

/**
Expand All @@ -30,11 +37,11 @@ public RoaringBitmap[] getBitmaps() {
}

/**
* The overflow set of all values with {@code frequency >= 2^(bitmaps.length)}.
* A map of high-frequency values (with {@code frequency >= 2^(bitmaps.length)}).
*
* @return The overflow set
* @return The map of high-frequency values.
*/
public RoaringBitmap getOverflow() {
public Map<Integer, Integer> getOverflow() {
return this.overflow;
}

Expand All @@ -44,6 +51,11 @@ public RoaringBitmap getOverflow() {
* @param value The value to add
*/
public void add(int value) {
final Integer overflowCount = overflow.computeIfPresent(value, (v, f) -> f + 1);
if (overflowCount != null) {
return;
}

// This is just binary addition x+1=y - we carry the value till we find an empty column
for (int i = 0; i < bitmaps.length; i++) {
RoaringBitmap bitmap = bitmaps[i];
Expand All @@ -61,11 +73,7 @@ public void add(int value) {

// If we reach this point, the frequency of this value is >= 2^(bitmaps.length)

if (overflow == null) {
overflow = new RoaringBitmap();
}

overflow.add(value);
overflow.put(value, 1 << bitmaps.length);
}

/**
Expand Down Expand Up @@ -95,9 +103,8 @@ public SimpleOrderedMap<Object> serialize() {
serialized.add("bitmaps", serializedBitmaps);
}

if (overflow != null) {
overflow.runOptimize();
serialized.add("overflow", BitmapUtil.bitmapToBytes(overflow));
if (!overflow.isEmpty()) {
serialized.add("overflow", overflow);
}

return serialized;
Expand All @@ -119,11 +126,9 @@ public void deserialize(SimpleOrderedMap<Object> serialized) {
}
}

byte[] overflow = (byte[]) serialized.get("overflow");
Map<Integer, Integer> overflow = (Map<Integer, Integer>) serialized.get("overflow");
if (overflow != null) {
this.overflow = BitmapUtil.bytesToBitmap(overflow);
} else {
this.overflow = null;
this.overflow.putAll(overflow);
}
}

Expand Down Expand Up @@ -214,10 +219,26 @@ public BitmapFrequencyCounter merge(BitmapFrequencyCounter other) {
}

if (i == bitmaps.length) {
if (overflow == null) {
overflow = c;
} else {
overflow.or(c);
other.overflow.forEach((value, freq) -> {
overflow.merge(value, freq, Integer::sum);
});

// Every value carried out of the highest bitmap column has a frequency of at least
// 2^(bitmaps.length); record its full count in the overflow map.
RoaringBatchIterator iter = c.getBatchIterator();
int[] batch = new int[128];
while (iter.hasNext()) {
  int batchSize = iter.nextBatch(batch);
  for (int j = 0; j < batchSize; j++) {
    int value = batch[j];
    int freq = 1 << bitmaps.length;

    // Add the low-order bits still recorded for this value. The column must be selected by k
    // (the bit position), not j (the position within the current batch): indexing by j reads the
    // wrong column and runs past the end of the array once j >= bitmaps.length.
    // Unpopulated (null) columns contain no values and contribute nothing.
    // NOTE(review): these low-order bits are not cleared from the bitmaps in this fragment -
    // confirm the rest of merge() removes them, otherwise the value is represented twice.
    for (int k = 0; k < bitmaps.length; k++) {
      if (bitmaps[k] != null && bitmaps[k].contains(value)) {
        freq += 1 << k;
      }
    }

    overflow.merge(value, freq, Integer::sum);
  }
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package org.apache.solr.search.facet;

import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.lucene.queries.function.ValueSource;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.search.FunctionQParser;
Expand All @@ -9,10 +12,8 @@
/**
* Calculates the frequency-of-frequencies (number of values occurring x times) of ordinal values.
*
* The response is a map with the following fields:
* - frequencies: an array where {@code frequencies[i]} is the number of values with {@code frequency = i} (omitted
* if empty)
* - overflow: the number of values with {@code frequency > frequencies.length}
* The response is a map where the keys are frequencies (x = number of times a value occurred), and the values are
* the frequency-of-frequencies (number of values which occurred x times).
*
* Lacking a coherent definition of magnitude other than the raw count, this aggregate cannot be used for sorting.
*/
Expand Down Expand Up @@ -75,10 +76,15 @@ public void finish(Context mcontext) {

@Override
public Object getMergedResult() {
SimpleOrderedMap<Object> map = new SimpleOrderedMap<>();
Map<Integer, Integer> map = new LinkedHashMap<>();

int[] lowFrequencies = result.decode();
for (int i = 0; i < lowFrequencies.length; i++) {
map.put(i, lowFrequencies[i]);
}

map.add("frequencies", result.decode());
map.add("overflow", result.getOverflow().getCardinality());
result.getOverflow()
.forEach((value, freq) -> map.merge(freq, 1, Integer::sum));

return map;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ public void givenSize0_whenAddingValue_withFrequency1() {
counter.add(TEST_ORDINAL);

assertEquals(counter.getBitmaps().length, 0);
assertTrue(counter.getOverflow().contains(TEST_ORDINAL));
assertEquals(counter.getOverflow().get(TEST_ORDINAL), Integer.valueOf(1));
}

@Test
Expand All @@ -29,7 +29,7 @@ public void givenSize0_whenAddingValue_withFrequency2() {
counter.add(TEST_ORDINAL);

assertEquals(counter.getBitmaps().length, 0);
assertTrue(counter.getOverflow().contains(TEST_ORDINAL));
assertEquals(counter.getOverflow().get(TEST_ORDINAL), Integer.valueOf(2));
}

@Test
Expand All @@ -40,7 +40,7 @@ public void givenSize1_whenAddingValue_withFrequency1() {

assertEquals(counter.getBitmaps().length, 1);
assertTrue(counter.getBitmaps()[0].contains(TEST_ORDINAL));
assertNull(counter.getOverflow());
assertTrue(counter.getOverflow().isEmpty());

int[] decoded = counter.decode();

Expand All @@ -58,7 +58,7 @@ public void givenSize1_whenAddingValue_withFrequency2() {

assertEquals(counter.getBitmaps().length, 1);
assertFalse(counter.getBitmaps()[0].contains(TEST_ORDINAL));
assertTrue(counter.getOverflow().contains(TEST_ORDINAL));
assertEquals(counter.getOverflow().get(TEST_ORDINAL), Integer.valueOf(2));

int[] decoded = counter.decode();

Expand All @@ -75,7 +75,7 @@ public void givenSize2_whenAddingValue_withFrequency1() {

assertEquals(counter.getBitmaps().length, 2);
assertTrue(counter.getBitmaps()[0].contains(TEST_ORDINAL));
assertNull(counter.getOverflow());
assertTrue(counter.getOverflow().isEmpty());

int[] decoded = counter.decode();

Expand All @@ -94,7 +94,7 @@ public void givenSize2_whenAddingValue_withFrequency2() {
assertEquals(counter.getBitmaps().length, 2);
assertFalse(counter.getBitmaps()[0].contains(TEST_ORDINAL));
assertTrue(counter.getBitmaps()[1].contains(TEST_ORDINAL));
assertNull(counter.getOverflow());
assertTrue(counter.getOverflow().isEmpty());

int[] decoded = counter.decode();

Expand All @@ -116,7 +116,7 @@ public void givenSize2_whenAddingValue_withFrequency3() {
assertEquals(counter.getBitmaps().length, 2);
assertTrue(counter.getBitmaps()[0].contains(TEST_ORDINAL));
assertTrue(counter.getBitmaps()[1].contains(TEST_ORDINAL));
assertNull(counter.getOverflow());
assertTrue(counter.getOverflow().isEmpty());

int[] decoded = counter.decode();

Expand All @@ -139,7 +139,7 @@ public void givenSize2_whenAddingValue_withFrequency4() {
assertEquals(counter.getBitmaps().length, 2);
assertFalse(counter.getBitmaps()[0].contains(TEST_ORDINAL));
assertFalse(counter.getBitmaps()[1].contains(TEST_ORDINAL));
assertTrue(counter.getOverflow().contains(TEST_ORDINAL));
assertEquals(counter.getOverflow().get(TEST_ORDINAL), Integer.valueOf(4));

int[] decoded = counter.decode();

Expand Down Expand Up @@ -188,7 +188,7 @@ public void givenSize2_whenAddingMultipleValues() {
assertTrue(counter.getBitmaps()[0].contains(303));
assertTrue(counter.getBitmaps()[1].contains(303));

assertNull(counter.getOverflow());
assertTrue(counter.getOverflow().isEmpty());

int[] decoded = counter.decode();

Expand All @@ -210,22 +210,22 @@ public void givenSize2_whenMergingValues() {
assertEquals(x.getBitmaps().length, 2);
assertFalse(x.getBitmaps()[0].contains(TEST_ORDINAL));
assertTrue(x.getBitmaps()[1].contains(TEST_ORDINAL));
assertNull(x.getOverflow());
assertTrue(x.getOverflow().isEmpty());

y.add(TEST_ORDINAL);
y.add(TEST_ORDINAL);

assertEquals(y.getBitmaps().length, 2);
assertFalse(y.getBitmaps()[0].contains(TEST_ORDINAL));
assertTrue(y.getBitmaps()[1].contains(TEST_ORDINAL));
assertNull(y.getOverflow());
assertTrue(y.getOverflow().isEmpty());

x = x.merge(y);

assertEquals(x.getBitmaps().length, 2);
assertFalse(x.getBitmaps()[0].contains(TEST_ORDINAL));
assertFalse(x.getBitmaps()[1].contains(TEST_ORDINAL));
assertTrue(x.getOverflow().contains(TEST_ORDINAL));
assertEquals(x.getOverflow().get(TEST_ORDINAL), Integer.valueOf(4));
}

@Test
Expand All @@ -249,7 +249,7 @@ public void givenSize4_whenMergingValues() {
assertTrue(x.getBitmaps()[1].contains(TEST_ORDINAL));
assertFalse(x.getBitmaps()[2].contains(TEST_ORDINAL));
assertTrue(x.getBitmaps()[3].contains(TEST_ORDINAL));
assertNull(x.getOverflow());
assertTrue(x.getOverflow().isEmpty());

y.add(TEST_ORDINAL);
y.add(TEST_ORDINAL);
Expand All @@ -262,7 +262,7 @@ public void givenSize4_whenMergingValues() {
assertFalse(y.getBitmaps()[1].contains(TEST_ORDINAL));
assertTrue(y.getBitmaps()[2].contains(TEST_ORDINAL));
assertNull(y.getBitmaps()[3]);
assertNull(y.getOverflow());
assertTrue(y.getOverflow().isEmpty());

x = x.merge(y);

Expand All @@ -271,7 +271,7 @@ public void givenSize4_whenMergingValues() {
assertTrue(x.getBitmaps()[1].contains(TEST_ORDINAL));
assertTrue(x.getBitmaps()[2].contains(TEST_ORDINAL));
assertTrue(x.getBitmaps()[3].contains(TEST_ORDINAL));
assertNull(y.getOverflow());
assertTrue(x.getOverflow().isEmpty());

int[] decoded = x.decode();

Expand Down

0 comments on commit 035db87

Please sign in to comment.