Skip to content

Commit

Permalink
Add comment and more UT for adaptive BloomFilter
Browse files Browse the repository at this point in the history
Signed-off-by: Chen Dai <[email protected]>
  • Loading branch information
dai-chen committed Mar 12, 2024
1 parent 4104789 commit 3df4893
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ public class AdaptiveBloomFilter implements BloomFilter {
/**
* Initial expected number of items for the first candidate.
*/
private static final int INITIAL_EXPECTED_NUM_ITEMS = 1024;
public static final int INITIAL_EXPECTED_NUM_ITEMS = 1024;

/**
* Total number of distinct items seen so far.
Expand All @@ -36,7 +36,7 @@ public class AdaptiveBloomFilter implements BloomFilter {
/**
* BloomFilter candidates.
*/
final BloomFilterCandidate[] candidates;
private final BloomFilterCandidate[] candidates;

/**
* Construct adaptive BloomFilter instance with the given algorithm parameters.
Expand All @@ -45,11 +45,12 @@ public class AdaptiveBloomFilter implements BloomFilter {
* @param fpp false positive probability
*/
public AdaptiveBloomFilter(int numCandidates, double fpp) {
this.candidates = initializeCandidates(numCandidates, expectedNumItems -> new ClassicBloomFilter(expectedNumItems, fpp));
this.candidates = initializeCandidates(numCandidates,
expectedNumItems -> new ClassicBloomFilter(expectedNumItems, fpp));
}

/**
* Construct adaptive BloomFilter instance from deserialized content.
* Construct adaptive BloomFilter instance from BloomFilter array deserialized from input stream.
*
* @param cardinality total number of distinct items
* @param candidates BloomFilter candidates
Expand Down Expand Up @@ -117,14 +118,15 @@ public BloomFilter merge(BloomFilter other) {
AdaptiveBloomFilter otherBf = (AdaptiveBloomFilter) other;
cardinality += otherBf.cardinality;

for (int i = 0; i < candidates.length; i++) {
for (int i = bestCandidateIndex(); i < candidates.length; i++) {
candidates[i].bloomFilter.merge(otherBf.candidates[i].bloomFilter);
}
return this;
}

/**
 * Test membership of the given item by delegating to the last BloomFilter candidate.
 *
 * @param item item to look up
 * @return whatever the last candidate's BloomFilter reports for the item
 */
@Override
public boolean mightContain(long item) {
    // The final candidate is the most accurate one, so it alone answers the query
    BloomFilterCandidate lastCandidate = candidates[candidates.length - 1];
    return lastCandidate.bloomFilter.mightContain(item);
}

Expand Down Expand Up @@ -156,12 +158,28 @@ private int bestCandidateIndex() {
if (index < 0) {
index = -(index + 1);
}

/*
* Now 'index' represents the position where the current cardinality should be inserted,
* indicating the best candidate to choose based on its expected number of distinct values.
* The last one is chosen if cardinality exceeds each candidate's expected number.
*/
return Math.min(index, candidates.length - 1);
}

/**
* BloomFilter candidate that records expected number of items for each candidate.
*/
public static class BloomFilterCandidate implements Comparable<BloomFilterCandidate> {
int expectedNumItems;
BloomFilter bloomFilter;
/**
* Expected number of items associated with this candidate.
*/
private final int expectedNumItems;

/**
* BloomFilter instance.
*/
private final BloomFilter bloomFilter;

BloomFilterCandidate(int expectedNumItems, BloomFilter bloomFilter) {
this.expectedNumItems = expectedNumItems;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,34 +20,48 @@ public class AdaptiveBloomFilterTest {

private final int numCandidates = 5;

private final AdaptiveBloomFilter bloomFilter = new AdaptiveBloomFilter(numCandidates, 0.03);
private final double fpp = 0.03;

private final AdaptiveBloomFilter bloomFilter = new AdaptiveBloomFilter(numCandidates, fpp);

@Test
public void shouldChooseBestCandidateAdaptively() {
// Insert 500 items should choose 1st candidate
for (int i = 0; i < 500; i++) {
bloomFilter.put(i);
}
assertEquals(1024, bloomFilter.bestCandidate().expectedNumItems);
assertEquals(1024, bloomFilter.bestCandidate().getExpectedNumItems());

// Insert 1000 (total 1500) should choose 2nd candidate
// Insert 1000 (total 1500) items should choose 2nd candidate
for (int i = 500; i < 1500; i++) {
bloomFilter.put(i);
}
assertEquals(2048, bloomFilter.bestCandidate().expectedNumItems);
assertEquals(2048, bloomFilter.bestCandidate().getExpectedNumItems());

// Insert 4000 (total 5500) should choose 4th candidate
// Insert 4000 (total 5500) items should choose 4th candidate
for (int i = 1500; i < 5500; i++) {
bloomFilter.put(i);
}
assertEquals(8192, bloomFilter.bestCandidate().expectedNumItems);
assertEquals(8192, bloomFilter.bestCandidate().getExpectedNumItems());
}

@Test
public void shouldChooseLastCandidateForLargeCardinality() {
    // 20000 items is more than the last candidate's expected NDV of 16384
    final int numItems = 20000;
    int item = 0;
    while (item < numItems) {
        bloomFilter.put(item);
        item++;
    }

    // Cardinality exceeds every candidate's expected number, so the last one must be chosen
    assertEquals(16384, bloomFilter.bestCandidate().getExpectedNumItems());
}

@Test
public void shouldBeTheSameAfterWriteToAndReadFrom() throws IOException {
bloomFilter.put(123L);
bloomFilter.put(456L);
bloomFilter.put(789L);
// Insert some items to verify each candidate below
for (int i = 0; i < 10000; i++) {
bloomFilter.put(i);
}

// Serialize and deserialize and assert the equality
ByteArrayOutputStream out = new ByteArrayOutputStream();
Expand All @@ -59,7 +73,7 @@ public void shouldBeTheSameAfterWriteToAndReadFrom() throws IOException {

@Test
public void shouldMergeTwoFiltersCorrectly() {
AdaptiveBloomFilter bloomFilter2 = new AdaptiveBloomFilter(numCandidates, 0.03);
AdaptiveBloomFilter bloomFilter2 = new AdaptiveBloomFilter(numCandidates, fpp);

// Insert items into the first filter
for (int i = 0; i < 1000; i++) {
Expand All @@ -78,6 +92,6 @@ public void shouldMergeTwoFiltersCorrectly() {
for (int i = 0; i < 2000; i++) {
assertTrue(bloomFilter.mightContain(i));
}
assertEquals(2048, bloomFilter.bestCandidate().expectedNumItems);
assertEquals(2048, bloomFilter.bestCandidate().getExpectedNumItems());
}
}

0 comments on commit 3df4893

Please sign in to comment.