Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow multiple precision modes #6

Draft
wants to merge 20 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
0d1d95c
Initial try to support a variety of byte sizes for cuckoofilter.
panmari Nov 15, 2021
ac182fd
Use config for constructing filter.
panmari Nov 17, 2021
69c4f39
Simplified and sped up insert method.
panmari Jan 16, 2022
6a19792
Fixing typo in method name.
panmari Jan 23, 2022
286a8bc
Making insert benchmark less biased towards completely full behavior.
panmari Jan 23, 2022
7fac49b
Moving rand call to more optimized version.
panmari Jan 23, 2022
0c45ada
Restructuring benchmarks to narrow down on important code.
panmari Jan 23, 2022
e43f263
Passing fingerprint size instead of dynamic function.
panmari Jul 11, 2022
e099372
Making benchmarks and tests deterministic.
panmari Aug 10, 2022
b8d4b9c
Passing around fingerprint size in bits.
panmari Aug 10, 2022
91602ce
Fully implementing Encode/Decode for generic version.
panmari Aug 13, 2022
b7f9905
Fixing bug with insert
panmari Aug 13, 2022
a1dc6a8
Extending tests to cover all types of de- and encoding.
panmari Aug 13, 2022
e8be750
Small cleanup of Encode function.
panmari Aug 18, 2022
b66c920
Inlining bucket initialization.
panmari Sep 22, 2022
f768a1f
Putting commented variant of alternative code into getAltIndex.
panmari Sep 22, 2022
10258d9
Merge branch 'master' into generic_fp_type
panmari Sep 22, 2022
6cf2b26
Merge branch 'master' into generic_fp_type
panmari Sep 24, 2022
b9b2432
Bumping testing matrix to only include supported go versions.
panmari Sep 24, 2022
2e23dc5
Merge branch 'master' into generic_fp_type
panmari Oct 5, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
go: [ '1.15', '>=1.5 <2']
go: [ '1.18', '>=1.18 <2']
steps:
- uses: actions/checkout@v3

Expand Down
23 changes: 11 additions & 12 deletions bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,21 @@ import (
"fmt"
)

// fingerprint represents a single entry in a bucket.
type fingerprint uint16
// fingerprintsize is the type constraint for the integer type that stores a
// single fingerprint: an unsigned integer of 8, 16 or 32 bits.
type fingerprintsize interface {
uint8 | uint16 | uint32
}

// bucket keeps track of fingerprints hashing to the same index.
type bucket [bucketSize]fingerprint
type bucket[T fingerprintsize] [bucketSize]T

const (
nullFp = 0
bucketSize = 4
fingerprintSizeBits = 16
maxFingerprint = (1 << fingerprintSizeBits) - 1
nullFp = 0
bucketSize = 4
)

// insert a fingerprint into a bucket. Returns true if there was enough space and insertion succeeded.
// Note it allows inserting the same fingerprint multiple times.
func (b *bucket) insert(fp fingerprint) bool {
func (b *bucket[T]) insert(fp T) bool {
for i, tfp := range b {
if tfp == nullFp {
b[i] = fp
Expand All @@ -32,7 +31,7 @@ func (b *bucket) insert(fp fingerprint) bool {

// delete a fingerprint from a bucket.
// Returns true if the fingerprint was present and successfully removed.
func (b *bucket) delete(fp fingerprint) bool {
func (b *bucket[T]) delete(fp T) bool {
for i, tfp := range b {
if tfp == fp {
b[i] = nullFp
Expand All @@ -42,7 +41,7 @@ func (b *bucket) delete(fp fingerprint) bool {
return false
}

func (b *bucket) contains(needle fingerprint) bool {
func (b *bucket[T]) contains(needle T) bool {
for _, fp := range b {
if fp == needle {
return true
Expand All @@ -52,13 +51,13 @@ func (b *bucket) contains(needle fingerprint) bool {
}

// reset deletes all fingerprints in the bucket.
func (b *bucket) reset() {
func (b *bucket[T]) reset() {
for i := range b {
b[i] = nullFp
}
}

func (b *bucket) String() string {
func (b *bucket[T]) String() string {
var buf bytes.Buffer
buf.WriteString("[")
for _, by := range b {
Expand Down
6 changes: 3 additions & 3 deletions bucket_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ import (
)

func TestBucket_Reset(t *testing.T) {
var bkt bucket
for i := fingerprint(0); i < bucketSize; i++ {
var bkt bucket[uint16]
for i := uint16(0); i < bucketSize; i++ {
bkt[i] = i
}
bkt.reset()

var want bucket
var want bucket[uint16]
if !reflect.DeepEqual(bkt, want) {
t.Errorf("bucket.reset() got %v, want %v", bkt, want)
}
Expand Down
14 changes: 14 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package cuckoo

// FilterPrecision selects the fingerprint width of a filter created by NewFilter.
type FilterPrecision uint

// Precision levels understood by NewFilter.
const (
// Medium stores 16-bit fingerprints. It is the zero value of FilterPrecision,
// so a Config that leaves Precision unset uses it.
Medium FilterPrecision = iota
// Low stores 8-bit fingerprints.
Low
// High stores 32-bit fingerprints.
High
)

// Config holds the parameters NewFilter uses to construct a filter.
type Config struct {
// NumElements is the number of elements the filter is sized for.
NumElements uint
// Precision selects the fingerprint width; the zero value is Medium.
Precision FilterPrecision
}
174 changes: 119 additions & 55 deletions cuckoofilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,37 +12,88 @@ import (
const maxCuckooKickouts = 500

// Filter is a probabilistic counter.
type Filter struct {
buckets []bucket
count uint
// Filter is the set of operations offered by a cuckoo filter, independent of
// the fingerprint width chosen when it was created.
type Filter interface {
// Lookup returns true if data is in the filter. Only fingerprints are
// compared, so a true result may be a false positive.
Lookup(data []byte) bool
// Insert data into the filter. Returns false if insertion failed. In the resulting state, the filter
// * Might return false negatives
// * Deletes are not guaranteed to work
// To increase success rate of inserts, create a larger filter.
Insert(data []byte) bool
// Delete data from the filter. Returns true if the data was found and deleted.
Delete(data []byte) bool
// Count returns the number of items in the filter.
Count() uint

// LoadFactor returns the fraction of slots that are occupied.
LoadFactor() float64
// Reset removes all items from the filter, setting count to 0.
Reset()
// Encode returns a byte slice representing a Cuckoofilter. The first byte
// holds the fingerprint size in bits; the fingerprints of every bucket
// follow in little-endian byte order.
Encode() []byte
}

type filter[T fingerprintsize] struct {
buckets []bucket[T]
fingerprintSizeBits int
count uint
// Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2,
// applying this mask mimics the operation x % len(buckets).
bucketIndexMask uint
bucketIndexMask uint
maxFingerprintMinusOne uint64
}

// NewFilter returns a new cuckoofilter suitable for the given number of elements.
// When inserting more elements, insertion speed will drop significantly and insertions might fail altogether.
// A capacity of 1000000 is a normal default, which allocates
// about ~2MB on 64-bit machines.
func NewFilter(numElements uint) *Filter {
func numBuckets(numElements uint) uint {
numBuckets := getNextPow2(uint64(numElements / bucketSize))
if float64(numElements)/float64(numBuckets*bucketSize) > 0.96 {
numBuckets <<= 1
}
if numBuckets == 0 {
numBuckets = 1
}
buckets := make([]bucket, numBuckets)
return &Filter{
buckets: buckets,
count: 0,
bucketIndexMask: uint(len(buckets) - 1),
return numBuckets
}

// maxFingerprintMinusOne returns 2^fingerprintSizeBits - 2, which is one less
// than the largest value that fits in fingerprintSizeBits bits.
func maxFingerprintMinusOne(fingerprintSizeBits int) uint64 {
	// The shift is carried out in uint64 so that a 32-bit width is computed
	// without depending on the platform's int size.
	limit := uint64(1) << fingerprintSizeBits
	return limit - 2
}

// NewFilter builds an empty cuckoofilter sized for cfg.NumElements elements.
// cfg.Precision picks the fingerprint width: Low uses 8 bits, High uses 32 bits
// and every other value (including the zero value, Medium) uses 16 bits.
// When inserting more elements than the filter was sized for, insertion speed
// will drop significantly and insertions might fail altogether.
// A capacity of 1000000 is a normal default, which at Medium precision
// allocates about ~2MB.
func NewFilter(cfg Config) Filter {
	n := numBuckets(cfg.NumElements)
	// The bucket count is a power of two, so n-1 serves as the index mask.
	mask := uint(n - 1)

	switch cfg.Precision {
	case High:
		const bits = 32
		return &filter[uint32]{
			buckets:                make([]bucket[uint32], n),
			fingerprintSizeBits:    bits,
			bucketIndexMask:        mask,
			maxFingerprintMinusOne: maxFingerprintMinusOne(bits),
		}
	case Low:
		const bits = 8
		return &filter[uint8]{
			buckets:                make([]bucket[uint8], n),
			fingerprintSizeBits:    bits,
			bucketIndexMask:        mask,
			maxFingerprintMinusOne: maxFingerprintMinusOne(bits),
		}
	}

	// Medium, and any unrecognized precision, falls back to 16-bit fingerprints.
	const bits = 16
	return &filter[uint16]{
		buckets:                make([]bucket[uint16], n),
		fingerprintSizeBits:    bits,
		bucketIndexMask:        mask,
		maxFingerprintMinusOne: maxFingerprintMinusOne(bits),
	}
}

// Lookup returns true if data is in the filter.
func (cf *Filter) Lookup(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
func (cf *filter[T]) Lookup(data []byte) bool {
i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits)
if b := cf.buckets[i1]; b.contains(fp) {
return true
}
Expand All @@ -51,85 +102,76 @@ func (cf *Filter) Lookup(data []byte) bool {
return b.contains(fp)
}

// Reset removes all items from the filter, setting count to 0.
func (cf *Filter) Reset() {
func (cf *filter[T]) Reset() {
for i := range cf.buckets {
cf.buckets[i].reset()
}
cf.count = 0
}

// Insert data into the filter. Returns false if insertion failed. In the resulting state, the filter
// * Might return false negatives
// * Deletes are not guaranteed to work
// To increase success rate of inserts, create a larger filter.
func (cf *Filter) Insert(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
if cf.insert(fp, i1) {
func (cf *filter[T]) Insert(data []byte) bool {
i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits)
if cf.insertIntoBucket(fp, i1) {
return true
}
i2 := getAltIndex(fp, i1, cf.bucketIndexMask)
if cf.insert(fp, i2) {
if cf.insertIntoBucket(fp, i2) {
return true
}
return cf.reinsert(fp, randi(i1, i2))
return cf.cuckooInsert(fp, i1)
}

func (cf *Filter) insert(fp fingerprint, i uint) bool {
func (cf *filter[T]) insertIntoBucket(fp T, i uint) bool {
if cf.buckets[i].insert(fp) {
cf.count++
return true
}
return false
}

func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
func (cf *filter[T]) cuckooInsert(fp T, i uint) bool {
// Apply cuckoo kickouts until a free space is found.
for k := 0; k < maxCuckooKickouts; k++ {
j := rand.Intn(bucketSize)
// Swap fingerprint with bucket entry.
cf.buckets[i][j], fp = fp, cf.buckets[i][j]

// Move kicked out fingerprint to alternate location.
i = getAltIndex(fp, i, cf.bucketIndexMask)
if cf.insert(fp, i) {
if cf.insertIntoBucket(fp, i) {
return true
}
}
return false
}

// Delete data from the filter. Returns true if the data was found and deleted.
func (cf *Filter) Delete(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
func (cf *filter[T]) Delete(data []byte) bool {
i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits)
i2 := getAltIndex(fp, i1, cf.bucketIndexMask)
return cf.delete(fp, i1) || cf.delete(fp, i2)
}

func (cf *Filter) delete(fp fingerprint, i uint) bool {
func (cf *filter[T]) delete(fp T, i uint) bool {
if cf.buckets[i].delete(fp) {
cf.count--
return true
}
return false
}

// Count returns the number of items in the filter.
func (cf *Filter) Count() uint {
func (cf *filter[T]) Count() uint {
return cf.count
}

// LoadFactor returns the fraction slots that are occupied.
func (cf *Filter) LoadFactor() float64 {
func (cf *filter[T]) LoadFactor() float64 {
return float64(cf.count) / float64(len(cf.buckets)*bucketSize)
}

const bytesPerBucket = bucketSize * fingerprintSizeBits / 8

// Encode returns a byte slice representing a Cuckoofilter.
func (cf *Filter) Encode() []byte {
func (cf *filter[T]) Encode() []byte {
res := new(bytes.Buffer)
res.Grow(len(cf.buckets) * bytesPerBucket)

bytesPerBucket := bucketSize * cf.fingerprintSizeBits / 8
res.Grow(len(cf.buckets)*bytesPerBucket + 1)
binary.Write(res, binary.LittleEndian, uint8(cf.fingerprintSizeBits))
for _, b := range cf.buckets {
for _, fp := range b {
binary.Write(res, binary.LittleEndian, fp)
Expand All @@ -139,21 +181,41 @@ func (cf *Filter) Encode() []byte {
}

// Decode returns a Cuckoofilter from a byte slice created using Encode.
func Decode(data []byte) (*Filter, error) {
func Decode(data []byte) (Filter, error) {
if len(data) == 0 {
return nil, fmt.Errorf("data can not be empty")
}
fingerprintSizeBits, data := int(data[0]), data[1:]
if len(data)%bucketSize != 0 {
return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(data))
return nil, fmt.Errorf("data must to be multiple of %d, got %d", bucketSize, len(data))
}
bytesPerBucket := bucketSize * fingerprintSizeBits / 8
if bytesPerBucket == 0 {
return nil, fmt.Errorf("bytesPerBucket can not be zero")
}
numBuckets := len(data) / bytesPerBucket
if numBuckets < 1 {
return nil, fmt.Errorf("bytes can not be smaller than %d, size in bytes is %d", bytesPerBucket, len(data))
return nil, fmt.Errorf("data can not be smaller than %d, size in bytes is %d", bytesPerBucket, len(data))
}
if getNextPow2(uint64(numBuckets)) != uint(numBuckets) {
return nil, fmt.Errorf("numBuckets must to be a power of 2, got %d", numBuckets)
}
switch fingerprintSizeBits {
case 8:
return decode[uint8](fingerprintSizeBits, numBuckets, data), nil
case 16:
return decode[uint16](fingerprintSizeBits, numBuckets, data), nil
case 32:
return decode[uint32](fingerprintSizeBits, numBuckets, data), nil
default:
return nil, fmt.Errorf("fingerprint size bits must be 8, 16 or 32, got %d", fingerprintSizeBits)
}
}

func decode[T fingerprintsize](fingerprintSizeBits, numBuckets int, data []byte) *filter[T] {
var count uint
buckets := make([]bucket, numBuckets)
buckets := make([]bucket[T], numBuckets)
reader := bytes.NewReader(data)

for i, b := range buckets {
for j := range b {
binary.Read(reader, binary.LittleEndian, &buckets[i][j])
Expand All @@ -162,9 +224,11 @@ func Decode(data []byte) (*Filter, error) {
}
}
}
return &Filter{
buckets: buckets,
count: count,
bucketIndexMask: uint(len(buckets) - 1),
}, nil
return &filter[T]{
buckets: buckets,
count: count,
bucketIndexMask: uint(len(buckets) - 1),
fingerprintSizeBits: fingerprintSizeBits,
maxFingerprintMinusOne: maxFingerprintMinusOne(fingerprintSizeBits),
}
}
Loading