From 0d1d95c53dc6dfff6c3db99f32b630f671c433bd Mon Sep 17 00:00:00 2001 From: panmari Date: Mon, 15 Nov 2021 11:17:55 +0100 Subject: [PATCH 01/17] Initial try to support a variety of byte sizes for cuckoofilter. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using generics, allowing to switch transparently between uint8 and uint16 for the fingerprint size. Performance neutral according to benchmarks ~/goworkspace/bin/benchstat ~/master.benchstats ~/generics.benchstats name old time/op new time/op delta Filter_Reset-4 10.0µs ± 0% 9.9µs ± 0% ~ (p=1.000 n=1+1) Filter_Insert-4 18.2µs ± 0% 18.0µs ± 0% ~ (p=1.000 n=1+1) Filter_Lookup-4 1.52µs ± 0% 1.51µs ± 0% ~ (p=1.000 n=1+1) --- bucket.go | 23 ++++++------ bucket_test.go | 6 +-- cuckoofilter.go | 77 ++++++++++++++++++++++++-------------- cuckoofilter_test.go | 6 ++- example_threadsafe_test.go | 2 +- go.mod | 6 ++- go.sum | 5 ++- util.go | 20 ++++++++-- util_test.go | 4 +- 9 files changed, 94 insertions(+), 55 deletions(-) diff --git a/bucket.go b/bucket.go index c0ab3af..ec3f1f9 100644 --- a/bucket.go +++ b/bucket.go @@ -5,22 +5,21 @@ import ( "fmt" ) -// fingerprint represents a single entry in a bucket. -type fingerprint uint16 +type fingerprintsize interface { + uint8 | uint16 | uint32 +} // bucket keeps track of fingerprints hashing to the same index. -type bucket [bucketSize]fingerprint +type bucket[T fingerprintsize] [bucketSize]T const ( - nullFp = 0 - bucketSize = 4 - fingerprintSizeBits = 16 - maxFingerprint = (1 << fingerprintSizeBits) - 1 + nullFp = 0 + bucketSize = 4 ) // insert a fingerprint into a bucket. Returns true if there was enough space and insertion succeeded. // Note it allows inserting the same fingerprint multiple times. 
-func (b *bucket) insert(fp fingerprint) bool { +func (b *bucket[T]) insert(fp T) bool { for i, tfp := range b { if tfp == nullFp { b[i] = fp @@ -32,7 +31,7 @@ func (b *bucket) insert(fp fingerprint) bool { // delete a fingerprint from a bucket. // Returns true if the fingerprint was present and successfully removed. -func (b *bucket) delete(fp fingerprint) bool { +func (b *bucket[T]) delete(fp T) bool { for i, tfp := range b { if tfp == fp { b[i] = nullFp @@ -42,7 +41,7 @@ func (b *bucket) delete(fp fingerprint) bool { return false } -func (b *bucket) contains(needle fingerprint) bool { +func (b *bucket[T]) contains(needle T) bool { for _, fp := range b { if fp == needle { return true @@ -52,13 +51,13 @@ func (b *bucket) contains(needle fingerprint) bool { } // reset deletes all fingerprints in the bucket. -func (b *bucket) reset() { +func (b *bucket[T]) reset() { for i := range b { b[i] = nullFp } } -func (b *bucket) String() string { +func (b *bucket[T]) String() string { var buf bytes.Buffer buf.WriteString("[") for _, by := range b { diff --git a/bucket_test.go b/bucket_test.go index ea57baa..b94edbb 100644 --- a/bucket_test.go +++ b/bucket_test.go @@ -6,13 +6,13 @@ import ( ) func TestBucket_Reset(t *testing.T) { - var bkt bucket - for i := fingerprint(0); i < bucketSize; i++ { + var bkt bucket[uint16] + for i := uint16(0); i < bucketSize; i++ { bkt[i] = i } bkt.reset() - var want bucket + var want bucket[uint16] if !reflect.DeepEqual(bkt, want) { t.Errorf("bucket.reset() got %v, want %v", bkt, want) } diff --git a/cuckoofilter.go b/cuckoofilter.go index 4798109..8f95a76 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -11,19 +11,16 @@ import ( const maxCuckooKickouts = 500 // Filter is a probabilistic counter. -type Filter struct { - buckets []bucket - count uint +type Filter[T fingerprintsize] struct { + buckets []bucket[T] + getFingerprint func(hash uint64) T + count uint // Bit mask set to len(buckets) - 1. 
As len(buckets) is always a power of 2, // applying this mask mimics the operation x % len(buckets). bucketIndexMask uint } -// NewFilter returns a new cuckoofilter suitable for the given number of elements. -// When inserting more elements, insertion speed will drop significantly and insertions might fail altogether. -// A capacity of 1000000 is a normal default, which allocates -// about ~2MB on 64-bit machines. -func NewFilter(numElements uint) *Filter { +func numBuckets(numElements uint) uint { numBuckets := getNextPow2(uint64(numElements / bucketSize)) if float64(numElements)/float64(numBuckets*bucketSize) > 0.96 { numBuckets <<= 1 @@ -31,17 +28,39 @@ func NewFilter(numElements uint) *Filter { if numBuckets == 0 { numBuckets = 1 } - buckets := make([]bucket, numBuckets) - return &Filter{ + return numBuckets +} + +// NewFilter returns a new cuckoofilter suitable for the given number of elements. +// When inserting more elements, insertion speed will drop significantly and insertions might fail altogether. +// A capacity of 1000000 is a normal default, which allocates +// about ~2MB on 64-bit machines. +func NewFilter(numElements uint) *Filter[uint16] { + buckets := make([]bucket[uint16], numBuckets(numElements)) + return &Filter[uint16]{ + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + getFingerprint: getFinterprintUint16, + } +} + +// NewFilterLowPrecision is the same as NewFilter, but returns a filter that uses +// half the memory but has lower precision. +func NewFilterLowPrecision(numElements uint) *Filter[uint8] { + buckets := make([]bucket[uint8], numBuckets(numElements)) + return &Filter[uint8]{ buckets: buckets, count: 0, bucketIndexMask: uint(len(buckets) - 1), + getFingerprint: getFinterprintUint8, } } + // Lookup returns true if data is in the filter. 
-func (cf *Filter) Lookup(data []byte) bool { - i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask) +func (cf *Filter[T]) Lookup(data []byte) bool { + i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) if b := cf.buckets[i1]; b.contains(fp) { return true } @@ -51,7 +70,7 @@ func (cf *Filter) Lookup(data []byte) bool { } // Reset removes all items from the filter, setting count to 0. -func (cf *Filter) Reset() { +func (cf *Filter[T]) Reset() { for i := range cf.buckets { cf.buckets[i].reset() } @@ -62,8 +81,8 @@ func (cf *Filter) Reset() { // * Might return false negatives // * Deletes are not guaranteed to work // To increase success rate of inserts, create a larger filter. -func (cf *Filter) Insert(data []byte) bool { - i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask) +func (cf *Filter[T]) Insert(data []byte) bool { + i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) if cf.insert(fp, i1) { return true } @@ -74,7 +93,7 @@ func (cf *Filter) Insert(data []byte) bool { return cf.reinsert(fp, randi(i1, i2)) } -func (cf *Filter) insert(fp fingerprint, i uint) bool { +func (cf *Filter[T]) insert(fp T, i uint) bool { if cf.buckets[i].insert(fp) { cf.count++ return true @@ -82,7 +101,7 @@ func (cf *Filter) insert(fp fingerprint, i uint) bool { return false } -func (cf *Filter) reinsert(fp fingerprint, i uint) bool { +func (cf *Filter[T]) reinsert(fp T, i uint) bool { for k := 0; k < maxCuckooKickouts; k++ { j := rand.Intn(bucketSize) // Swap fingerprint with bucket entry. @@ -98,13 +117,13 @@ func (cf *Filter) reinsert(fp fingerprint, i uint) bool { } // Delete data from the filter. Returns true if the data was found and deleted. 
-func (cf *Filter) Delete(data []byte) bool { - i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask) +func (cf *Filter[T]) Delete(data []byte) bool { + i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) i2 := getAltIndex(fp, i1, cf.bucketIndexMask) return cf.delete(fp, i1) || cf.delete(fp, i2) } -func (cf *Filter) delete(fp fingerprint, i uint) bool { +func (cf *Filter[T]) delete(fp T, i uint) bool { if cf.buckets[i].delete(fp) { cf.count-- return true @@ -113,19 +132,20 @@ func (cf *Filter) delete(fp fingerprint, i uint) bool { } // Count returns the number of items in the filter. -func (cf *Filter) Count() uint { +func (cf *Filter[T]) Count() uint { return cf.count } // LoadFactor returns the fraction slots that are occupied. -func (cf *Filter) LoadFactor() float64 { +func (cf *Filter[T]) LoadFactor() float64 { return float64(cf.count) / float64(len(cf.buckets)*bucketSize) } -const bytesPerBucket = bucketSize * fingerprintSizeBits / 8 +// TODO(panmari): Size of fingerprint needs to be derived from type. Currently hardcoded to 16 for uint16. +const bytesPerBucket = bucketSize * 16 / 8 // Encode returns a byte slice representing a Cuckoofilter. -func (cf *Filter) Encode() []byte { +func (cf *Filter[T]) Encode() []byte { bytes := make([]byte, 0, len(cf.buckets)*bytesPerBucket) for _, b := range cf.buckets { for _, f := range b { @@ -138,7 +158,7 @@ func (cf *Filter) Encode() []byte { } // Decode returns a Cuckoofilter from a byte slice created using Encode. 
-func Decode(bytes []byte) (*Filter, error) { +func Decode(bytes []byte) (*Filter[uint16], error) { if len(bytes)%bucketSize != 0 { return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(bytes)) } @@ -150,21 +170,22 @@ func Decode(bytes []byte) (*Filter, error) { return nil, fmt.Errorf("numBuckets must to be a power of 2, got %d", numBuckets) } var count uint - buckets := make([]bucket, numBuckets) + buckets := make([]bucket[uint16], numBuckets) for i, b := range buckets { for j := range b { var next []byte next, bytes = bytes[:2], bytes[2:] - if fp := fingerprint(binary.LittleEndian.Uint16(next)); fp != 0 { + if fp := binary.LittleEndian.Uint16(next); fp != 0 { buckets[i][j] = fp count++ } } } - return &Filter{ + return &Filter[uint16]{ buckets: buckets, count: count, bucketIndexMask: uint(len(buckets) - 1), + getFingerprint: getFinterprintUint16, }, nil } diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index 3bf35e4..fbfd330 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -7,10 +7,10 @@ import ( "io" "math" "os" - "reflect" "testing" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" ) // optFloatNear considers float64 as equal if the relative delta is small. @@ -231,7 +231,9 @@ func TestEncodeDecode(t *testing.T) { if err != nil { t.Errorf("Expected no error, got %v", err) } - if !reflect.DeepEqual(cf, got) { + if !cmp.Equal(cf, got, + cmp.AllowUnexported(Filter[uint16]{}), + cmpopts.IgnoreFields(Filter[uint16]{}, "getFingerprint")) { t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded) } } diff --git a/example_threadsafe_test.go b/example_threadsafe_test.go index 7b89f8d..9c38225 100644 --- a/example_threadsafe_test.go +++ b/example_threadsafe_test.go @@ -9,7 +9,7 @@ import ( // Small wrapper around cuckoo filter making it thread safe. 
type threadSafeFilter struct { - cf *cuckoo.Filter + cf *cuckoo.Filter[uint16] mu sync.RWMutex } diff --git a/go.mod b/go.mod index 7311842..7864410 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,10 @@ module github.com/panmari/cuckoofilter -go 1.15 +go 1.18 require ( - github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 + github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446 github.com/google/go-cmp v0.5.2 ) + +require golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 // indirect diff --git a/go.sum b/go.sum index 578d428..1fe5307 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,6 @@ -github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E= -github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= +github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446 h1:QnWGyQI3H080vbC9E4jlr6scOYEnALtvV/69oATYzOo= +github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/util.go b/util.go index 00f6309..8246042 100644 --- a/util.go +++ b/util.go @@ -15,23 +15,35 @@ func randi(i1, i2 uint) uint { return i2 } -func getAltIndex(fp fingerprint, i uint, bucketIndexMask uint) uint { +func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { b := make([]byte, 2) binary.LittleEndian.PutUint16(b, uint16(fp)) hash := uint(metro.Hash64(b, 1337)) return (i ^ hash) & bucketIndexMask } -func getFingerprint(hash uint64) fingerprint { +func getFinterprintUint16(hash uint64) uint16 { + 
const fingerprintSizeBits = 16 + const maxFingerprint = (1 << fingerprintSizeBits) - 1 // Use most significant bits for fingerprint. shifted := hash >> (64 - fingerprintSizeBits) // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. fp := shifted%(maxFingerprint-1) + 1 - return fingerprint(fp) + return uint16(fp) +} + +func getFinterprintUint8(hash uint64) uint8 { + const fingerprintSizeBits = 8 + const maxFingerprint = (1 << fingerprintSizeBits) - 1 + // Use most significant bits for fingerprint. + shifted := hash >> (64 - fingerprintSizeBits) + // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. + fp := shifted%(maxFingerprint-1) + 1 + return uint8(fp) } // getIndexAndFingerprint returns the primary bucket index and fingerprint to be used -func getIndexAndFingerprint(data []byte, bucketIndexMask uint) (uint, fingerprint) { +func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, getFingerprint func(uint64) T) (uint, T) { hash := metro.Hash64(data, 1337) f := getFingerprint(hash) // Use least significant bits for deriving index. diff --git a/util_test.go b/util_test.go index fbe0c5e..1ac4de3 100644 --- a/util_test.go +++ b/util_test.go @@ -7,7 +7,9 @@ import ( func TestIndexAndFP(t *testing.T) { data := []byte("seif") numBuckets := uint(1024) - i1, fp := getIndexAndFingerprint(data, numBuckets) + i1, fp := getIndexAndFingerprint(data, numBuckets, func(in uint64) uint16 { + return 2 + }) i2 := getAltIndex(fp, i1, numBuckets) i11 := getAltIndex(fp, i2, numBuckets) i22 := getAltIndex(fp, i1, numBuckets) From ac182fd3f9f3cf74d52fbd568d6dd6541629ab94 Mon Sep 17 00:00:00 2001 From: panmari Date: Wed, 17 Nov 2021 13:25:16 +0100 Subject: [PATCH 02/17] Use config for constructing filter. Instead of using multiple methods for instantiating a filter for a chosen precision. 
--- config.go | 14 +++++ cuckoofilter.go | 117 +++++++++++++++++++++---------------- cuckoofilter_fuzz_test.go | 2 +- cuckoofilter_test.go | 29 +++++---- example_test.go | 6 +- example_threadsafe_test.go | 4 +- util.go | 25 +++++--- 7 files changed, 119 insertions(+), 78 deletions(-) create mode 100644 config.go diff --git a/config.go b/config.go new file mode 100644 index 0000000..10822c0 --- /dev/null +++ b/config.go @@ -0,0 +1,14 @@ +package cuckoo + +type FilterPrecision uint + +const ( + Medium FilterPrecision = iota + Low + High +) + +type Config struct { + NumElements uint + Precision FilterPrecision +} diff --git a/cuckoofilter.go b/cuckoofilter.go index 8f95a76..36f8017 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -1,6 +1,7 @@ package cuckoo import ( + "bytes" "encoding/binary" "fmt" "math/rand" @@ -11,7 +12,28 @@ import ( const maxCuckooKickouts = 500 // Filter is a probabilistic counter. -type Filter[T fingerprintsize] struct { +type Filter interface { + // Lookup returns true if data is in the filter. + Lookup(data []byte) bool + // Insert data into the filter. Returns false if insertion failed. In the resulting state, the filter + // * Might return false negatives + // * Deletes are not guaranteed to work + // To increase success rate of inserts, create a larger filter. + Insert(data []byte) bool + // Delete data from the filter. Returns true if the data was found and deleted. + Delete(data []byte) bool + // Count returns the number of items in the filter. + Count() uint + + // LoadFactor returns the fraction slots that are occupied. + LoadFactor() float64 + // Reset removes all items from the filter, setting count to 0. + Reset() + // Encode returns a byte slice representing a Cuckoofilter. 
+ Encode() []byte +} + +type filter[T fingerprintsize] struct { buckets []bucket[T] getFingerprint func(hash uint64) T count uint @@ -35,31 +57,37 @@ func numBuckets(numElements uint) uint { // When inserting more elements, insertion speed will drop significantly and insertions might fail altogether. // A capacity of 1000000 is a normal default, which allocates // about ~2MB on 64-bit machines. -func NewFilter(numElements uint) *Filter[uint16] { - buckets := make([]bucket[uint16], numBuckets(numElements)) - return &Filter[uint16]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFinterprintUint16, - } -} - -// NewFilterLowPrecision is the same as NewFilter, but returns a filter that uses -// half the memory but has lower precision. -func NewFilterLowPrecision(numElements uint) *Filter[uint8] { - buckets := make([]bucket[uint8], numBuckets(numElements)) - return &Filter[uint8]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFinterprintUint8, +func NewFilter(cfg Config) Filter { + numBuckets := numBuckets(cfg.NumElements) + switch cfg.Precision { + case Low: + buckets := make([]bucket[uint8], numBuckets) + return &filter[uint8]{ + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + getFingerprint: getFinterprintUint8, + } + case High: + buckets := make([]bucket[uint32], numBuckets) + return &filter[uint32]{ + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + getFingerprint: getFinterprintUint32, + } + default: + buckets := make([]bucket[uint16], numBuckets) + return &filter[uint16]{ + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + getFingerprint: getFinterprintUint16, + } } } - -// Lookup returns true if data is in the filter. 
-func (cf *Filter[T]) Lookup(data []byte) bool { +func (cf *filter[T]) Lookup(data []byte) bool { i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) if b := cf.buckets[i1]; b.contains(fp) { return true @@ -69,19 +97,14 @@ func (cf *Filter[T]) Lookup(data []byte) bool { return b.contains(fp) } -// Reset removes all items from the filter, setting count to 0. -func (cf *Filter[T]) Reset() { +func (cf *filter[T]) Reset() { for i := range cf.buckets { cf.buckets[i].reset() } cf.count = 0 } -// Insert data into the filter. Returns false if insertion failed. In the resulting state, the filter -// * Might return false negatives -// * Deletes are not guaranteed to work -// To increase success rate of inserts, create a larger filter. -func (cf *Filter[T]) Insert(data []byte) bool { +func (cf *filter[T]) Insert(data []byte) bool { i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) if cf.insert(fp, i1) { return true @@ -93,7 +116,7 @@ func (cf *Filter[T]) Insert(data []byte) bool { return cf.reinsert(fp, randi(i1, i2)) } -func (cf *Filter[T]) insert(fp T, i uint) bool { +func (cf *filter[T]) insert(fp T, i uint) bool { if cf.buckets[i].insert(fp) { cf.count++ return true @@ -101,7 +124,7 @@ func (cf *Filter[T]) insert(fp T, i uint) bool { return false } -func (cf *Filter[T]) reinsert(fp T, i uint) bool { +func (cf *filter[T]) reinsert(fp T, i uint) bool { for k := 0; k < maxCuckooKickouts; k++ { j := rand.Intn(bucketSize) // Swap fingerprint with bucket entry. @@ -116,14 +139,13 @@ func (cf *Filter[T]) reinsert(fp T, i uint) bool { return false } -// Delete data from the filter. Returns true if the data was found and deleted. 
-func (cf *Filter[T]) Delete(data []byte) bool { +func (cf *filter[T]) Delete(data []byte) bool { i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) i2 := getAltIndex(fp, i1, cf.bucketIndexMask) return cf.delete(fp, i1) || cf.delete(fp, i2) } -func (cf *Filter[T]) delete(fp T, i uint) bool { +func (cf *filter[T]) delete(fp T, i uint) bool { if cf.buckets[i].delete(fp) { cf.count-- return true @@ -131,34 +153,31 @@ func (cf *Filter[T]) delete(fp T, i uint) bool { return false } -// Count returns the number of items in the filter. -func (cf *Filter[T]) Count() uint { +func (cf *filter[T]) Count() uint { return cf.count } -// LoadFactor returns the fraction slots that are occupied. -func (cf *Filter[T]) LoadFactor() float64 { +func (cf *filter[T]) LoadFactor() float64 { return float64(cf.count) / float64(len(cf.buckets)*bucketSize) } // TODO(panmari): Size of fingerprint needs to be derived from type. Currently hardcoded to 16 for uint16. const bytesPerBucket = bucketSize * 16 / 8 -// Encode returns a byte slice representing a Cuckoofilter. -func (cf *Filter[T]) Encode() []byte { - bytes := make([]byte, 0, len(cf.buckets)*bytesPerBucket) +func (cf *filter[T]) Encode() []byte { + res := bytes.NewBuffer(nil) + res.Grow(len(cf.buckets) * bytesPerBucket) for _, b := range cf.buckets { - for _, f := range b { - next := make([]byte, 2) - binary.LittleEndian.PutUint16(next, uint16(f)) - bytes = append(bytes, next...) + for _, fp := range b { + binary.Write(res, binary.LittleEndian, fp) } } - return bytes + return res.Bytes() } // Decode returns a Cuckoofilter from a byte slice created using Encode. -func Decode(bytes []byte) (*Filter[uint16], error) { +// TODO(panmari): This only works for uint16 at this point. 
+func Decode(bytes []byte) (Filter, error) { if len(bytes)%bucketSize != 0 { return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(bytes)) } @@ -182,7 +201,7 @@ func Decode(bytes []byte) (*Filter[uint16], error) { } } } - return &Filter[uint16]{ + return &filter[uint16]{ buckets: buckets, count: count, bucketIndexMask: uint(len(buckets) - 1), diff --git a/cuckoofilter_fuzz_test.go b/cuckoofilter_fuzz_test.go index 6bea2f2..18edb9c 100644 --- a/cuckoofilter_fuzz_test.go +++ b/cuckoofilter_fuzz_test.go @@ -8,7 +8,7 @@ import ( ) func FuzzDecode(f *testing.F) { - cf := NewFilter(10) + cf := NewFilter(Config{NumElements: 10}) cf.Insert([]byte{1}) cf.Insert([]byte{2}) cf.Insert([]byte{3}) diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index fbfd330..ea7a60c 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -21,7 +21,7 @@ var optFloatNear = cmp.Comparer(func(x, y float64) bool { }) func TestInsertion(t *testing.T) { - cf := NewFilter(1000000) + cf := NewFilter(Config{NumElements: 1000000}) fd, err := os.Open("/usr/share/dict/words") if err != nil { t.Skipf("failed reading words: %v", err) @@ -58,7 +58,7 @@ func TestInsertion(t *testing.T) { } func TestLookup(t *testing.T) { - cf := NewFilter(4) + cf := NewFilter(Config{NumElements: 4}) cf.Insert([]byte("one")) cf.Insert([]byte("two")) cf.Insert([]byte("three")) @@ -86,7 +86,7 @@ func TestLookup(t *testing.T) { func TestFilter_LookupLarge(t *testing.T) { const size = 10000 insertFail := 0 - cf := NewFilter(size) + cf := NewFilter(Config{NumElements: size}) for i := 0; i < size; i++ { if !cf.Insert([]byte{byte(i)}) { insertFail++ @@ -105,8 +105,7 @@ func TestFilter_LookupLarge(t *testing.T) { } func TestFilter_Insert(t *testing.T) { - const cap = 10000 - filter := NewFilter(cap) + filter := NewFilter(Config{NumElements: 10000}) var hash [32]byte @@ -121,8 +120,7 @@ func TestFilter_Insert(t *testing.T) { } func BenchmarkFilter_Reset(b *testing.B) { - const cap = 10000 - 
filter := NewFilter(cap) + filter := NewFilter(Config{NumElements: 10000}) b.ResetTimer() @@ -132,8 +130,7 @@ func BenchmarkFilter_Reset(b *testing.B) { } func BenchmarkFilter_Insert(b *testing.B) { - const cap = 10000 - filter := NewFilter(cap) + filter := NewFilter(Config{NumElements: 10000}) b.ResetTimer() @@ -145,8 +142,7 @@ func BenchmarkFilter_Insert(b *testing.B) { } func BenchmarkFilter_Lookup(b *testing.B) { - const cap = 10000 - filter := NewFilter(cap) + filter := NewFilter(Config{NumElements: 10000}) var hash [32]byte for i := 0; i < 10000; i++ { @@ -162,7 +158,7 @@ func BenchmarkFilter_Lookup(b *testing.B) { } func TestDelete(t *testing.T) { - cf := NewFilter(8) + cf := NewFilter(Config{NumElements: 8}) cf.Insert([]byte("one")) cf.Insert([]byte("two")) cf.Insert([]byte("three")) @@ -187,7 +183,7 @@ func TestDelete(t *testing.T) { } func TestDeleteMultipleSame(t *testing.T) { - cf := NewFilter(4) + cf := NewFilter(Config{NumElements: 10}) for i := 0; i < 5; i++ { cf.Insert([]byte("some_item")) } @@ -206,6 +202,7 @@ func TestDeleteMultipleSame(t *testing.T) { {"some_item", true, 0}, {"some_item", false, 0}, } + t.Logf("Filter state full: %v", cf) for _, tc := range testCases { t.Run(fmt.Sprintf("cf.Delete(%q)", tc.word), func(t *testing.T) { if got, gotCount := cf.Delete([]byte(tc.word)), cf.Count(); got != tc.want || gotCount != tc.wantCount { @@ -216,7 +213,7 @@ func TestDeleteMultipleSame(t *testing.T) { } func TestEncodeDecode(t *testing.T) { - cf := NewFilter(10) + cf := NewFilter(Config{NumElements: 10}) cf.Insert([]byte{1}) cf.Insert([]byte{2}) cf.Insert([]byte{3}) @@ -232,8 +229,8 @@ func TestEncodeDecode(t *testing.T) { t.Errorf("Expected no error, got %v", err) } if !cmp.Equal(cf, got, - cmp.AllowUnexported(Filter[uint16]{}), - cmpopts.IgnoreFields(Filter[uint16]{}, "getFingerprint")) { + cmp.AllowUnexported(filter[uint16]{}), + cmpopts.IgnoreFields(filter[uint16]{}, "getFingerprint")) { t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, 
encoded) } } diff --git a/example_test.go b/example_test.go index 3ac4138..22906a4 100644 --- a/example_test.go +++ b/example_test.go @@ -7,7 +7,7 @@ import ( ) func Example() { - cf := cuckoo.NewFilter(1000) + cf := cuckoo.NewFilter(cuckoo.Config{NumElements: 1000}) cf.Insert([]byte("pizza")) cf.Insert([]byte("tacos")) @@ -25,7 +25,7 @@ func Example() { } func ExampleFilter_Lookup() { - cf := cuckoo.NewFilter(1000) + cf := cuckoo.NewFilter(cuckoo.Config{NumElements: 1000}) cf.Insert([]byte("pizza")) cf.Insert([]byte("tacos")) @@ -38,7 +38,7 @@ func ExampleFilter_Lookup() { } func ExampleFilter_Delete() { - cf := cuckoo.NewFilter(1000) + cf := cuckoo.NewFilter(cuckoo.Config{NumElements: 1000}) cf.Insert([]byte("pizza")) cf.Insert([]byte("tacos")) diff --git a/example_threadsafe_test.go b/example_threadsafe_test.go index 9c38225..6d08171 100644 --- a/example_threadsafe_test.go +++ b/example_threadsafe_test.go @@ -9,7 +9,7 @@ import ( // Small wrapper around cuckoo filter making it thread safe. type threadSafeFilter struct { - cf *cuckoo.Filter[uint16] + cf cuckoo.Filter mu sync.RWMutex } @@ -29,7 +29,7 @@ func (f *threadSafeFilter) lookup(item []byte) bool { func Example_threadSafe() { cf := &threadSafeFilter{ - cf: cuckoo.NewFilter(1000), + cf: cuckoo.NewFilter(cuckoo.Config{NumElements: 1000}), } var wg sync.WaitGroup diff --git a/util.go b/util.go index 8246042..2fa7067 100644 --- a/util.go +++ b/util.go @@ -1,7 +1,6 @@ package cuckoo import ( - "encoding/binary" "math/rand" metro "github.com/dgryski/go-metro" @@ -16,12 +15,24 @@ func randi(i1, i2 uint) uint { } func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { - b := make([]byte, 2) - binary.LittleEndian.PutUint16(b, uint16(fp)) - hash := uint(metro.Hash64(b, 1337)) + // NOTE(panmari): hash was originally computed as uint(metro.Hash64(fp, 1337)). + // Multiplying with a constant has a similar effect and is cheaper. 
+ // 0x5bd1e995 is the hash constant from MurmurHash2 + const murmurConstant = 0x5bd1e995 + hash := uint(fp) * murmurConstant return (i ^ hash) & bucketIndexMask } +func getFinterprintUint8(hash uint64) uint8 { + const fingerprintSizeBits = 8 + const maxFingerprint = (1 << fingerprintSizeBits) - 1 + // Use most significant bits for fingerprint. + shifted := hash >> (64 - fingerprintSizeBits) + // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. + fp := shifted%(maxFingerprint-1) + 1 + return uint8(fp) +} + func getFinterprintUint16(hash uint64) uint16 { const fingerprintSizeBits = 16 const maxFingerprint = (1 << fingerprintSizeBits) - 1 @@ -32,14 +43,14 @@ func getFinterprintUint16(hash uint64) uint16 { return uint16(fp) } -func getFinterprintUint8(hash uint64) uint8 { - const fingerprintSizeBits = 8 +func getFinterprintUint32(hash uint64) uint32 { + const fingerprintSizeBits = 32 const maxFingerprint = (1 << fingerprintSizeBits) - 1 // Use most significant bits for fingerprint. shifted := hash >> (64 - fingerprintSizeBits) // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. fp := shifted%(maxFingerprint-1) + 1 - return uint8(fp) + return uint32(fp) } // getIndexAndFingerprint returns the primary bucket index and fingerprint to be used From 69c4f398ef3223992f7be90cf66b31b58f646ef7 Mon Sep 17 00:00:00 2001 From: panmari Date: Sun, 16 Jan 2022 21:20:57 +0100 Subject: [PATCH 03/17] Simplified and sped up insert method. There is no asymptotic advantage to trying both idx & alt idx for an item to insert. Rather just let cuckoo kickout do its thing. 
--- cuckoofilter.go | 31 +++++++++++++------------------ util.go | 10 ---------- 2 files changed, 13 insertions(+), 28 deletions(-) diff --git a/cuckoofilter.go b/cuckoofilter.go index 36f8017..ddeeb1f 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -105,26 +105,12 @@ func (cf *filter[T]) Reset() { } func (cf *filter[T]) Insert(data []byte) bool { - i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) - if cf.insert(fp, i1) { - return true - } - i2 := getAltIndex(fp, i1, cf.bucketIndexMask) - if cf.insert(fp, i2) { - return true - } - return cf.reinsert(fp, randi(i1, i2)) -} - -func (cf *filter[T]) insert(fp T, i uint) bool { - if cf.buckets[i].insert(fp) { - cf.count++ + i, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) + if cf.insertIntoBucket(fp, i) { return true } - return false -} -func (cf *filter[T]) reinsert(fp T, i uint) bool { + // Apply cuckoo kickouts until a free space is found. for k := 0; k < maxCuckooKickouts; k++ { j := rand.Intn(bucketSize) // Swap fingerprint with bucket entry. @@ -132,13 +118,22 @@ func (cf *filter[T]) reinsert(fp T, i uint) bool { // Move kicked out fingerprint to alternate location. i = getAltIndex(fp, i, cf.bucketIndexMask) - if cf.insert(fp, i) { + if cf.insertIntoBucket(fp, i) { return true } } return false } +func (cf *filter[T]) insertIntoBucket(fp T, i uint) bool { + if cf.buckets[i].insert(fp) { + cf.count++ + return true + } + return false +} + + func (cf *filter[T]) Delete(data []byte) bool { i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) i2 := getAltIndex(fp, i1, cf.bucketIndexMask) diff --git a/util.go b/util.go index 2fa7067..118f48d 100644 --- a/util.go +++ b/util.go @@ -1,19 +1,9 @@ package cuckoo import ( - "math/rand" - metro "github.com/dgryski/go-metro" ) -// randi returns either i1 or i2 randomly. 
-func randi(i1, i2 uint) uint { - if rand.Int31()%2 == 0 { - return i1 - } - return i2 -} - func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { // NOTE(panmari): hash was originally computed as uint(metro.Hash64(fp, 1337)). // Multiplying with a constant has a similar effect and is cheaper. From 6a19792caa0022d973dc8e7f461a9db8c5cbab64 Mon Sep 17 00:00:00 2001 From: panmari Date: Sun, 23 Jan 2022 18:39:42 +0100 Subject: [PATCH 04/17] Fixing typo in method name. --- cuckoofilter.go | 8 ++++---- util.go | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuckoofilter.go b/cuckoofilter.go index ddeeb1f..0cc0bd2 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -66,7 +66,7 @@ func NewFilter(cfg Config) Filter { buckets: buckets, count: 0, bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFinterprintUint8, + getFingerprint: getFingerprintUint8, } case High: buckets := make([]bucket[uint32], numBuckets) @@ -74,7 +74,7 @@ func NewFilter(cfg Config) Filter { buckets: buckets, count: 0, bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFinterprintUint32, + getFingerprint: getFingerprintUint32, } default: buckets := make([]bucket[uint16], numBuckets) @@ -82,7 +82,7 @@ func NewFilter(cfg Config) Filter { buckets: buckets, count: 0, bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFinterprintUint16, + getFingerprint: getFingerprintUint16, } } } @@ -200,6 +200,6 @@ func Decode(bytes []byte) (Filter, error) { buckets: buckets, count: count, bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFinterprintUint16, + getFingerprint: getFingerprintUint16, }, nil } diff --git a/util.go b/util.go index 118f48d..0bb3c7a 100644 --- a/util.go +++ b/util.go @@ -13,7 +13,7 @@ func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { return (i ^ hash) & bucketIndexMask } -func getFinterprintUint8(hash uint64) uint8 { +func getFingerprintUint8(hash uint64) uint8 { const 
fingerprintSizeBits = 8 const maxFingerprint = (1 << fingerprintSizeBits) - 1 // Use most significant bits for fingerprint. @@ -23,7 +23,7 @@ func getFinterprintUint8(hash uint64) uint8 { return uint8(fp) } -func getFinterprintUint16(hash uint64) uint16 { +func getFingerprintUint16(hash uint64) uint16 { const fingerprintSizeBits = 16 const maxFingerprint = (1 << fingerprintSizeBits) - 1 // Use most significant bits for fingerprint. @@ -33,7 +33,7 @@ func getFinterprintUint16(hash uint64) uint16 { return uint16(fp) } -func getFinterprintUint32(hash uint64) uint32 { +func getFingerprintUint32(hash uint64) uint32 { const fingerprintSizeBits = 32 const maxFingerprint = (1 << fingerprintSizeBits) - 1 // Use most significant bits for fingerprint. From 286a8bce50216ad1cb290612c6a21e88147bf288 Mon Sep 17 00:00:00 2001 From: panmari Date: Sun, 23 Jan 2022 20:01:55 +0100 Subject: [PATCH 05/17] Making insert benchmark less biased towards completely full behavior. --- cuckoofilter_test.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index ea7a60c..ab8d28c 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -130,14 +130,18 @@ func BenchmarkFilter_Reset(b *testing.B) { } func BenchmarkFilter_Insert(b *testing.B) { - filter := NewFilter(Config{NumElements: 10000}) + const size = 10000 + filter := NewFilter(Config{NumElements: size}) b.ResetTimer() var hash [32]byte - for i := 0; i < b.N; i++ { - io.ReadFull(rand.Reader, hash[:]) - filter.Insert(hash[:]) + for i := 0; i < b.N; { + for j := 0; j < size / 10; j++ { + io.ReadFull(rand.Reader, hash[:]) + filter.Insert(hash[:]) + i++ + } } } From 7fac49b3a767464ecb5c5e3b34a138f1524f07a1 Mon Sep 17 00:00:00 2001 From: panmari Date: Sun, 23 Jan 2022 20:03:05 +0100 Subject: [PATCH 06/17] Moving rand call to more optimized version. Avoids some unnecessary ifs in the implementation. 
--- bucket.go | 1 + cuckoofilter.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bucket.go b/bucket.go index ec3f1f9..d4db962 100644 --- a/bucket.go +++ b/bucket.go @@ -15,6 +15,7 @@ type bucket[T fingerprintsize] [bucketSize]T const ( nullFp = 0 bucketSize = 4 + bucketSizeMask = bucketSize - 1 ) // insert a fingerprint into a bucket. Returns true if there was enough space and insertion succeeded. diff --git a/cuckoofilter.go b/cuckoofilter.go index 0cc0bd2..57907fa 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -112,7 +112,7 @@ func (cf *filter[T]) Insert(data []byte) bool { // Apply cuckoo kickouts until a free space is found. for k := 0; k < maxCuckooKickouts; k++ { - j := rand.Intn(bucketSize) + j := rand.Int63() & bucketSizeMask // Swap fingerprint with bucket entry. cf.buckets[i][j], fp = fp, cf.buckets[i][j] From 0c45ada95ddbeccf2084e12ed7676e9cf12dc722 Mon Sep 17 00:00:00 2001 From: panmari Date: Sun, 23 Jan 2022 20:53:54 +0100 Subject: [PATCH 07/17] Restructuring benchmarks to narrow down on important code. --- cuckoofilter_test.go | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index ab8d28c..f2df306 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -121,7 +121,6 @@ func TestFilter_Insert(t *testing.T) { func BenchmarkFilter_Reset(b *testing.B) { filter := NewFilter(Config{NumElements: 10000}) - b.ResetTimer() for i := 0; i < b.N; i++ { @@ -129,17 +128,30 @@ func BenchmarkFilter_Reset(b *testing.B) { } } +// benchmarKeys returns a slice of keys for benchmarking with length `size`. 
+func benchmarKeys(b *testing.B, size int ) [][]byte { + b.Helper() + keys := make([][]byte, size) + for i := range keys { + keys[i] = make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, keys[i]); err != nil { + b.Error(err) + } + } + return keys +} + func BenchmarkFilter_Insert(b *testing.B) { const size = 10000 - filter := NewFilter(Config{NumElements: size}) - + keys := benchmarKeys(b, int(float64(size)* 0.8)) b.ResetTimer() - var hash [32]byte for i := 0; i < b.N; { - for j := 0; j < size / 10; j++ { - io.ReadFull(rand.Reader, hash[:]) - filter.Insert(hash[:]) + b.StopTimer() + filter := NewFilter(Config{NumElements: size}) + b.StartTimer() + for _, k := range keys { + filter.Insert(k) i++ } } @@ -147,17 +159,14 @@ func BenchmarkFilter_Insert(b *testing.B) { func BenchmarkFilter_Lookup(b *testing.B) { filter := NewFilter(Config{NumElements: 10000}) - - var hash [32]byte - for i := 0; i < 10000; i++ { - io.ReadFull(rand.Reader, hash[:]) - filter.Insert(hash[:]) - } + keys := benchmarKeys(b, 10000) b.ResetTimer() - for i := 0; i < b.N; i++ { - io.ReadFull(rand.Reader, hash[:]) - filter.Lookup(hash[:]) + for i := 0; i < b.N; { + for _, k := range keys { + filter.Lookup(k) + i++ + } } } From e43f2630ce631ba0302bd3f89b9a78a493d78757 Mon Sep 17 00:00:00 2001 From: panmari Date: Mon, 11 Jul 2022 21:09:17 +0200 Subject: [PATCH 08/17] Passing fingerprint size instead of dynamic function. 
--- cuckoofilter.go | 45 ++++++++++++++++++++++---------------------- cuckoofilter_test.go | 10 ++++------ util.go | 31 +++++------------------------- util_test.go | 4 +--- 4 files changed, 32 insertions(+), 58 deletions(-) diff --git a/cuckoofilter.go b/cuckoofilter.go index 57907fa..bb1a3ac 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -34,9 +34,9 @@ type Filter interface { } type filter[T fingerprintsize] struct { - buckets []bucket[T] - getFingerprint func(hash uint64) T - count uint + buckets []bucket[T] + fingerprintSizeBits uint64 + count uint // Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2, // applying this mask mimics the operation x % len(buckets). bucketIndexMask uint @@ -63,32 +63,32 @@ func NewFilter(cfg Config) Filter { case Low: buckets := make([]bucket[uint8], numBuckets) return &filter[uint8]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFingerprintUint8, + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: 8, } case High: buckets := make([]bucket[uint32], numBuckets) return &filter[uint32]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFingerprintUint32, + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: 32, } default: buckets := make([]bucket[uint16], numBuckets) return &filter[uint16]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFingerprintUint16, + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: 16, } } } func (cf *filter[T]) Lookup(data []byte) bool { - i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) + i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.fingerprintSizeBits) if b := cf.buckets[i1]; b.contains(fp) { return true } @@ -105,7 +105,7 @@ func (cf *filter[T]) Reset() { 
} func (cf *filter[T]) Insert(data []byte) bool { - i, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) + i, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.fingerprintSizeBits) if cf.insertIntoBucket(fp, i) { return true } @@ -133,9 +133,8 @@ func (cf *filter[T]) insertIntoBucket(fp T, i uint) bool { return false } - func (cf *filter[T]) Delete(data []byte) bool { - i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint) + i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.fingerprintSizeBits) i2 := getAltIndex(fp, i1, cf.bucketIndexMask) return cf.delete(fp, i1) || cf.delete(fp, i2) } @@ -197,9 +196,9 @@ func Decode(bytes []byte) (Filter, error) { } } return &filter[uint16]{ - buckets: buckets, - count: count, - bucketIndexMask: uint(len(buckets) - 1), - getFingerprint: getFingerprintUint16, + buckets: buckets, + count: count, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: 16, }, nil } diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index f2df306..eabebcd 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -10,7 +10,6 @@ import ( "testing" "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" ) // optFloatNear considers float64 as equal if the relative delta is small. @@ -129,7 +128,7 @@ func BenchmarkFilter_Reset(b *testing.B) { } // benchmarKeys returns a slice of keys for benchmarking with length `size`. 
-func benchmarKeys(b *testing.B, size int ) [][]byte { +func benchmarKeys(b *testing.B, size int) [][]byte { b.Helper() keys := make([][]byte, size) for i := range keys { @@ -143,7 +142,7 @@ func benchmarKeys(b *testing.B, size int ) [][]byte { func BenchmarkFilter_Insert(b *testing.B) { const size = 10000 - keys := benchmarKeys(b, int(float64(size)* 0.8)) + keys := benchmarKeys(b, int(float64(size)*0.8)) b.ResetTimer() for i := 0; i < b.N; { @@ -164,7 +163,7 @@ func BenchmarkFilter_Lookup(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; { for _, k := range keys { - filter.Lookup(k) + filter.Lookup(k) i++ } } @@ -242,8 +241,7 @@ func TestEncodeDecode(t *testing.T) { t.Errorf("Expected no error, got %v", err) } if !cmp.Equal(cf, got, - cmp.AllowUnexported(filter[uint16]{}), - cmpopts.IgnoreFields(filter[uint16]{}, "getFingerprint")) { + cmp.AllowUnexported(filter[uint16]{})) { t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded) } } diff --git a/util.go b/util.go index 0bb3c7a..7a99db6 100644 --- a/util.go +++ b/util.go @@ -13,40 +13,19 @@ func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { return (i ^ hash) & bucketIndexMask } -func getFingerprintUint8(hash uint64) uint8 { - const fingerprintSizeBits = 8 - const maxFingerprint = (1 << fingerprintSizeBits) - 1 +func getFingerprint[T fingerprintsize](hash uint64, fingerprintSizeBits uint64) T { + maxFingerprint := uint64((1 << fingerprintSizeBits) - 1) // Use most significant bits for fingerprint. shifted := hash >> (64 - fingerprintSizeBits) // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. fp := shifted%(maxFingerprint-1) + 1 - return uint8(fp) -} - -func getFingerprintUint16(hash uint64) uint16 { - const fingerprintSizeBits = 16 - const maxFingerprint = (1 << fingerprintSizeBits) - 1 - // Use most significant bits for fingerprint. 
- shifted := hash >> (64 - fingerprintSizeBits) - // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. - fp := shifted%(maxFingerprint-1) + 1 - return uint16(fp) -} - -func getFingerprintUint32(hash uint64) uint32 { - const fingerprintSizeBits = 32 - const maxFingerprint = (1 << fingerprintSizeBits) - 1 - // Use most significant bits for fingerprint. - shifted := hash >> (64 - fingerprintSizeBits) - // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. - fp := shifted%(maxFingerprint-1) + 1 - return uint32(fp) + return T(fp) } // getIndexAndFingerprint returns the primary bucket index and fingerprint to be used -func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, getFingerprint func(uint64) T) (uint, T) { +func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, fingerprintSize uint64) (uint, T) { hash := metro.Hash64(data, 1337) - f := getFingerprint(hash) + f := getFingerprint[T](hash, fingerprintSize) // Use least significant bits for deriving index. i1 := uint(hash) & bucketIndexMask return i1, f diff --git a/util_test.go b/util_test.go index 1ac4de3..4789bf6 100644 --- a/util_test.go +++ b/util_test.go @@ -7,9 +7,7 @@ import ( func TestIndexAndFP(t *testing.T) { data := []byte("seif") numBuckets := uint(1024) - i1, fp := getIndexAndFingerprint(data, numBuckets, func(in uint64) uint16 { - return 2 - }) + i1, fp := getIndexAndFingerprint[uint16](data, numBuckets, 16) i2 := getAltIndex(fp, i1, numBuckets) i11 := getAltIndex(fp, i2, numBuckets) i22 := getAltIndex(fp, i1, numBuckets) From e099372ef0fd5162a1f3a87d55f871e88f576c3c Mon Sep 17 00:00:00 2001 From: panmari Date: Wed, 10 Aug 2022 17:16:43 +0200 Subject: [PATCH 09/17] Making benchmarks and tests deterministic. By seeding random source. 
--- cuckoofilter_test.go | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index eabebcd..9f14352 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -2,10 +2,9 @@ package cuckoo import ( "bufio" - "crypto/rand" "fmt" - "io" "math" + "math/rand" "os" "testing" @@ -105,12 +104,13 @@ func TestFilter_LookupLarge(t *testing.T) { func TestFilter_Insert(t *testing.T) { filter := NewFilter(Config{NumElements: 10000}) + rng := rand.New(rand.NewSource(int64(42))) - var hash [32]byte + hash := make([]byte, 32) for i := 0; i < 100; i++ { - io.ReadFull(rand.Reader, hash[:]) - filter.Insert(hash[:]) + rng.Read(hash) + filter.Insert(hash) } if got, want := filter.Count(), uint(100); got != want { @@ -127,13 +127,14 @@ func BenchmarkFilter_Reset(b *testing.B) { } } -// benchmarKeys returns a slice of keys for benchmarking with length `size`. -func benchmarKeys(b *testing.B, size int) [][]byte { +// benchmarkKeys returns a slice of keys for benchmarking with length `size`. 
+func benchmarkKeys(b *testing.B, size int) [][]byte { b.Helper() keys := make([][]byte, size) + rng := rand.New(rand.NewSource(int64(size))) for i := range keys { keys[i] = make([]byte, 32) - if _, err := io.ReadFull(rand.Reader, keys[i]); err != nil { + if _, err := rng.Read(keys[i]); err != nil { b.Error(err) } } @@ -142,7 +143,7 @@ func benchmarKeys(b *testing.B, size int) [][]byte { func BenchmarkFilter_Insert(b *testing.B) { const size = 10000 - keys := benchmarKeys(b, int(float64(size)*0.8)) + keys := benchmarkKeys(b, int(float64(size)*0.8)) b.ResetTimer() for i := 0; i < b.N; { @@ -158,7 +159,7 @@ func BenchmarkFilter_Insert(b *testing.B) { func BenchmarkFilter_Lookup(b *testing.B) { filter := NewFilter(Config{NumElements: 10000}) - keys := benchmarKeys(b, 10000) + keys := benchmarkKeys(b, 10000) b.ResetTimer() for i := 0; i < b.N; { From b8d4b9c324c81a09ada2151ee644fbb35322058f Mon Sep 17 00:00:00 2001 From: panmari Date: Wed, 10 Aug 2022 18:41:27 +0200 Subject: [PATCH 10/17] Passing around fingerprint size in bits. --- bucket.go | 1 - cuckoofilter.go | 20 ++++++++++++-------- util.go | 20 ++++++++++---------- util_test.go | 9 +++++---- 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/bucket.go b/bucket.go index d4db962..ec3f1f9 100644 --- a/bucket.go +++ b/bucket.go @@ -15,7 +15,6 @@ type bucket[T fingerprintsize] [bucketSize]T const ( nullFp = 0 bucketSize = 4 - bucketSizeMask = bucketSize - 1 ) // insert a fingerprint into a bucket. Returns true if there was enough space and insertion succeeded. diff --git a/cuckoofilter.go b/cuckoofilter.go index bb1a3ac..09ad962 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -35,11 +35,12 @@ type Filter interface { type filter[T fingerprintsize] struct { buckets []bucket[T] - fingerprintSizeBits uint64 + fingerprintSizeBits int count uint // Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2, // applying this mask mimics the operation x % len(buckets). 
bucketIndexMask uint + maxFingerprint uint64 } func numBuckets(numElements uint) uint { @@ -67,6 +68,7 @@ func NewFilter(cfg Config) Filter { count: 0, bucketIndexMask: uint(len(buckets) - 1), fingerprintSizeBits: 8, + maxFingerprint: uint64((1 << 8) - 1), } case High: buckets := make([]bucket[uint32], numBuckets) @@ -75,6 +77,7 @@ func NewFilter(cfg Config) Filter { count: 0, bucketIndexMask: uint(len(buckets) - 1), fingerprintSizeBits: 32, + maxFingerprint: uint64((1 << 32) - 1), } default: buckets := make([]bucket[uint16], numBuckets) @@ -83,16 +86,17 @@ func NewFilter(cfg Config) Filter { count: 0, bucketIndexMask: uint(len(buckets) - 1), fingerprintSizeBits: 16, + maxFingerprint: uint64((1 << 16) - 1), } } } func (cf *filter[T]) Lookup(data []byte) bool { - i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.fingerprintSizeBits) + i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprint, cf.fingerprintSizeBits) if b := cf.buckets[i1]; b.contains(fp) { return true } - i2 := getAltIndex(fp, i1, cf.bucketIndexMask) + i2 := getAltIndex(fp, i1, cf.bucketIndexMask, cf.fingerprintSizeBits) b := cf.buckets[i2] return b.contains(fp) } @@ -105,19 +109,19 @@ func (cf *filter[T]) Reset() { } func (cf *filter[T]) Insert(data []byte) bool { - i, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.fingerprintSizeBits) + i, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprint, cf.fingerprintSizeBits) if cf.insertIntoBucket(fp, i) { return true } // Apply cuckoo kickouts until a free space is found. for k := 0; k < maxCuckooKickouts; k++ { - j := rand.Int63() & bucketSizeMask + j := rand.Intn(bucketSize) // Swap fingerprint with bucket entry. cf.buckets[i][j], fp = fp, cf.buckets[i][j] // Move kicked out fingerprint to alternate location. 
- i = getAltIndex(fp, i, cf.bucketIndexMask) + i = getAltIndex(fp, i, cf.bucketIndexMask, cf.fingerprintSizeBits) if cf.insertIntoBucket(fp, i) { return true } @@ -134,8 +138,8 @@ func (cf *filter[T]) insertIntoBucket(fp T, i uint) bool { } func (cf *filter[T]) Delete(data []byte) bool { - i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.fingerprintSizeBits) - i2 := getAltIndex(fp, i1, cf.bucketIndexMask) + i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprint, cf.fingerprintSizeBits) + i2 := getAltIndex(fp, i1, cf.bucketIndexMask, cf.fingerprintSizeBits) return cf.delete(fp, i1) || cf.delete(fp, i2) } diff --git a/util.go b/util.go index 7a99db6..150fd91 100644 --- a/util.go +++ b/util.go @@ -1,20 +1,20 @@ package cuckoo import ( + "encoding/binary" + metro "github.com/dgryski/go-metro" ) -func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { - // NOTE(panmari): hash was originally computed as uint(metro.Hash64(fp, 1337)). - // Multiplying with a constant has a similar effect and is cheaper. - // 0x5bd1e995 is the hash constant from MurmurHash2 - const murmurConstant = 0x5bd1e995 - hash := uint(fp) * murmurConstant +func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint, fingerprintSizeBits int) uint { + b := make([]byte, fingerprintSizeBits/8) + binary.LittleEndian.PutUint16(b, uint16(fp)) + hash := uint(metro.Hash64(b, 1337)) return (i ^ hash) & bucketIndexMask } -func getFingerprint[T fingerprintsize](hash uint64, fingerprintSizeBits uint64) T { - maxFingerprint := uint64((1 << fingerprintSizeBits) - 1) +func getFingerprint[T fingerprintsize](hash, maxFingerprint uint64, fingerprintSizeBits int) T { + // maxFingerprint := uint64((1 << fingerprintSizeBits) - 1) // Use most significant bits for fingerprint. shifted := hash >> (64 - fingerprintSizeBits) // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. 
@@ -23,9 +23,9 @@ func getFingerprint[T fingerprintsize](hash uint64, fingerprintSizeBits uint64) } // getIndexAndFingerprint returns the primary bucket index and fingerprint to be used -func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, fingerprintSize uint64) (uint, T) { +func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, maxFingerprint uint64, fingerprintSize int) (uint, T) { hash := metro.Hash64(data, 1337) - f := getFingerprint[T](hash, fingerprintSize) + f := getFingerprint[T](hash, maxFingerprint, fingerprintSize) // Use least significant bits for deriving index. i1 := uint(hash) & bucketIndexMask return i1, f diff --git a/util_test.go b/util_test.go index 4789bf6..3ea6536 100644 --- a/util_test.go +++ b/util_test.go @@ -5,12 +5,13 @@ import ( ) func TestIndexAndFP(t *testing.T) { + const fingerprintSizeBits = 16 data := []byte("seif") numBuckets := uint(1024) - i1, fp := getIndexAndFingerprint[uint16](data, numBuckets, 16) - i2 := getAltIndex(fp, i1, numBuckets) - i11 := getAltIndex(fp, i2, numBuckets) - i22 := getAltIndex(fp, i1, numBuckets) + i1, fp := getIndexAndFingerprint[uint16](data, numBuckets, uint64((1< Date: Sat, 13 Aug 2022 22:24:36 +0200 Subject: [PATCH 11/17] Fully implementing Encode/Decode for generic version. --- cuckoofilter.go | 103 +++++++++++++++++++++++--------------- cuckoofilter_fuzz_test.go | 11 ++-- util.go | 21 ++++---- util_test.go | 6 +-- 4 files changed, 84 insertions(+), 57 deletions(-) diff --git a/cuckoofilter.go b/cuckoofilter.go index 09ad962..fabaa93 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -39,8 +39,8 @@ type filter[T fingerprintsize] struct { count uint // Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2, // applying this mask mimics the operation x % len(buckets). 
- bucketIndexMask uint - maxFingerprint uint64 + bucketIndexMask uint + maxFingerprintMinusOne uint64 } func numBuckets(numElements uint) uint { @@ -54,6 +54,10 @@ func numBuckets(numElements uint) uint { return numBuckets } +func maxFingerprintMinusOne(fingerprintSizeBits int) uint64 { + return uint64((1 << fingerprintSizeBits) - 2) +} + // NewFilter returns a new cuckoofilter suitable for the given number of elements. // When inserting more elements, insertion speed will drop significantly and insertions might fail altogether. // A capacity of 1000000 is a normal default, which allocates @@ -64,39 +68,39 @@ func NewFilter(cfg Config) Filter { case Low: buckets := make([]bucket[uint8], numBuckets) return &filter[uint8]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - fingerprintSizeBits: 8, - maxFingerprint: uint64((1 << 8) - 1), + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: 8, + maxFingerprintMinusOne: maxFingerprintMinusOne(8), } case High: buckets := make([]bucket[uint32], numBuckets) return &filter[uint32]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - fingerprintSizeBits: 32, - maxFingerprint: uint64((1 << 32) - 1), + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: 32, + maxFingerprintMinusOne: maxFingerprintMinusOne(32), } default: buckets := make([]bucket[uint16], numBuckets) return &filter[uint16]{ - buckets: buckets, - count: 0, - bucketIndexMask: uint(len(buckets) - 1), - fingerprintSizeBits: 16, - maxFingerprint: uint64((1 << 16) - 1), + buckets: buckets, + count: 0, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: 16, + maxFingerprintMinusOne: maxFingerprintMinusOne(16), } } } func (cf *filter[T]) Lookup(data []byte) bool { - i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprint, cf.fingerprintSizeBits) + i1, fp := getIndexAndFingerprint[T](data, 
cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits) if b := cf.buckets[i1]; b.contains(fp) { return true } - i2 := getAltIndex(fp, i1, cf.bucketIndexMask, cf.fingerprintSizeBits) + i2 := getAltIndex(fp, i1, cf.bucketIndexMask) b := cf.buckets[i2] return b.contains(fp) } @@ -109,7 +113,7 @@ func (cf *filter[T]) Reset() { } func (cf *filter[T]) Insert(data []byte) bool { - i, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprint, cf.fingerprintSizeBits) + i, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits) if cf.insertIntoBucket(fp, i) { return true } @@ -121,7 +125,7 @@ func (cf *filter[T]) Insert(data []byte) bool { cf.buckets[i][j], fp = fp, cf.buckets[i][j] // Move kicked out fingerprint to alternate location. - i = getAltIndex(fp, i, cf.bucketIndexMask, cf.fingerprintSizeBits) + i = getAltIndex(fp, i, cf.bucketIndexMask) if cf.insertIntoBucket(fp, i) { return true } @@ -138,8 +142,8 @@ func (cf *filter[T]) insertIntoBucket(fp T, i uint) bool { } func (cf *filter[T]) Delete(data []byte) bool { - i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprint, cf.fingerprintSizeBits) - i2 := getAltIndex(fp, i1, cf.bucketIndexMask, cf.fingerprintSizeBits) + i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits) + i2 := getAltIndex(fp, i1, cf.bucketIndexMask) return cf.delete(fp, i1) || cf.delete(fp, i2) } @@ -159,12 +163,12 @@ func (cf *filter[T]) LoadFactor() float64 { return float64(cf.count) / float64(len(cf.buckets)*bucketSize) } -// TODO(panmari): Size of fingerprint needs to be derived from type. Currently hardcoded to 16 for uint16. -const bytesPerBucket = bucketSize * 16 / 8 - +// Encode returns a Cuckoofilter encoded as a byte slice. 
func (cf *filter[T]) Encode() []byte { res := bytes.NewBuffer(nil) - res.Grow(len(cf.buckets) * bytesPerBucket) + bytesPerBucket := bucketSize * cf.fingerprintSizeBits / 8 + res.Grow(len(cf.buckets)*bytesPerBucket + 4) + binary.Write(res, binary.LittleEndian, uint8(cf.fingerprintSizeBits)) for _, b := range cf.buckets { for _, fp := range b { binary.Write(res, binary.LittleEndian, fp) @@ -174,11 +178,18 @@ func (cf *filter[T]) Encode() []byte { } // Decode returns a Cuckoofilter from a byte slice created using Encode. -// TODO(panmari): This only works for uint16 at this point. func Decode(bytes []byte) (Filter, error) { + if len(bytes) == 0 { + return nil, fmt.Errorf("bytes can not be empty") + } + fingerprintSizeBits, bytes := int(bytes[0]), bytes[1:] if len(bytes)%bucketSize != 0 { return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(bytes)) } + bytesPerBucket := bucketSize * fingerprintSizeBits / 8 + if bytesPerBucket == 0 { + return nil, fmt.Errorf("bytesPerBucket can not be zero") + } numBuckets := len(bytes) / bytesPerBucket if numBuckets < 1 { return nil, fmt.Errorf("bytes can not be smaller than %d, size in bytes is %d", bytesPerBucket, len(bytes)) @@ -186,23 +197,35 @@ func Decode(bytes []byte) (Filter, error) { if getNextPow2(uint64(numBuckets)) != uint(numBuckets) { return nil, fmt.Errorf("numBuckets must to be a power of 2, got %d", numBuckets) } + switch fingerprintSizeBits { + case 8: + return decode[uint8](fingerprintSizeBits, numBuckets, bytes), nil + case 16: + return decode[uint16](fingerprintSizeBits, numBuckets, bytes), nil + case 32: + return decode[uint32](fingerprintSizeBits, numBuckets, bytes), nil + default: + return nil, fmt.Errorf("fingerprint size bits must be 8, 16 or 32, got %d", fingerprintSizeBits) + } +} + +func decode[T fingerprintsize](fingerprintSizeBits, numBuckets int, data []byte) *filter[T] { var count uint - buckets := make([]bucket[uint16], numBuckets) + buckets := make([]bucket[T], numBuckets) 
+ reader := bytes.NewReader(data) for i, b := range buckets { for j := range b { - var next []byte - next, bytes = bytes[:2], bytes[2:] - - if fp := binary.LittleEndian.Uint16(next); fp != 0 { - buckets[i][j] = fp + binary.Read(reader, binary.LittleEndian, &buckets[i][j]) + if buckets[i][j] != 0 { count++ } } } - return &filter[uint16]{ - buckets: buckets, - count: count, - bucketIndexMask: uint(len(buckets) - 1), - fingerprintSizeBits: 16, - }, nil + return &filter[T]{ + buckets: buckets, + count: count, + bucketIndexMask: uint(len(buckets) - 1), + fingerprintSizeBits: fingerprintSizeBits, + maxFingerprintMinusOne: maxFingerprintMinusOne(fingerprintSizeBits), + } } diff --git a/cuckoofilter_fuzz_test.go b/cuckoofilter_fuzz_test.go index 18edb9c..19b24ef 100644 --- a/cuckoofilter_fuzz_test.go +++ b/cuckoofilter_fuzz_test.go @@ -7,8 +7,7 @@ import ( "testing" ) -func FuzzDecode(f *testing.F) { - cf := NewFilter(Config{NumElements: 10}) +func filledFilter(cf Filter) Filter { cf.Insert([]byte{1}) cf.Insert([]byte{2}) cf.Insert([]byte{3}) @@ -18,7 +17,13 @@ func FuzzDecode(f *testing.F) { cf.Insert([]byte{7}) cf.Insert([]byte{8}) cf.Insert([]byte{9}) - f.Add(cf.Encode()) + return cf +} + +func FuzzDecode(f *testing.F) { + f.Add(filledFilter(NewFilter(Config{NumElements: 10})).Encode()) + f.Add(filledFilter(NewFilter(Config{NumElements: 10, Precision: Low})).Encode()) + f.Add(filledFilter(NewFilter(Config{NumElements: 10, Precision: High})).Encode()) f.Fuzz(func(t *testing.T, encoded []byte) { cache, err := Decode(encoded) if err != nil { diff --git a/util.go b/util.go index 150fd91..9fb28a4 100644 --- a/util.go +++ b/util.go @@ -1,31 +1,30 @@ package cuckoo import ( - "encoding/binary" - metro "github.com/dgryski/go-metro" ) -func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint, fingerprintSizeBits int) uint { - b := make([]byte, fingerprintSizeBits/8) - binary.LittleEndian.PutUint16(b, uint16(fp)) - hash := uint(metro.Hash64(b, 1337)) +func 
getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { + // NOTE(panmari): hash was originally computed as uint(metro.Hash64(fp, 1337)). + // Multiplying with a constant has a similar effect and is cheaper. + // 0x5bd1e995 is the hash constant from MurmurHash2 + const murmurConstant = 0x5bd1e995 + hash := uint(fp) * murmurConstant return (i ^ hash) & bucketIndexMask } -func getFingerprint[T fingerprintsize](hash, maxFingerprint uint64, fingerprintSizeBits int) T { - // maxFingerprint := uint64((1 << fingerprintSizeBits) - 1) +func getFingerprint[T fingerprintsize](hash, maxFingerprintMinusOne uint64, fingerprintSizeBits int) T { // Use most significant bits for fingerprint. shifted := hash >> (64 - fingerprintSizeBits) // Valid fingerprints are in range [1, maxFingerprint], leaving 0 as the special empty state. - fp := shifted%(maxFingerprint-1) + 1 + fp := shifted%(maxFingerprintMinusOne) + 1 return T(fp) } // getIndexAndFingerprint returns the primary bucket index and fingerprint to be used -func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, maxFingerprint uint64, fingerprintSize int) (uint, T) { +func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, maxFingerprintMinusOne uint64, fingerprintSize int) (uint, T) { hash := metro.Hash64(data, 1337) - f := getFingerprint[T](hash, maxFingerprint, fingerprintSize) + f := getFingerprint[T](hash, maxFingerprintMinusOne, fingerprintSize) // Use least significant bits for deriving index. 
i1 := uint(hash) & bucketIndexMask return i1, f diff --git a/util_test.go b/util_test.go index 3ea6536..b90b699 100644 --- a/util_test.go +++ b/util_test.go @@ -9,9 +9,9 @@ func TestIndexAndFP(t *testing.T) { data := []byte("seif") numBuckets := uint(1024) i1, fp := getIndexAndFingerprint[uint16](data, numBuckets, uint64((1< Date: Sat, 13 Aug 2022 22:38:18 +0200 Subject: [PATCH 12/17] Fixing bug with insert Previously, items only moved to the alternative index on cuckoo inserts. --- cuckoofilter.go | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/cuckoofilter.go b/cuckoofilter.go index fabaa93..b3ea2e8 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -113,11 +113,26 @@ func (cf *filter[T]) Reset() { } func (cf *filter[T]) Insert(data []byte) bool { - i, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits) - if cf.insertIntoBucket(fp, i) { + i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits) + if cf.insertIntoBucket(fp, i1) { + return true + } + i2 := getAltIndex(fp, i1, cf.bucketIndexMask) + if cf.insertIntoBucket(fp, i2) { return true } + return cf.cuckooInsert(fp, i1) +} +func (cf *filter[T]) insertIntoBucket(fp T, i uint) bool { + if cf.buckets[i].insert(fp) { + cf.count++ + return true + } + return false +} + +func (cf *filter[T]) cuckooInsert(fp T, i uint) bool { // Apply cuckoo kickouts until a free space is found. 
for k := 0; k < maxCuckooKickouts; k++ { j := rand.Intn(bucketSize) @@ -133,14 +148,6 @@ func (cf *filter[T]) Insert(data []byte) bool { return false } -func (cf *filter[T]) insertIntoBucket(fp T, i uint) bool { - if cf.buckets[i].insert(fp) { - cf.count++ - return true - } - return false -} - func (cf *filter[T]) Delete(data []byte) bool { i1, fp := getIndexAndFingerprint[T](data, cf.bucketIndexMask, cf.maxFingerprintMinusOne, cf.fingerprintSizeBits) i2 := getAltIndex(fp, i1, cf.bucketIndexMask) From a1dc6a816239319b449f82df92512baf0cae6864 Mon Sep 17 00:00:00 2001 From: panmari Date: Sat, 13 Aug 2022 23:10:53 +0200 Subject: [PATCH 13/17] Extending tests to cover all types of de- and encoding. --- cuckoofilter_test.go | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index 9f14352..f5916b5 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -226,23 +226,32 @@ func TestDeleteMultipleSame(t *testing.T) { } func TestEncodeDecode(t *testing.T) { - cf := NewFilter(Config{NumElements: 10}) - cf.Insert([]byte{1}) - cf.Insert([]byte{2}) - cf.Insert([]byte{3}) - cf.Insert([]byte{4}) - cf.Insert([]byte{5}) - cf.Insert([]byte{6}) - cf.Insert([]byte{7}) - cf.Insert([]byte{8}) - cf.Insert([]byte{9}) - encoded := cf.Encode() - got, err := Decode(encoded) - if err != nil { - t.Errorf("Expected no error, got %v", err) + testCases := []struct { + filter Filter + }{ + {NewFilter(Config{NumElements: 10})}, + {NewFilter(Config{NumElements: 10, Precision: Low})}, + {NewFilter(Config{NumElements: 10, Precision: High})}, } - if !cmp.Equal(cf, got, - cmp.AllowUnexported(filter[uint16]{})) { - t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded) + for _, tc := range testCases { + cf := tc.filter + cf.Insert([]byte{1}) + cf.Insert([]byte{2}) + cf.Insert([]byte{3}) + cf.Insert([]byte{4}) + cf.Insert([]byte{5}) + cf.Insert([]byte{6}) + cf.Insert([]byte{7}) + 
cf.Insert([]byte{8}) + cf.Insert([]byte{9}) + encoded := cf.Encode() + got, err := Decode(encoded) + if err != nil { + t.Errorf("Expected no error, got %v", err) + } + if !cmp.Equal(cf, got, + cmp.AllowUnexported(filter[uint8]{}, filter[uint16]{}, filter[uint32]{})) { + t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded) + } } } From e8be750ca767bbdf33635eb34d2965545d7922ab Mon Sep 17 00:00:00 2001 From: panmari Date: Thu, 18 Aug 2022 19:35:51 +0200 Subject: [PATCH 14/17] Small cleanup of Encode function. --- cuckoofilter.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuckoofilter.go b/cuckoofilter.go index b3ea2e8..a6748b7 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -170,11 +170,10 @@ func (cf *filter[T]) LoadFactor() float64 { return float64(cf.count) / float64(len(cf.buckets)*bucketSize) } -// Encode returns a Cuckoofilter encoded as a byte slice. func (cf *filter[T]) Encode() []byte { - res := bytes.NewBuffer(nil) + res := new(bytes.Buffer) bytesPerBucket := bucketSize * cf.fingerprintSizeBits / 8 - res.Grow(len(cf.buckets)*bytesPerBucket + 4) + res.Grow(len(cf.buckets)*bytesPerBucket + 1) binary.Write(res, binary.LittleEndian, uint8(cf.fingerprintSizeBits)) for _, b := range cf.buckets { for _, fp := range b { From b66c920fbabb66c416db4387db955984ee87403d Mon Sep 17 00:00:00 2001 From: panmari Date: Thu, 22 Sep 2022 20:04:05 +0200 Subject: [PATCH 15/17] Inlining bucket initialization. 
--- cuckoofilter.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cuckoofilter.go b/cuckoofilter.go index a6748b7..74552d3 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -66,29 +66,26 @@ func NewFilter(cfg Config) Filter { numBuckets := numBuckets(cfg.NumElements) switch cfg.Precision { case Low: - buckets := make([]bucket[uint8], numBuckets) return &filter[uint8]{ - buckets: buckets, + buckets: make([]bucket[uint8], numBuckets), count: 0, - bucketIndexMask: uint(len(buckets) - 1), + bucketIndexMask: uint(numBuckets - 1), fingerprintSizeBits: 8, maxFingerprintMinusOne: maxFingerprintMinusOne(8), } case High: - buckets := make([]bucket[uint32], numBuckets) return &filter[uint32]{ - buckets: buckets, + buckets: make([]bucket[uint32], numBuckets), count: 0, - bucketIndexMask: uint(len(buckets) - 1), + bucketIndexMask: uint(numBuckets - 1), fingerprintSizeBits: 32, maxFingerprintMinusOne: maxFingerprintMinusOne(32), } default: - buckets := make([]bucket[uint16], numBuckets) return &filter[uint16]{ - buckets: buckets, + buckets: make([]bucket[uint16], numBuckets), count: 0, - bucketIndexMask: uint(len(buckets) - 1), + bucketIndexMask: uint(numBuckets - 1), fingerprintSizeBits: 16, maxFingerprintMinusOne: maxFingerprintMinusOne(16), } From f768a1f8df2be3d932092f41d7922340458de8e9 Mon Sep 17 00:00:00 2001 From: panmari Date: Thu, 22 Sep 2022 20:24:15 +0200 Subject: [PATCH 16/17] Putting commented variant of alternative code into getAltIndex. Plus a tiny rename. 
--- util.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/util.go b/util.go index 9fb28a4..6fe59bc 100644 --- a/util.go +++ b/util.go @@ -10,6 +10,9 @@ func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint { // 0x5bd1e995 is the hash constant from MurmurHash2 const murmurConstant = 0x5bd1e995 hash := uint(fp) * murmurConstant + // bytes := make([]byte, binary.MaxVarintLen32) + // binary.PutUvarint(bytes, uint64(fp)) + // hash := uint(metro.Hash64(bytes, 1337)) return (i ^ hash) & bucketIndexMask } @@ -24,10 +27,10 @@ func getFingerprint[T fingerprintsize](hash, maxFingerprintMinusOne uint64, fing // getIndexAndFingerprint returns the primary bucket index and fingerprint to be used func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, maxFingerprintMinusOne uint64, fingerprintSize int) (uint, T) { hash := metro.Hash64(data, 1337) - f := getFingerprint[T](hash, maxFingerprintMinusOne, fingerprintSize) + fp := getFingerprint[T](hash, maxFingerprintMinusOne, fingerprintSize) // Use least significant bits for deriving index. i1 := uint(hash) & bucketIndexMask - return i1, f + return i1, fp } func getNextPow2(n uint64) uint { From b9b2432b54946d7a7b8006927eb459e072b0d2d0 Mon Sep 17 00:00:00 2001 From: panmari Date: Sat, 24 Sep 2022 15:01:52 +0200 Subject: [PATCH 17/17] Bumping testing matrix to only include supported go versions. --- .github/workflows/go.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index b6f5409..f99e60b 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - go: [ '1.15', '>=1.5 <2'] + go: [ '1.18', '>=1.18 <2'] steps: - uses: actions/checkout@v3