Skip to content

Commit

Permalink
Initial try to support a variety of byte sizes for cuckoofilter.
Browse files Browse the repository at this point in the history
Uses generics to switch transparently between uint8 and
uint16 for the fingerprint size.

Performance neutral according to benchmarks

~/goworkspace/bin/benchstat ~/master.benchstats ~/generics.benchstats
name             old time/op  new time/op  delta
Filter_Reset-4   10.0µs ± 0%   9.9µs ± 0%   ~     (p=1.000 n=1+1)
Filter_Insert-4  18.2µs ± 0%  18.0µs ± 0%   ~     (p=1.000 n=1+1)
Filter_Lookup-4  1.52µs ± 0%  1.51µs ± 0%   ~     (p=1.000 n=1+1)
  • Loading branch information
panmari committed Jan 16, 2022
1 parent b9d73bc commit 0d1d95c
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 55 deletions.
23 changes: 11 additions & 12 deletions bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,21 @@ import (
"fmt"
)

// fingerprint represents a single entry in a bucket.
type fingerprint uint16
// fingerprintsize is the type constraint listing the unsigned integer types
// that may be used to store a single fingerprint in a bucket.
type fingerprintsize interface {
uint8 | uint16 | uint32
}

// bucket keeps track of fingerprints hashing to the same index.
type bucket [bucketSize]fingerprint
type bucket[T fingerprintsize] [bucketSize]T

const (
nullFp = 0
bucketSize = 4
fingerprintSizeBits = 16
maxFingerprint = (1 << fingerprintSizeBits) - 1
nullFp = 0
bucketSize = 4
)

// insert a fingerprint into a bucket. Returns true if there was enough space and insertion succeeded.
// Note it allows inserting the same fingerprint multiple times.
func (b *bucket) insert(fp fingerprint) bool {
func (b *bucket[T]) insert(fp T) bool {
for i, tfp := range b {
if tfp == nullFp {
b[i] = fp
Expand All @@ -32,7 +31,7 @@ func (b *bucket) insert(fp fingerprint) bool {

// delete a fingerprint from a bucket.
// Returns true if the fingerprint was present and successfully removed.
func (b *bucket) delete(fp fingerprint) bool {
func (b *bucket[T]) delete(fp T) bool {
for i, tfp := range b {
if tfp == fp {
b[i] = nullFp
Expand All @@ -42,7 +41,7 @@ func (b *bucket) delete(fp fingerprint) bool {
return false
}

func (b *bucket) contains(needle fingerprint) bool {
func (b *bucket[T]) contains(needle T) bool {
for _, fp := range b {
if fp == needle {
return true
Expand All @@ -52,13 +51,13 @@ func (b *bucket) contains(needle fingerprint) bool {
}

// reset deletes all fingerprints in the bucket.
func (b *bucket) reset() {
// reset clears the bucket by overwriting every slot with nullFp.
func (b *bucket[T]) reset() {
	for slot := 0; slot < len(b); slot++ {
		b[slot] = nullFp
	}
}

func (b *bucket) String() string {
func (b *bucket[T]) String() string {
var buf bytes.Buffer
buf.WriteString("[")
for _, by := range b {
Expand Down
6 changes: 3 additions & 3 deletions bucket_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ import (
)

func TestBucket_Reset(t *testing.T) {
var bkt bucket
for i := fingerprint(0); i < bucketSize; i++ {
var bkt bucket[uint16]
for i := uint16(0); i < bucketSize; i++ {
bkt[i] = i
}
bkt.reset()

var want bucket
var want bucket[uint16]
if !reflect.DeepEqual(bkt, want) {
t.Errorf("bucket.reset() got %v, want %v", bkt, want)
}
Expand Down
77 changes: 49 additions & 28 deletions cuckoofilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,56 @@ import (
const maxCuckooKickouts = 500

// Filter is a probabilistic counter.
type Filter struct {
buckets []bucket
count uint
// Filter is a cuckoo filter whose buckets store fingerprints of type T.
type Filter[T fingerprintsize] struct {
// buckets holds the fingerprint storage; each bucket has bucketSize slots.
buckets []bucket[T]
// getFingerprint derives a fingerprint of type T from a 64-bit hash.
getFingerprint func(hash uint64) T
// count is the number of fingerprints currently stored; it is returned by Count.
count uint
// Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2,
// applying this mask mimics the operation x % len(buckets).
bucketIndexMask uint
}

// NewFilter returns a new cuckoofilter suitable for the given number of elements.
// When inserting more elements, insertion speed will drop significantly and insertions might fail altogether.
// A capacity of 1000000 is a normal default, which allocates
// about ~2MB on 64-bit machines.
func NewFilter(numElements uint) *Filter {
func numBuckets(numElements uint) uint {
numBuckets := getNextPow2(uint64(numElements / bucketSize))
if float64(numElements)/float64(numBuckets*bucketSize) > 0.96 {
numBuckets <<= 1
}
if numBuckets == 0 {
numBuckets = 1
}
buckets := make([]bucket, numBuckets)
return &Filter{
return numBuckets
}

// NewFilter builds a cuckoofilter with 16-bit fingerprints, sized for the
// given number of elements. Inserting more elements than that slows insertion
// down significantly and may make insertions fail altogether.
// A capacity of 1000000 is a normal default, which allocates
// roughly 2MB on 64-bit machines.
func NewFilter(numElements uint) *Filter[uint16] {
	cf := new(Filter[uint16]) // count starts at its zero value, 0.
	cf.buckets = make([]bucket[uint16], numBuckets(numElements))
	// The mask is derived from the allocated bucket slice, not from numElements.
	cf.bucketIndexMask = uint(len(cf.buckets) - 1)
	cf.getFingerprint = getFinterprintUint16
	return cf
}

// NewFilterLowPrecision behaves like NewFilter but stores 8-bit fingerprints,
// so the returned filter uses half the memory at the price of lower precision.
func NewFilterLowPrecision(numElements uint) *Filter[uint8] {
	cf := new(Filter[uint8]) // count starts at its zero value, 0.
	cf.buckets = make([]bucket[uint8], numBuckets(numElements))
	// The mask is derived from the allocated bucket slice, not from numElements.
	cf.bucketIndexMask = uint(len(cf.buckets) - 1)
	cf.getFingerprint = getFinterprintUint8
	return cf
}


// Lookup returns true if data is in the filter.
func (cf *Filter) Lookup(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
func (cf *Filter[T]) Lookup(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint)
if b := cf.buckets[i1]; b.contains(fp) {
return true
}
Expand All @@ -51,7 +70,7 @@ func (cf *Filter) Lookup(data []byte) bool {
}

// Reset removes all items from the filter, setting count to 0.
func (cf *Filter) Reset() {
func (cf *Filter[T]) Reset() {
for i := range cf.buckets {
cf.buckets[i].reset()
}
Expand All @@ -62,8 +81,8 @@ func (cf *Filter) Reset() {
// * Might return false negatives
// * Deletes are not guaranteed to work
// To increase success rate of inserts, create a larger filter.
func (cf *Filter) Insert(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
func (cf *Filter[T]) Insert(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint)
if cf.insert(fp, i1) {
return true
}
Expand All @@ -74,15 +93,15 @@ func (cf *Filter) Insert(data []byte) bool {
return cf.reinsert(fp, randi(i1, i2))
}

func (cf *Filter) insert(fp fingerprint, i uint) bool {
// insert tries to store fp in bucket i and bumps count on success.
// It returns false, leaving count untouched, when the bucket rejects fp.
func (cf *Filter[T]) insert(fp T, i uint) bool {
	if !cf.buckets[i].insert(fp) {
		return false
	}
	cf.count++
	return true
}

func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
func (cf *Filter[T]) reinsert(fp T, i uint) bool {
for k := 0; k < maxCuckooKickouts; k++ {
j := rand.Intn(bucketSize)
// Swap fingerprint with bucket entry.
Expand All @@ -98,13 +117,13 @@ func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
}

// Delete data from the filter. Returns true if the data was found and deleted.
func (cf *Filter) Delete(data []byte) bool {
i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
// Delete removes data from the filter, trying the primary bucket first and
// the alternate bucket second. It returns true if the data was found and deleted.
func (cf *Filter[T]) Delete(data []byte) bool {
	primary, fp := getIndexAndFingerprint(data, cf.bucketIndexMask, cf.getFingerprint)
	alternate := getAltIndex(fp, primary, cf.bucketIndexMask)
	if cf.delete(fp, primary) {
		return true
	}
	return cf.delete(fp, alternate)
}

func (cf *Filter) delete(fp fingerprint, i uint) bool {
func (cf *Filter[T]) delete(fp T, i uint) bool {
if cf.buckets[i].delete(fp) {
cf.count--
return true
Expand All @@ -113,19 +132,20 @@ func (cf *Filter) delete(fp fingerprint, i uint) bool {
}

// Count returns the number of items in the filter.
func (cf *Filter) Count() uint {
// Count reports cf.count, the running total that insert increments and delete decrements.
func (cf *Filter[T]) Count() uint {
return cf.count
}

// LoadFactor returns the fraction of slots that are occupied.
func (cf *Filter) LoadFactor() float64 {
// LoadFactor returns the fraction of slots that are occupied: the stored item
// count divided by the total number of slots across all buckets.
func (cf *Filter[T]) LoadFactor() float64 {
	totalSlots := len(cf.buckets) * bucketSize
	occupied := float64(cf.count)
	return occupied / float64(totalSlots)
}

const bytesPerBucket = bucketSize * fingerprintSizeBits / 8
// TODO(panmari): Size of fingerprint needs to be derived from type. Currently hardcoded to 16 for uint16.
const bytesPerBucket = bucketSize * 16 / 8

// Encode returns a byte slice representing a Cuckoofilter.
func (cf *Filter) Encode() []byte {
func (cf *Filter[T]) Encode() []byte {
bytes := make([]byte, 0, len(cf.buckets)*bytesPerBucket)
for _, b := range cf.buckets {
for _, f := range b {
Expand All @@ -138,7 +158,7 @@ func (cf *Filter) Encode() []byte {
}

// Decode returns a Cuckoofilter from a byte slice created using Encode.
func Decode(bytes []byte) (*Filter, error) {
func Decode(bytes []byte) (*Filter[uint16], error) {
if len(bytes)%bucketSize != 0 {
return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(bytes))
}
Expand All @@ -150,21 +170,22 @@ func Decode(bytes []byte) (*Filter, error) {
return nil, fmt.Errorf("numBuckets must to be a power of 2, got %d", numBuckets)
}
var count uint
buckets := make([]bucket, numBuckets)
buckets := make([]bucket[uint16], numBuckets)
for i, b := range buckets {
for j := range b {
var next []byte
next, bytes = bytes[:2], bytes[2:]

if fp := fingerprint(binary.LittleEndian.Uint16(next)); fp != 0 {
if fp := binary.LittleEndian.Uint16(next); fp != 0 {
buckets[i][j] = fp
count++
}
}
}
return &Filter{
return &Filter[uint16]{
buckets: buckets,
count: count,
bucketIndexMask: uint(len(buckets) - 1),
getFingerprint: getFinterprintUint16,
}, nil
}
6 changes: 4 additions & 2 deletions cuckoofilter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ import (
"io"
"math"
"os"
"reflect"
"testing"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
)

// optFloatNear considers float64 as equal if the relative delta is small.
Expand Down Expand Up @@ -231,7 +231,9 @@ func TestEncodeDecode(t *testing.T) {
if err != nil {
t.Errorf("Expected no error, got %v", err)
}
if !reflect.DeepEqual(cf, got) {
if !cmp.Equal(cf, got,
cmp.AllowUnexported(Filter[uint16]{}),
cmpopts.IgnoreFields(Filter[uint16]{}, "getFingerprint")) {
t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded)
}
}
2 changes: 1 addition & 1 deletion example_threadsafe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (

// Small wrapper around cuckoo filter making it thread safe.
type threadSafeFilter struct {
cf *cuckoo.Filter
cf *cuckoo.Filter[uint16]
mu sync.RWMutex
}

Expand Down
6 changes: 4 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
module github.com/panmari/cuckoofilter

go 1.15
go 1.18

require (
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165
github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446
github.com/google/go-cmp v0.5.2
)

require golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 // indirect
5 changes: 3 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E=
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446 h1:QnWGyQI3H080vbC9E4jlr6scOYEnALtvV/69oATYzOo=
github.com/dgryski/go-metro v0.0.0-20211015221634-2661b20a2446/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
github.com/google/go-cmp v0.5.2 h1:X2ev0eStA3AbceY54o37/0PQ/UWqKEiiO2dKL5OPaFM=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
20 changes: 16 additions & 4 deletions util.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,35 @@ func randi(i1, i2 uint) uint {
return i2
}

func getAltIndex(fp fingerprint, i uint, bucketIndexMask uint) uint {
// getAltIndex returns the other candidate bucket index for fingerprint fp,
// given that i is one of its bucket indices.
//
// The index is XORed with a hash of the fingerprint and then masked, so for
// any i already within bucketIndexMask, applying the function to its own
// result with the same fp yields i again.
//
// NOTE(review): fp is narrowed to uint16 before hashing, so for a uint32
// fingerprint only the low 16 bits influence the alternate index.
func getAltIndex[T fingerprintsize](fp T, i uint, bucketIndexMask uint) uint {
	var raw [2]byte
	binary.LittleEndian.PutUint16(raw[:], uint16(fp))
	fpHash := uint(metro.Hash64(raw[:], 1337))
	return (i ^ fpHash) & bucketIndexMask
}

func getFingerprint(hash uint64) fingerprint {
// getFinterprintUint16 derives a 16-bit fingerprint from a 64-bit hash.
//
// The 16 most significant bits of hash are reduced modulo maxFingerprint-1
// and then incremented, so the result is always in [1, maxFingerprint-1].
// It is never 0, which is reserved as the empty-slot marker.
func getFinterprintUint16(hash uint64) uint16 {
	const fingerprintSizeBits = 16
	const maxFingerprint = (1 << fingerprintSizeBits) - 1
	// Use most significant bits for fingerprint.
	shifted := hash >> (64 - fingerprintSizeBits)
	// The modulus keeps the value in [0, maxFingerprint-2]; adding 1 moves it
	// to [1, maxFingerprint-1], leaving 0 as the special empty state.
	fp := shifted%(maxFingerprint-1) + 1
	return uint16(fp)
}

// getFinterprintUint8 derives an 8-bit fingerprint from a 64-bit hash.
//
// The 8 most significant bits of hash are reduced modulo maxFingerprint-1
// and then incremented, so the result is always in [1, maxFingerprint-1].
// It is never 0, which is reserved as the empty-slot marker.
func getFinterprintUint8(hash uint64) uint8 {
	const fingerprintSizeBits = 8
	const maxFingerprint = (1 << fingerprintSizeBits) - 1
	// NOTE(panmari, review comment of Jul 11, 2022): Instead, do only one
	// function and pass fingerprintSizeBits as a parameter. This is cheaper
	// than a dynamic function call.

	// Use most significant bits for fingerprint.
	shifted := hash >> (64 - fingerprintSizeBits)
	// The modulus keeps the value in [0, maxFingerprint-2]; adding 1 moves it
	// to [1, maxFingerprint-1], leaving 0 as the special empty state.
	fp := shifted%(maxFingerprint-1) + 1
	return uint8(fp)
}

// getIndexAndFingerprint returns the primary bucket index and fingerprint to be used
func getIndexAndFingerprint(data []byte, bucketIndexMask uint) (uint, fingerprint) {
func getIndexAndFingerprint[T fingerprintsize](data []byte, bucketIndexMask uint, getFingerprint func(uint64) T) (uint, T) {
hash := metro.Hash64(data, 1337)
f := getFingerprint(hash)
// Use least significant bits for deriving index.
Expand Down
4 changes: 3 additions & 1 deletion util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import (
func TestIndexAndFP(t *testing.T) {
data := []byte("seif")
numBuckets := uint(1024)
i1, fp := getIndexAndFingerprint(data, numBuckets)
i1, fp := getIndexAndFingerprint(data, numBuckets, func(in uint64) uint16 {
return 2
})
i2 := getAltIndex(fp, i1, numBuckets)
i11 := getAltIndex(fp, i2, numBuckets)
i22 := getAltIndex(fp, i1, numBuckets)
Expand Down

0 comments on commit 0d1d95c

Please sign in to comment.