diff --git a/server/mvcc/backend/batch_tx_test.go b/server/mvcc/backend/batch_tx_test.go
index f0e224fb49e..2957b4d5094 100644
--- a/server/mvcc/backend/batch_tx_test.go
+++ b/server/mvcc/backend/batch_tx_test.go
@@ -15,11 +15,14 @@ package backend_test
 
 import (
+	"fmt"
+	"math/rand"
 	"reflect"
 	"testing"
 	"time"
 
 	"github.com/google/go-cmp/cmp"
+	bolt "go.etcd.io/bbolt"
 
 	"go.etcd.io/etcd/server/v3/mvcc/backend"
 	betesting "go.etcd.io/etcd/server/v3/mvcc/backend/testing"
@@ -239,24 +242,113 @@ func TestRangeAfterDeleteMatch(t *testing.T) {
 	tx.Unlock()
 	tx.Commit()
 
-	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), []byte("foo"), nil, 0)
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
 	checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo")}, [][]byte{[]byte("bar")})
 
 	tx.Lock()
 	tx.UnsafeDelete(buckets.Test, []byte("foo"))
 	tx.Unlock()
 
-	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), []byte("foo"), nil, 0)
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
 	checkForEach(t, b.BatchTx(), b.ReadTx(), nil, nil)
 }
 
-func checkRangeResponseMatch(t *testing.T, tx backend.BatchTx, rtx backend.ReadTx, key, endKey []byte, limit int64) {
+func TestRangeAfterUnorderedKeyWriteMatch(t *testing.T) {
+	b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
+	defer betesting.Close(t, b)
+
+	tx := b.BatchTx()
+	tx.Lock()
+	tx.UnsafeCreateBucket(buckets.Test)
+	tx.UnsafePut(buckets.Test, []byte("foo5"), []byte("bar5"))
+	tx.UnsafePut(buckets.Test, []byte("foo2"), []byte("bar2"))
+	tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar1"))
+	tx.UnsafePut(buckets.Test, []byte("foo3"), []byte("bar3"))
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar"))
+	tx.UnsafePut(buckets.Test, []byte("foo4"), []byte("bar4"))
+	tx.Unlock()
+
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 1)
+}
+
+func TestRangeAfterAlternatingBucketWriteMatch(t *testing.T) {
+	b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
+	defer betesting.Close(t, b)
+
+	tx := b.BatchTx()
+
+	tx.Lock()
+	tx.UnsafeCreateBucket(buckets.Key)
+	tx.UnsafeCreateBucket(buckets.Test)
+	tx.UnsafeSeqPut(buckets.Key, []byte("key1"), []byte("val1"))
+	tx.Unlock()
+
+	tx.Lock()
+	tx.UnsafeSeqPut(buckets.Key, []byte("key2"), []byte("val2"))
+	tx.Unlock()
+	tx.Commit()
+	// Only in the 2nd commit is buckets.Key removed from readBuffer.buckets.
+	// This makes sure to test the case where an empty writeBuffer bucket
+	// replaces the read buffer bucket.
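+	// (the replacement happens in txWriteBuffer.writeback, see tx_buffer.go)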
+	tx.Commit()
+	tx.Lock()
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar"))
+	tx.Unlock()
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Key, []byte("key"), []byte("key5"), 100)
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), []byte("foo3"), 1)
+}
+
+func TestRangeAfterOverwriteMatch(t *testing.T) {
+	b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
+	defer betesting.Close(t, b)
+	tx := b.BatchTx()
+	tx.Lock()
+	tx.UnsafeCreateBucket(buckets.Test)
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar2"))
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar0"))
+	tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar10"))
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar1"))
+	tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar11"))
+	tx.Unlock()
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), []byte("foo3"), 1)
+	checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo"), []byte("foo1")}, [][]byte{[]byte("bar1"), []byte("bar11")})
+}
+
+func TestRangeAfterOverwriteAndDeleteMatch(t *testing.T) {
+	b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
+	defer betesting.Close(t, b)
+
+	tx := b.BatchTx()
+
+	tx.Lock()
+	tx.UnsafeCreateBucket(buckets.Test)
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar2"))
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar0"))
+	tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar10"))
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar1"))
+	tx.UnsafePut(buckets.Test, []byte("foo1"), []byte("bar11"))
+	tx.Unlock()
+
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
+	checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo"), []byte("foo1")}, [][]byte{[]byte("bar1"), []byte("bar11")})
+
+	tx.Lock()
+	tx.UnsafePut(buckets.Test, []byte("foo"), []byte("bar3"))
+	tx.UnsafeDelete(buckets.Test, []byte("foo1"))
+	tx.Unlock()
+
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo"), nil, 0)
+	checkRangeResponseMatch(t, b.BatchTx(), b.ReadTx(), buckets.Test, []byte("foo1"), nil, 0)
+	checkForEach(t, b.BatchTx(), b.ReadTx(), [][]byte{[]byte("foo")}, [][]byte{[]byte("bar3")})
+}
+
+func checkRangeResponseMatch(t *testing.T, tx backend.BatchTx, rtx backend.ReadTx, bucket backend.Bucket, key, endKey []byte, limit int64) {
 	tx.Lock()
-	ks1, vs1 := tx.UnsafeRange(buckets.Test, key, endKey, limit)
+	ks1, vs1 := tx.UnsafeRange(bucket, key, endKey, limit)
 	tx.Unlock()
 
 	rtx.RLock()
-	ks2, vs2 := rtx.UnsafeRange(buckets.Test, key, endKey, limit)
+	ks2, vs2 := rtx.UnsafeRange(bucket, key, endKey, limit)
 	rtx.RUnlock()
 
 	if diff := cmp.Diff(ks1, ks2); diff != "" {
@@ -292,3 +384,86 @@ func checkUnsafeForEach(t *testing.T, tx backend.ReadTx, expectedKeys, expectedV
 		t.Errorf("values on transaction doesn't match expected, diff: %s", diff)
 	}
 }
+
+// runWriteback is used to test the txWriteBuffer.writeback function, which is called inside tx.Unlock().
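+// Each Lock/Unlock pair below therefore exercises exactly one writeback of a batch.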
+// The parameters are chosen based on defaultBatchLimit = 10000.
+func runWriteback(t testing.TB, kss, vss [][]string, isSeq bool) {
+	b, _ := betesting.NewTmpBackend(t, time.Hour, 10000)
+	defer betesting.Close(t, b)
+
+	tx := b.BatchTx()
+
+	tx.Lock()
+	tx.UnsafeCreateBucket(buckets.Test)
+	tx.UnsafeCreateBucket(buckets.Key)
+	tx.Unlock()
+	for i, ks := range kss {
+		vs := vss[i]
+		tx.Lock()
+		for j := 0; j < len(ks); j++ {
+			if isSeq {
+				tx.UnsafeSeqPut(buckets.Key, []byte(ks[j]), []byte(vs[j]))
+			} else {
+				tx.UnsafePut(buckets.Test, []byte(ks[j]), []byte(vs[j]))
+			}
+		}
+		tx.Unlock()
+	}
+}
+
+func BenchmarkWritebackSeqBatches1BatchSize10000(b *testing.B) { benchmarkWriteback(b, 1, 10000, true) }
+
+func BenchmarkWritebackSeqBatches10BatchSize1000(b *testing.B) { benchmarkWriteback(b, 10, 1000, true) }
+
+func BenchmarkWritebackSeqBatches100BatchSize100(b *testing.B) { benchmarkWriteback(b, 100, 100, true) }
+
+func BenchmarkWritebackSeqBatches1000BatchSize10(b *testing.B) { benchmarkWriteback(b, 1000, 10, true) }
+
+func BenchmarkWritebackNonSeqBatches1000BatchSize1(b *testing.B) {
+	// For non-sequential writes, the batch size is usually small: 1, or on the order of the cluster size.
+	benchmarkWriteback(b, 1000, 1, false)
+}
+
+func BenchmarkWritebackNonSeqBatches10000BatchSize1(b *testing.B) {
+	benchmarkWriteback(b, 10000, 1, false)
+}
+
+func BenchmarkWritebackNonSeqBatches100BatchSize10(b *testing.B) {
+	benchmarkWriteback(b, 100, 10, false)
+}
+
+func BenchmarkWritebackNonSeqBatches1000BatchSize10(b *testing.B) {
+	benchmarkWriteback(b, 1000, 10, false)
+}
+
+func benchmarkWriteback(b *testing.B, batches, batchSize int, isSeq bool) {
+	// kss and vss hold the keys and values to write, batches*batchSize entries in total.
+	var kss, vss [][]string
+	for i := 0; i < batches; i++ {
+		var ks, vs []string
+		for j := i * batchSize; j < (i+1)*batchSize; j++ {
+			k := fmt.Sprintf("key%d", j)
+			v := fmt.Sprintf("val%d", j)
+			ks = append(ks, k)
+			vs = append(vs, v)
+		}
+		if !isSeq {
+			// Make sure each batch is shuffled differently, but deterministically across test runs.
+			shuffleList(ks, i*batchSize)
+		}
+		kss = append(kss, ks)
+		vss = append(vss, vs)
+	}
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		runWriteback(b, kss, vss, isSeq)
+	}
+}
+
+func shuffleList(l []string, seed int) {
+	r := rand.New(rand.NewSource(int64(seed)))
+	for i := 0; i < len(l); i++ {
+		j := r.Intn(i + 1)
+		l[i], l[j] = l[j], l[i]
+	}
+}
diff --git a/server/mvcc/backend/tx_buffer.go b/server/mvcc/backend/tx_buffer.go
index 66740024836..b21f3b37307 100644
--- a/server/mvcc/backend/tx_buffer.go
+++ b/server/mvcc/backend/tx_buffer.go
@@ -50,7 +50,8 @@ func (txw *txWriteBuffer) put(bucket Bucket, k, v []byte) {
 }
 
 func (txw *txWriteBuffer) putSeq(bucket Bucket, k, v []byte) {
-	// TODO: Add (in tests?) verification whether k>b[len(b)]
+	// putSeq is only called for data in the Key bucket. The keys
+	// in the Key bucket should be monotonically increasing revisions.
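+	// Since these keys never repeat, writeback only needs to dedupe
+	// non-seq buckets before moving a write buffer into the read buffer.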
txw.putInternal(bucket, k, v) } @@ -80,6 +81,9 @@ func (txw *txWriteBuffer) writeback(txr *txReadBuffer) { rb, ok := txr.buckets[k] if !ok { delete(txw.buckets, k) + if seq, ok := txw.bucket2seq[k]; ok && !seq { + wb.dedupe() + } txr.buckets[k] = wb continue } @@ -148,7 +152,7 @@ func newBucketBuffer() *bucketBuffer { func (bb *bucketBuffer) Range(key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte) { f := func(i int) bool { return bytes.Compare(bb.buf[i].key, key) >= 0 } idx := sort.Search(bb.used, f) - if idx < 0 { + if idx < 0 || idx >= bb.used { return nil, nil } if len(endKey) == 0 { @@ -201,10 +205,15 @@ func (bb *bucketBuffer) merge(bbsrc *bucketBuffer) { if bytes.Compare(bb.buf[(bb.used-bbsrc.used)-1].key, bbsrc.buf[0].key) < 0 { return } + bb.dedupe() +} +// dedupe removes duplicates, using only newest update +func (bb *bucketBuffer) dedupe() { + if bb.used <= 1 { + return + } sort.Stable(bb) - - // remove duplicates, using only newest update widx := 0 for ridx := 1; ridx < bb.used; ridx++ { if !bytes.Equal(bb.buf[ridx].key, bb.buf[widx].key) { diff --git a/server/mvcc/backend/tx_buffer_test.go b/server/mvcc/backend/tx_buffer_test.go new file mode 100644 index 00000000000..2326614aff2 --- /dev/null +++ b/server/mvcc/backend/tx_buffer_test.go @@ -0,0 +1,71 @@ +// Copyright 2023 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package backend + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestDedupe(t *testing.T) { + tests := []struct { + name string + keys, vals, expectedKeys, expectedVals []string + }{ + { + name: "empty", + keys: []string{}, + vals: []string{}, + expectedKeys: []string{}, + expectedVals: []string{}, + }, + { + name: "single kv", + keys: []string{"key1"}, + vals: []string{"val1"}, + expectedKeys: []string{"key1"}, + expectedVals: []string{"val1"}, + }, + { + name: "duplicate key", + keys: []string{"key1", "key1"}, + vals: []string{"val1", "val2"}, + expectedKeys: []string{"key1"}, + expectedVals: []string{"val2"}, + }, + { + name: "unordered keys", + keys: []string{"key3", "key1", "key4", "key2", "key1", "key4"}, + vals: []string{"val1", "val5", "val3", "val4", "val2", "val6"}, + expectedKeys: []string{"key1", "key2", "key3", "key4"}, + expectedVals: []string{"val2", "val4", "val1", "val6"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + bb := &bucketBuffer{buf: make([]kv, 10), used: 0} + for i := 0; i < len(tt.keys); i++ { + bb.add([]byte(tt.keys[i]), []byte(tt.vals[i])) + } + bb.dedupe() + assert.Len(t, tt.expectedKeys, bb.used) + for i := 0; i < bb.used; i++ { + assert.Equal(t, bb.buf[i].key, []byte(tt.expectedKeys[i])) + assert.Equal(t, bb.buf[i].val, []byte(tt.expectedVals[i])) + } + }) + } +}