From 1d175079a8d094eaee2e1e09e181ce69d82def7b Mon Sep 17 00:00:00 2001 From: Thomas Gosteli Date: Wed, 6 Nov 2024 11:47:18 +0100 Subject: [PATCH] fix(defrag): handle defragdb failure Signed-off-by: Thomas Gosteli --- server/mvcc/backend/backend.go | 12 +++++- tests/e2e/defrag_no_space_test.go | 62 +++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/server/mvcc/backend/backend.go b/server/mvcc/backend/backend.go index 077146b82ee..d04126d94ae 100644 --- a/server/mvcc/backend/backend.go +++ b/server/mvcc/backend/backend.go @@ -490,8 +490,8 @@ func (b *backend) defrag() error { options = *boltOpenOptions } options.OpenFile = func(_ string, _ int, _ os.FileMode) (file *os.File, err error) { - // gofail: var defragNoSpace string - // return nil, fmt.Errorf(defragNoSpace) + // gofail: var defragOpenFileError string + // return nil, fmt.Errorf(defragOpenFileError) return temp, nil } // Don't load tmp db into memory regardless of opening options @@ -526,6 +526,11 @@ func (b *backend) defrag() error { if rmErr := os.RemoveAll(tmpdb.Path()); rmErr != nil { b.lg.Error("failed to remove db.tmp after defragmentation completed", zap.Error(rmErr)) } + + // restore the bbolt transactions if defragmentation fails + b.batchTx.tx = b.unsafeBegin(true) + b.readTx.tx = b.unsafeBegin(false) + return err } @@ -578,6 +583,9 @@ func (b *backend) defrag() error { } func defragdb(odb, tmpdb *bolt.DB, limit int) error { + // gofail: var defragdbFail string + // return fmt.Errorf(defragdbFail) + // open a tx on tmpdb for writes tmptx, err := tmpdb.Begin(true) if err != nil { diff --git a/tests/e2e/defrag_no_space_test.go b/tests/e2e/defrag_no_space_test.go index 810136f156e..f6ceabe667b 100644 --- a/tests/e2e/defrag_no_space_test.go +++ b/tests/e2e/defrag_no_space_test.go @@ -16,6 +16,7 @@ package e2e import ( "context" + "fmt" "testing" "time" @@ -26,24 +27,45 @@ import ( ) func TestDefragNoSpace(t *testing.T) { - e2e.BeforeTest(t) - - clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t, - e2e.WithClusterSize(1), - e2e.WithGoFailEnabled(true), - ) - require.NoError(t, err) - t.Cleanup(func() { clus.Stop() }) - - member := clus.Procs[0] - - require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), "defragNoSpace", `return("no space")`)) - require.ErrorContains(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}), "no space") - - // Make sure etcd continues to run even after the failed defrag attempt - require.NoError(t, member.Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})) - value, err := member.Etcdctl().Get(context.Background(), "foo", config.GetOptions{}) - require.NoError(t, err) - require.Len(t, value.Kvs, 1) - require.Equal(t, "bar", string(value.Kvs[0].Value)) + tests := []struct { + name string + failpoint string + err string + }{ + { + name: "no space (#18810) - can't open/create new bbolt db", + failpoint: "defragOpenFileError", + err: "no space", + }, + { + name: "defragdb failure", + failpoint: "defragdbFail", + err: "some random error", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + e2e.BeforeTest(t) + + clus, err := e2e.NewEtcdProcessCluster(context.TODO(), t, + e2e.WithClusterSize(1), + e2e.WithGoFailEnabled(true), + ) + require.NoError(t, err) + t.Cleanup(func() { clus.Stop() }) + + member := clus.Procs[0] + + require.NoError(t, member.Failpoints().SetupHTTP(context.Background(), tc.failpoint, fmt.Sprintf(`return("%s")`, tc.err))) + require.ErrorContains(t, member.Etcdctl().Defragment(context.Background(), config.DefragOption{Timeout: time.Minute}), tc.err) + + // Make sure etcd continues to run even after the failed defrag attempt + require.NoError(t, member.Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})) + value, err := member.Etcdctl().Get(context.Background(), "foo", config.GetOptions{}) + require.NoError(t, err) + require.Len(t, value.Kvs, 1) + require.Equal(t, "bar", string(value.Kvs[0].Value)) + }) + } }