From e8ae83fac471daea85e3cfa8d2011a6562932120 Mon Sep 17 00:00:00 2001
From: Marek Siarkowicz
Date: Tue, 17 Oct 2023 12:40:17 +0200
Subject: [PATCH 1/2] Add e2e tests for livez readyz

Signed-off-by: Chao Chen
---
 tests/e2e/http_health_check_test.go | 179 ++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)

diff --git a/tests/e2e/http_health_check_test.go b/tests/e2e/http_health_check_test.go
index 8007b48f634..0e74a562b4e 100644
--- a/tests/e2e/http_health_check_test.go
+++ b/tests/e2e/http_health_check_test.go
@@ -21,10 +21,14 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"path"
 	"strings"
 	"testing"
 	"time"
 
+	"go.etcd.io/etcd/api/v3/etcdserverpb"
+	"go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
+
 	"github.com/stretchr/testify/require"
 
 	"go.etcd.io/etcd/tests/v3/framework/config"
@@ -158,6 +162,147 @@ func TestHTTPHealthHandler(t *testing.T) {
 	}
 }
 
+func TestHTTPLivezReadyzHandler(t *testing.T) {
+	e2e.BeforeTest(t)
+	client := &http.Client{}
+	tcs := []struct {
+		name           string
+		injectFailure  func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster)
+		clusterOptions []e2e.EPClusterOption
+		healthChecks   []healthCheckConfig
+	}{
+		{
+			name:           "no failures", // happy case
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1)},
+			healthChecks: []healthCheckConfig{
+				{
+					url:                "/livez",
+					expectedStatusCode: http.StatusOK,
+				},
+				{
+					url:                "/readyz",
+					expectedStatusCode: http.StatusOK,
+				},
+			},
+		},
+		{
+			name:           "activated no space alarm",
+			injectFailure:  triggerNoSpaceAlarm,
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithQuotaBackendBytes(int64(13 * os.Getpagesize()))},
+			healthChecks: []healthCheckConfig{
+				{
+					url:                "/livez",
+					expectedStatusCode: http.StatusOK,
+				},
+				{
+					url:                "/readyz",
+					expectedStatusCode: http.StatusOK,
+				},
+			},
+		},
+		{
+			name:           "overloaded server slow apply",
+			injectFailure:  triggerSlowApply,
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)},
+			healthChecks: []healthCheckConfig{
+				{
+					url:                "/livez",
+					expectedStatusCode: http.StatusOK,
+				},
+				{
+					url:                "/readyz",
+					expectedStatusCode: http.StatusOK,
+				},
+			},
+		},
+		{
+			name:           "network partitioned",
+			injectFailure:  blackhole,
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithIsPeerTLS(true), e2e.WithPeerProxy(true)},
+			healthChecks: []healthCheckConfig{
+				{
+					url:                "/livez",
+					expectedStatusCode: http.StatusOK,
+				},
+				{
+					url:                "/readyz",
+					expectedStatusCode: http.StatusOK,
+				},
+			},
+		},
+		{
+			name:           "raft loop deadlock",
+			injectFailure:  triggerRaftLoopDeadLock,
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithGoFailEnabled(true)},
+			healthChecks: []healthCheckConfig{
+				{
+					// current kubeadm etcd liveness check failed to detect raft loop deadlock in steady state
+					// ref. https://github.com/kubernetes/kubernetes/blob/master/cmd/kubeadm/app/phases/etcd/local.go#L225-L226
+					// current liveness probe depends on the etcd /health check has a flaw that new /livez check should resolve.
+					url:                "/livez",
+					expectedStatusCode: http.StatusOK,
+				},
+				{
+					url:                "/readyz",
+					expectedStatusCode: http.StatusOK,
+				},
+			},
+		},
+		// verify that auth enabled serializable read must go through mvcc
+		{
+			name:           "slow buffer write back with auth enabled",
+			injectFailure:  triggerSlowBufferWriteBackWithAuth,
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithGoFailEnabled(true)},
+			healthChecks: []healthCheckConfig{
+				{
+					url:                  "/livez",
+					expectedTimeoutError: true,
+				},
+				{
+					url:                "/readyz",
+					expectedStatusCode: http.StatusOK,
+				},
+			},
+		},
+		{
+			name:           "corrupt",
+			injectFailure:  triggerCorrupt,
+			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithCorruptCheckTime(time.Second)},
+			healthChecks: []healthCheckConfig{
+				{
+					url:                "/livez",
+					expectedStatusCode: http.StatusOK,
+				},
+				{
+					url:                "/readyz",
+					expectedStatusCode: http.StatusServiceUnavailable,
+				},
+			},
+		},
+	}
+
+	for _, tc := range tcs {
+		t.Run(tc.name, func(t *testing.T) {
+			ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
+			defer cancel()
+			clus, err := e2e.NewEtcdProcessCluster(ctx, t, tc.clusterOptions...)
+			require.NoError(t, err)
+			defer clus.Close()
+			testutils.ExecuteUntil(ctx, t, func() {
+				if tc.injectFailure != nil {
+					tc.injectFailure(ctx, t, clus)
+				}
+
+				for _, hc := range tc.healthChecks {
+					requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
+					t.Logf("health check URL is %s", requestURL)
+					doHealthCheckAndVerify(t, client, requestURL, hc.expectedStatusCode, hc.expectedTimeoutError)
+				}
+			})
+		})
+	}
+}
+
 func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expectStatusCode int, expectTimeoutError bool) {
 	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
 	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
 	require.NoErrorf(t, err, "failed to creat request %+v", err)
 	resp, herr := client.Do(req)
@@ -224,3 +369,37 @@ func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus
 	require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeWritebackBuf", `sleep("3s")`))
 	clus.Procs[0].Etcdctl(e2e.WithAuth("root", "root")).Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: 200 * time.Millisecond})
 }
+
+func triggerCorrupt(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
+	etcdctl := clus.Procs[0].Etcdctl()
+	for i := 0; i < 10; i++ {
+		err := etcdctl.Put(ctx, "foo", "bar", config.PutOptions{})
+		require.NoError(t, err)
+	}
+	err := clus.Procs[0].Kill()
+	require.NoError(t, err)
+	err = clus.Procs[0].Wait(ctx)
+	require.NoError(t, err)
+	err = testutil.CorruptBBolt(path.Join(clus.Procs[0].Config().DataDirPath, "member", "snap", "db"))
+	require.NoError(t, err)
+	err = clus.Procs[0].Start(ctx)
+	for {
+		time.Sleep(time.Second)
+		select {
+		case <-ctx.Done():
+			require.NoError(t, err)
+		default:
+		}
+		response, err := etcdctl.AlarmList(ctx)
+		if err != nil {
+			continue
+		}
+		if len(response.Alarms) == 0 {
+			continue
+		}
+		require.Len(t, response.Alarms, 1)
+		if response.Alarms[0].Alarm == etcdserverpb.AlarmType_CORRUPT {
+			break
+		}
+	}
+}

From 42d9e43e5ff896bdacee5f7203ee4286bfa28feb Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Thu, 26 Oct 2023 11:12:07 -0700
Subject: [PATCH 2/2] tests/e2e: enhance livez readyz e2e tests

Signed-off-by: Chao Chen
---
 tests/e2e/http_health_check_test.go | 171 ++++++++++++++--------------
 1 file changed, 88 insertions(+), 83 deletions(-)

diff --git a/tests/e2e/http_health_check_test.go b/tests/e2e/http_health_check_test.go
index 0e74a562b4e..9cc090f2395 100644
--- a/tests/e2e/http_health_check_test.go
+++ b/tests/e2e/http_health_check_test.go
@@ -18,6 +18,7 @@ package e2e
 
 import (
 	"context"
+	"fmt"
 	"io"
 	"net/http"
 	"os"
@@ -26,28 +27,35 @@ import (
 	"testing"
 	"time"
 
-	"go.etcd.io/etcd/api/v3/etcdserverpb"
-	"go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
-
 	"github.com/stretchr/testify/require"
 
+	"go.etcd.io/etcd/api/v3/etcdserverpb"
+	"go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
 	"go.etcd.io/etcd/tests/v3/framework/config"
 	"go.etcd.io/etcd/tests/v3/framework/e2e"
 	"go.etcd.io/etcd/tests/v3/framework/testutils"
 )
 
+const (
+	healthCheckTimeout = 2 * time.Second
+	putCommandTimeout  = 200 * time.Millisecond
+)
+
 type healthCheckConfig struct {
-	url                  string
-	expectedStatusCode   int
-	expectedTimeoutError bool
+	url                    string
+	expectedStatusCode     int
+	expectedTimeoutError   bool
+	expectedRespSubStrings []string
 }
 
+type injectFailure func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration)
+
 func TestHTTPHealthHandler(t *testing.T) {
 	e2e.BeforeTest(t)
 	client := &http.Client{}
 	tcs := []struct {
 		name           string
-		injectFailure  func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster)
+		injectFailure  injectFailure
 		clusterOptions []e2e.EPClusterOption
 		healthChecks   []healthCheckConfig
 	}{
@@ -149,104 +157,91 @@ func TestHTTPHealthHandler(t *testing.T) {
 			defer clus.Close()
 			testutils.ExecuteUntil(ctx, t, func() {
 				if tc.injectFailure != nil {
-					tc.injectFailure(ctx, t, clus)
+					// guaranteed that failure point is active until all the health checks timeout.
+					duration := time.Duration(len(tc.healthChecks)+1) * healthCheckTimeout
+					tc.injectFailure(ctx, t, clus, duration)
 				}
 
 				for _, hc := range tc.healthChecks {
 					requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
 					t.Logf("health check URL is %s", requestURL)
-					doHealthCheckAndVerify(t, client, requestURL, hc.expectedStatusCode, hc.expectedTimeoutError)
+					doHealthCheckAndVerify(t, client, requestURL, hc.expectedTimeoutError, hc.expectedStatusCode, hc.expectedRespSubStrings)
 				}
 			})
 		})
 	}
 }
 
+var (
+	defaultHealthCheckConfigs = []healthCheckConfig{
+		{
+			url:                    "/livez",
+			expectedStatusCode:     http.StatusOK,
+			expectedRespSubStrings: []string{`ok`},
+		},
+		{
+			url:                    "/readyz",
+			expectedStatusCode:     http.StatusOK,
+			expectedRespSubStrings: []string{`ok`},
+		},
+		{
+			url:                    "/livez?verbose=true",
+			expectedStatusCode:     http.StatusOK,
+			expectedRespSubStrings: []string{`[+]serializable_read ok`},
+		},
+		{
+			url:                "/readyz?verbose=true",
+			expectedStatusCode: http.StatusOK,
+			expectedRespSubStrings: []string{
+				`[+]serializable_read ok`,
+				`[+]data_corruption ok`,
+			},
+		},
+	}
+)
+
 func TestHTTPLivezReadyzHandler(t *testing.T) {
 	e2e.BeforeTest(t)
 	client := &http.Client{}
 	tcs := []struct {
 		name           string
-		injectFailure  func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster)
+		injectFailure  injectFailure
 		clusterOptions []e2e.EPClusterOption
 		healthChecks   []healthCheckConfig
 	}{
 		{
 			name:           "no failures", // happy case
 			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1)},
-			healthChecks: []healthCheckConfig{
-				{
-					url:                "/livez",
-					expectedStatusCode: http.StatusOK,
-				},
-				{
-					url:                "/readyz",
-					expectedStatusCode: http.StatusOK,
-				},
-			},
+			healthChecks:   defaultHealthCheckConfigs,
 		},
 		{
 			name:           "activated no space alarm",
 			injectFailure:  triggerNoSpaceAlarm,
 			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithQuotaBackendBytes(int64(13 * os.Getpagesize()))},
-			healthChecks: []healthCheckConfig{
-				{
-					url:                "/livez",
-					expectedStatusCode: http.StatusOK,
-				},
-				{
-					url:                "/readyz",
-					expectedStatusCode: http.StatusOK,
-				},
-			},
+			healthChecks:   defaultHealthCheckConfigs,
 		},
+		// Readiness is not an indicator of performance. Slow response is not covered by readiness.
+		// refer to https://tinyurl.com/livez-readyz-design-doc or https://github.com/etcd-io/etcd/issues/16007#issuecomment-1726541091 in case tinyurl is down.
 		{
 			name:           "overloaded server slow apply",
 			injectFailure:  triggerSlowApply,
 			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)},
-			healthChecks: []healthCheckConfig{
-				{
-					url:                "/livez",
-					expectedStatusCode: http.StatusOK,
-				},
-				{
-					url:                "/readyz",
-					expectedStatusCode: http.StatusOK,
-				},
-			},
+			healthChecks:   defaultHealthCheckConfigs,
 		},
 		{
 			name:           "network partitioned",
 			injectFailure:  blackhole,
 			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithIsPeerTLS(true), e2e.WithPeerProxy(true)},
-			healthChecks: []healthCheckConfig{
-				{
-					url:                "/livez",
-					expectedStatusCode: http.StatusOK,
-				},
-				{
-					url:                "/readyz",
-					expectedStatusCode: http.StatusOK,
-				},
-			},
+			// TODO expected behavior of readyz check should be 503 or timeout after ReadIndex check is implemented.
+			healthChecks: defaultHealthCheckConfigs,
 		},
 		{
 			name:           "raft loop deadlock",
 			injectFailure:  triggerRaftLoopDeadLock,
 			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithGoFailEnabled(true)},
-			healthChecks: []healthCheckConfig{
-				{
-					// current kubeadm etcd liveness check failed to detect raft loop deadlock in steady state
-					// ref. https://github.com/kubernetes/kubernetes/blob/master/cmd/kubeadm/app/phases/etcd/local.go#L225-L226
-					// current liveness probe depends on the etcd /health check has a flaw that new /livez check should resolve.
-					url:                "/livez",
-					expectedStatusCode: http.StatusOK,
-				},
-				{
-					url:                "/readyz",
-					expectedStatusCode: http.StatusOK,
-				},
-			},
+			// TODO expected behavior of livez check should be 503 or timeout after RaftLoopDeadLock check is implemented.
+			// TODO expected behavior of readyz check should be 503 or timeout after ReadIndex check is implemented.
+			healthChecks: defaultHealthCheckConfigs,
 		},
 		// verify that auth enabled serializable read must go through mvcc
 		{
@@ -259,8 +254,8 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
 					expectedTimeoutError: true,
 				},
 				{
-					url:                "/readyz",
-					expectedStatusCode: http.StatusOK,
+					url:                  "/readyz",
+					expectedTimeoutError: true,
 				},
 			},
 		},
@@ -270,12 +265,17 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
 			clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithCorruptCheckTime(time.Second)},
 			healthChecks: []healthCheckConfig{
 				{
-					url:                "/livez",
-					expectedStatusCode: http.StatusOK,
+					url:                    "/livez?verbose=true",
+					expectedStatusCode:     http.StatusOK,
+					expectedRespSubStrings: []string{`[+]serializable_read ok`},
 				},
 				{
 					url:                "/readyz",
 					expectedStatusCode: http.StatusServiceUnavailable,
+					expectedRespSubStrings: []string{
+						`[+]serializable_read ok`,
+						`[-]data_corruption failed: alarm activated: CORRUPT`,
+					},
 				},
 			},
 		},
@@ -290,21 +290,23 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
 			defer clus.Close()
 			testutils.ExecuteUntil(ctx, t, func() {
 				if tc.injectFailure != nil {
-					tc.injectFailure(ctx, t, clus)
+					// guaranteed that failure point is active until all the health checks timeout.
+					duration := time.Duration(len(tc.healthChecks)+1) * healthCheckTimeout
+					tc.injectFailure(ctx, t, clus, duration)
 				}
 
 				for _, hc := range tc.healthChecks {
 					requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
 					t.Logf("health check URL is %s", requestURL)
-					doHealthCheckAndVerify(t, client, requestURL, hc.expectedStatusCode, hc.expectedTimeoutError)
+					doHealthCheckAndVerify(t, client, requestURL, hc.expectedTimeoutError, hc.expectedStatusCode, hc.expectedRespSubStrings)
 				}
 			})
 		})
 	}
 }
 
-func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expectStatusCode int, expectTimeoutError bool) {
-	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expectTimeoutError bool, expectStatusCode int, expectRespSubStrings []string) {
+	ctx, cancel := context.WithTimeout(context.Background(), healthCheckTimeout)
 	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
 	require.NoErrorf(t, err, "failed to creat request %+v", err)
 	resp, herr := client.Do(req)
@@ -321,11 +323,14 @@ func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expec
 	resp.Body.Close()
 	require.NoErrorf(t, err, "failed to read response %+v", err)
 
-	t.Logf("health check response body is: %s", body)
+	t.Logf("health check response body is:\n%s", body)
 	require.Equal(t, expectStatusCode, resp.StatusCode)
+	for _, expectRespSubString := range expectRespSubStrings {
+		require.Contains(t, string(body), expectRespSubString)
+	}
 }
 
-func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
+func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
 	buf := strings.Repeat("b", os.Getpagesize())
 	etcdctl := clus.Etcdctl()
 	for {
@@ -338,14 +343,14 @@ func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProces
 	}
 }
 
-func triggerSlowApply(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
+func triggerSlowApply(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
 	// the following proposal will be blocked at applying stage
 	// because when apply index < committed index, linearizable read would time out.
-	require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeApplyOneEntryNormal", `sleep("3s")`))
+	require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeApplyOneEntryNormal", fmt.Sprintf(`sleep("%s")`, duration)))
 	require.NoError(t, clus.Procs[1].Etcdctl().Put(ctx, "foo", "bar", config.PutOptions{}))
 }
 
-func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
+func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
 	member := clus.Procs[0]
 	proxy := member.PeerProxy()
 	t.Logf("Blackholing traffic from and to member %q", member.Config().Name)
@@ -353,12 +358,12 @@ func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
 	proxy.BlackholeRx()
 }
 
-func triggerRaftLoopDeadLock(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
-	require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "raftBeforeSave", `sleep("3s")`))
-	clus.Procs[0].Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})
+func triggerRaftLoopDeadLock(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
+	require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "raftBeforeSave", fmt.Sprintf(`sleep("%s")`, duration)))
+	clus.Procs[0].Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: putCommandTimeout})
 }
 
-func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
+func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
 	etcdctl := clus.Etcdctl()
 	_, err := etcdctl.UserAdd(ctx, "root", "root", config.UserAddOptions{})
 	require.NoError(t, err)
@@ -366,11 +371,11 @@ func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus
 	require.NoError(t, err)
 	require.NoError(t, etcdctl.AuthEnable(ctx))
 
-	require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeWritebackBuf", `sleep("3s")`))
-	clus.Procs[0].Etcdctl(e2e.WithAuth("root", "root")).Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: 200 * time.Millisecond})
+	require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeWritebackBuf", fmt.Sprintf(`sleep("%s")`, duration)))
+	clus.Procs[0].Etcdctl(e2e.WithAuth("root", "root")).Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: putCommandTimeout})
 }
 
-func triggerCorrupt(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
+func triggerCorrupt(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
 	etcdctl := clus.Procs[0].Etcdctl()
 	for i := 0; i < 10; i++ {
 		err := etcdctl.Put(ctx, "foo", "bar", config.PutOptions{})