tests/e2e: enhance livez readyz e2e tests

Signed-off-by: Chao Chen <chaochn@amazon.com>
This commit is contained in:
Chao Chen 2023-10-26 11:12:07 -07:00
parent e8ae83fac4
commit 42d9e43e5f

View File

@ -18,6 +18,7 @@ package e2e
import ( import (
"context" "context"
"fmt"
"io" "io"
"net/http" "net/http"
"os" "os"
@ -26,28 +27,35 @@ import (
"testing" "testing"
"time" "time"
"go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"go.etcd.io/etcd/api/v3/etcdserverpb"
"go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
"go.etcd.io/etcd/tests/v3/framework/config" "go.etcd.io/etcd/tests/v3/framework/config"
"go.etcd.io/etcd/tests/v3/framework/e2e" "go.etcd.io/etcd/tests/v3/framework/e2e"
"go.etcd.io/etcd/tests/v3/framework/testutils" "go.etcd.io/etcd/tests/v3/framework/testutils"
) )
// Timeouts shared by the health check e2e tests in this file.
const (
	// putCommandTimeout is passed as PutOptions.Timeout to the etcdctl put that
	// is issued after a failpoint has been set up.
	putCommandTimeout time.Duration = 200 * time.Millisecond
	// healthCheckTimeout is the deadline of a single health check request; the
	// tests also multiply it to derive how long an injected failure must last.
	healthCheckTimeout time.Duration = 2 * time.Second
)
// healthCheckConfig describes one HTTP health probe and the outcome a test expects from it.
// url is the path (plus optional query string) appended to the first member's HTTP endpoint,
// expectedStatusCode is compared against the response status code, and every entry of
// expectedRespSubStrings must be contained in the response body.
// expectedTimeoutError is forwarded to doHealthCheckAndVerify; presumably it marks probes that
// should hit the request deadline instead of returning a response — TODO confirm, the handling
// is not visible here.
type healthCheckConfig struct { type healthCheckConfig struct {
url string url string
expectedStatusCode int expectedStatusCode int
expectedTimeoutError bool expectedTimeoutError bool
expectedRespSubStrings []string
} }
// injectFailure is the signature shared by the failure injectors referenced from the test tables.
// An injector receives the test context, the testing handle, the cluster under test and a duration.
// The test loops compute that duration as (len(healthChecks)+1) * healthCheckTimeout; injectors
// that have no use for it declare the parameter as "_".
type injectFailure func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration)
func TestHTTPHealthHandler(t *testing.T) { func TestHTTPHealthHandler(t *testing.T) {
e2e.BeforeTest(t) e2e.BeforeTest(t)
client := &http.Client{} client := &http.Client{}
tcs := []struct { tcs := []struct {
name string name string
injectFailure func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) injectFailure injectFailure
clusterOptions []e2e.EPClusterOption clusterOptions []e2e.EPClusterOption
healthChecks []healthCheckConfig healthChecks []healthCheckConfig
}{ }{
@ -149,104 +157,91 @@ func TestHTTPHealthHandler(t *testing.T) {
defer clus.Close() defer clus.Close()
testutils.ExecuteUntil(ctx, t, func() { testutils.ExecuteUntil(ctx, t, func() {
if tc.injectFailure != nil { if tc.injectFailure != nil {
tc.injectFailure(ctx, t, clus) // guaranteed that failure point is active until all the health checks timeout.
duration := time.Duration(len(tc.healthChecks)+1) * healthCheckTimeout
tc.injectFailure(ctx, t, clus, duration)
} }
for _, hc := range tc.healthChecks { for _, hc := range tc.healthChecks {
requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
t.Logf("health check URL is %s", requestURL) t.Logf("health check URL is %s", requestURL)
doHealthCheckAndVerify(t, client, requestURL, hc.expectedStatusCode, hc.expectedTimeoutError) doHealthCheckAndVerify(t, client, requestURL, hc.expectedTimeoutError, hc.expectedStatusCode, hc.expectedRespSubStrings)
} }
}) })
}) })
} }
} }
// defaultHealthCheckConfigs is the probe set shared by the TestHTTPLivezReadyzHandler
// cases that expect HTTP 200 from both /livez and /readyz, in plain and verbose form.
var defaultHealthCheckConfigs = []healthCheckConfig{
	// Plain liveness probe: the body only has to contain "ok".
	{
		url:                    "/livez",
		expectedStatusCode:     http.StatusOK,
		expectedRespSubStrings: []string{`ok`},
	},
	// Plain readiness probe: the body only has to contain "ok".
	{
		url:                    "/readyz",
		expectedStatusCode:     http.StatusOK,
		expectedRespSubStrings: []string{`ok`},
	},
	// Verbose liveness probe: the serializable_read check must be reported as passing.
	{
		url:                    "/livez?verbose=true",
		expectedStatusCode:     http.StatusOK,
		expectedRespSubStrings: []string{`[+]serializable_read ok`},
	},
	// Verbose readiness probe: both serializable_read and data_corruption must be
	// reported as passing.
	{
		url:                "/readyz?verbose=true",
		expectedStatusCode: http.StatusOK,
		expectedRespSubStrings: []string{
			`[+]serializable_read ok`,
			`[+]data_corruption ok`,
		},
	},
}
func TestHTTPLivezReadyzHandler(t *testing.T) { func TestHTTPLivezReadyzHandler(t *testing.T) {
e2e.BeforeTest(t) e2e.BeforeTest(t)
client := &http.Client{} client := &http.Client{}
tcs := []struct { tcs := []struct {
name string name string
injectFailure func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) injectFailure injectFailure
clusterOptions []e2e.EPClusterOption clusterOptions []e2e.EPClusterOption
healthChecks []healthCheckConfig healthChecks []healthCheckConfig
}{ }{
{ {
name: "no failures", // happy case name: "no failures", // happy case
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1)}, clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1)},
healthChecks: []healthCheckConfig{ healthChecks: defaultHealthCheckConfigs,
{
url: "/livez",
expectedStatusCode: http.StatusOK,
},
{
url: "/readyz",
expectedStatusCode: http.StatusOK,
},
},
}, },
{ {
name: "activated no space alarm", name: "activated no space alarm",
injectFailure: triggerNoSpaceAlarm, injectFailure: triggerNoSpaceAlarm,
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithQuotaBackendBytes(int64(13 * os.Getpagesize()))}, clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithQuotaBackendBytes(int64(13 * os.Getpagesize()))},
healthChecks: []healthCheckConfig{ healthChecks: defaultHealthCheckConfigs,
{
url: "/livez",
expectedStatusCode: http.StatusOK,
},
{
url: "/readyz",
expectedStatusCode: http.StatusOK,
},
},
}, },
// Readiness is not an indicator of performance. Slow response is not covered by readiness.
// refer to https://tinyurl.com/livez-readyz-design-doc or https://github.com/etcd-io/etcd/issues/16007#issuecomment-1726541091 in case tinyurl is down.
{ {
name: "overloaded server slow apply", name: "overloaded server slow apply",
injectFailure: triggerSlowApply, injectFailure: triggerSlowApply,
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)}, clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)},
healthChecks: []healthCheckConfig{ healthChecks: defaultHealthCheckConfigs,
{
url: "/livez",
expectedStatusCode: http.StatusOK,
},
{
url: "/readyz",
expectedStatusCode: http.StatusOK,
},
},
}, },
{ {
name: "network partitioned", name: "network partitioned",
injectFailure: blackhole, injectFailure: blackhole,
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithIsPeerTLS(true), e2e.WithPeerProxy(true)}, clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithIsPeerTLS(true), e2e.WithPeerProxy(true)},
healthChecks: []healthCheckConfig{ // TODO expected behavior of readyz check should be 503 or timeout after ReadIndex check is implemented.
{ healthChecks: defaultHealthCheckConfigs,
url: "/livez",
expectedStatusCode: http.StatusOK,
},
{
url: "/readyz",
expectedStatusCode: http.StatusOK,
},
},
}, },
{ {
name: "raft loop deadlock", name: "raft loop deadlock",
injectFailure: triggerRaftLoopDeadLock, injectFailure: triggerRaftLoopDeadLock,
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithGoFailEnabled(true)}, clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithGoFailEnabled(true)},
healthChecks: []healthCheckConfig{ // TODO expected behavior of livez check should be 503 or timeout after RaftLoopDeadLock check is implemented.
{ // TODO expected behavior of readyz check should be 503 or timeout after ReadIndex check is implemented.
// current kubeadm etcd liveness check failed to detect raft loop deadlock in steady state healthChecks: defaultHealthCheckConfigs,
// ref. https://github.com/kubernetes/kubernetes/blob/master/cmd/kubeadm/app/phases/etcd/local.go#L225-L226
// current liveness probe depends on the etcd /health check has a flaw that new /livez check should resolve.
url: "/livez",
expectedStatusCode: http.StatusOK,
},
{
url: "/readyz",
expectedStatusCode: http.StatusOK,
},
},
}, },
// verify that auth enabled serializable read must go through mvcc // verify that auth enabled serializable read must go through mvcc
{ {
@ -260,7 +255,7 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
}, },
{ {
url: "/readyz", url: "/readyz",
expectedStatusCode: http.StatusOK, expectedTimeoutError: true,
}, },
}, },
}, },
@ -270,12 +265,17 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithCorruptCheckTime(time.Second)}, clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithCorruptCheckTime(time.Second)},
healthChecks: []healthCheckConfig{ healthChecks: []healthCheckConfig{
{ {
url: "/livez", url: "/livez?verbose=true",
expectedStatusCode: http.StatusOK, expectedStatusCode: http.StatusOK,
expectedRespSubStrings: []string{`[+]serializable_read ok`},
}, },
{ {
url: "/readyz", url: "/readyz",
expectedStatusCode: http.StatusServiceUnavailable, expectedStatusCode: http.StatusServiceUnavailable,
expectedRespSubStrings: []string{
`[+]serializable_read ok`,
`[-]data_corruption failed: alarm activated: CORRUPT`,
},
}, },
}, },
}, },
@ -290,21 +290,23 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
defer clus.Close() defer clus.Close()
testutils.ExecuteUntil(ctx, t, func() { testutils.ExecuteUntil(ctx, t, func() {
if tc.injectFailure != nil { if tc.injectFailure != nil {
tc.injectFailure(ctx, t, clus) // guaranteed that failure point is active until all the health checks timeout.
duration := time.Duration(len(tc.healthChecks)+1) * healthCheckTimeout
tc.injectFailure(ctx, t, clus, duration)
} }
for _, hc := range tc.healthChecks { for _, hc := range tc.healthChecks {
requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
t.Logf("health check URL is %s", requestURL) t.Logf("health check URL is %s", requestURL)
doHealthCheckAndVerify(t, client, requestURL, hc.expectedStatusCode, hc.expectedTimeoutError) doHealthCheckAndVerify(t, client, requestURL, hc.expectedTimeoutError, hc.expectedStatusCode, hc.expectedRespSubStrings)
} }
}) })
}) })
} }
} }
func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expectStatusCode int, expectTimeoutError bool) { func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expectTimeoutError bool, expectStatusCode int, expectRespSubStrings []string) {
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) ctx, cancel := context.WithTimeout(context.Background(), healthCheckTimeout)
req, err := http.NewRequestWithContext(ctx, "GET", url, nil) req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
require.NoErrorf(t, err, "failed to creat request %+v", err) require.NoErrorf(t, err, "failed to creat request %+v", err)
resp, herr := client.Do(req) resp, herr := client.Do(req)
@ -321,11 +323,14 @@ func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expec
resp.Body.Close() resp.Body.Close()
require.NoErrorf(t, err, "failed to read response %+v", err) require.NoErrorf(t, err, "failed to read response %+v", err)
t.Logf("health check response body is: %s", body) t.Logf("health check response body is:\n%s", body)
require.Equal(t, expectStatusCode, resp.StatusCode) require.Equal(t, expectStatusCode, resp.StatusCode)
for _, expectRespSubString := range expectRespSubStrings {
require.Contains(t, string(body), expectRespSubString)
}
} }
func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) { func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
buf := strings.Repeat("b", os.Getpagesize()) buf := strings.Repeat("b", os.Getpagesize())
etcdctl := clus.Etcdctl() etcdctl := clus.Etcdctl()
for { for {
@ -338,14 +343,14 @@ func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProces
} }
} }
// triggerSlowApply sets the "beforeApplyOneEntryNormal" failpoint on the first member with a
// sleep action (a fixed 3s before this commit, the caller-supplied duration after it) and then
// writes foo=bar through the second member. Both the failpoint setup and the put must succeed,
// otherwise the test fails immediately.
func triggerSlowApply(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) { func triggerSlowApply(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
// the following proposal will be blocked at applying stage // the following proposal will be blocked at applying stage
// because when apply index < committed index, linearizable read would time out. // because when apply index < committed index, linearizable read would time out.
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeApplyOneEntryNormal", `sleep("3s")`)) require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeApplyOneEntryNormal", fmt.Sprintf(`sleep("%s")`, duration)))
require.NoError(t, clus.Procs[1].Etcdctl().Put(ctx, "foo", "bar", config.PutOptions{})) require.NoError(t, clus.Procs[1].Etcdctl().Put(ctx, "foo", "bar", config.PutOptions{}))
} }
func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) { func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
member := clus.Procs[0] member := clus.Procs[0]
proxy := member.PeerProxy() proxy := member.PeerProxy()
t.Logf("Blackholing traffic from and to member %q", member.Config().Name) t.Logf("Blackholing traffic from and to member %q", member.Config().Name)
@ -353,12 +358,12 @@ func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
proxy.BlackholeRx() proxy.BlackholeRx()
} }
// triggerRaftLoopDeadLock sets the "raftBeforeSave" failpoint on the first member with a sleep
// action (a fixed 3s before this commit, the caller-supplied duration after it) and then issues
// a put of foo=bar against that same member.
func triggerRaftLoopDeadLock(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) { func triggerRaftLoopDeadLock(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "raftBeforeSave", `sleep("3s")`)) require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "raftBeforeSave", fmt.Sprintf(`sleep("%s")`, duration)))
// The put uses context.Background() rather than ctx and its result is discarded; presumably
// the write is expected to fail while the failpoint sleeps — TODO confirm. The post-commit
// side bounds the command with putCommandTimeout.
clus.Procs[0].Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{}) clus.Procs[0].Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: putCommandTimeout})
} }
func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) { func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
etcdctl := clus.Etcdctl() etcdctl := clus.Etcdctl()
_, err := etcdctl.UserAdd(ctx, "root", "root", config.UserAddOptions{}) _, err := etcdctl.UserAdd(ctx, "root", "root", config.UserAddOptions{})
require.NoError(t, err) require.NoError(t, err)
@ -366,11 +371,11 @@ func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus
require.NoError(t, err) require.NoError(t, err)
require.NoError(t, etcdctl.AuthEnable(ctx)) require.NoError(t, etcdctl.AuthEnable(ctx))
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeWritebackBuf", `sleep("3s")`)) require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeWritebackBuf", fmt.Sprintf(`sleep("%s")`, duration)))
clus.Procs[0].Etcdctl(e2e.WithAuth("root", "root")).Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: 200 * time.Millisecond}) clus.Procs[0].Etcdctl(e2e.WithAuth("root", "root")).Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: putCommandTimeout})
} }
func triggerCorrupt(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) { func triggerCorrupt(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
etcdctl := clus.Procs[0].Etcdctl() etcdctl := clus.Procs[0].Etcdctl()
for i := 0; i < 10; i++ { for i := 0; i < 10; i++ {
err := etcdctl.Put(ctx, "foo", "bar", config.PutOptions{}) err := etcdctl.Put(ctx, "foo", "bar", config.PutOptions{})