mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
tests/e2e: enhance livez readyz e2e tests
Signed-off-by: Chao Chen <chaochn@amazon.com>
This commit is contained in:
parent
e8ae83fac4
commit
42d9e43e5f
@ -18,6 +18,7 @@ package e2e
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
@ -26,28 +27,35 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"go.etcd.io/etcd/api/v3/etcdserverpb"
|
|
||||||
"go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
|
|
||||||
|
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"go.etcd.io/etcd/api/v3/etcdserverpb"
|
||||||
|
"go.etcd.io/etcd/server/v3/storage/mvcc/testutil"
|
||||||
"go.etcd.io/etcd/tests/v3/framework/config"
|
"go.etcd.io/etcd/tests/v3/framework/config"
|
||||||
"go.etcd.io/etcd/tests/v3/framework/e2e"
|
"go.etcd.io/etcd/tests/v3/framework/e2e"
|
||||||
"go.etcd.io/etcd/tests/v3/framework/testutils"
|
"go.etcd.io/etcd/tests/v3/framework/testutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
healthCheckTimeout = 2 * time.Second
|
||||||
|
putCommandTimeout = 200 * time.Millisecond
|
||||||
|
)
|
||||||
|
|
||||||
type healthCheckConfig struct {
|
type healthCheckConfig struct {
|
||||||
url string
|
url string
|
||||||
expectedStatusCode int
|
expectedStatusCode int
|
||||||
expectedTimeoutError bool
|
expectedTimeoutError bool
|
||||||
|
expectedRespSubStrings []string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type injectFailure func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration)
|
||||||
|
|
||||||
func TestHTTPHealthHandler(t *testing.T) {
|
func TestHTTPHealthHandler(t *testing.T) {
|
||||||
e2e.BeforeTest(t)
|
e2e.BeforeTest(t)
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
tcs := []struct {
|
tcs := []struct {
|
||||||
name string
|
name string
|
||||||
injectFailure func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster)
|
injectFailure injectFailure
|
||||||
clusterOptions []e2e.EPClusterOption
|
clusterOptions []e2e.EPClusterOption
|
||||||
healthChecks []healthCheckConfig
|
healthChecks []healthCheckConfig
|
||||||
}{
|
}{
|
||||||
@ -149,104 +157,91 @@ func TestHTTPHealthHandler(t *testing.T) {
|
|||||||
defer clus.Close()
|
defer clus.Close()
|
||||||
testutils.ExecuteUntil(ctx, t, func() {
|
testutils.ExecuteUntil(ctx, t, func() {
|
||||||
if tc.injectFailure != nil {
|
if tc.injectFailure != nil {
|
||||||
tc.injectFailure(ctx, t, clus)
|
// Guarantee that the failure point stays active until all the health checks have timed out.
|
||||||
|
duration := time.Duration(len(tc.healthChecks)+1) * healthCheckTimeout
|
||||||
|
tc.injectFailure(ctx, t, clus, duration)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, hc := range tc.healthChecks {
|
for _, hc := range tc.healthChecks {
|
||||||
requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
|
requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
|
||||||
t.Logf("health check URL is %s", requestURL)
|
t.Logf("health check URL is %s", requestURL)
|
||||||
doHealthCheckAndVerify(t, client, requestURL, hc.expectedStatusCode, hc.expectedTimeoutError)
|
doHealthCheckAndVerify(t, client, requestURL, hc.expectedTimeoutError, hc.expectedStatusCode, hc.expectedRespSubStrings)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
defaultHealthCheckConfigs = []healthCheckConfig{
|
||||||
|
{
|
||||||
|
url: "/livez",
|
||||||
|
expectedStatusCode: http.StatusOK,
|
||||||
|
expectedRespSubStrings: []string{`ok`},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
url: "/readyz",
|
||||||
|
expectedStatusCode: http.StatusOK,
|
||||||
|
expectedRespSubStrings: []string{`ok`},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
url: "/livez?verbose=true",
|
||||||
|
expectedStatusCode: http.StatusOK,
|
||||||
|
expectedRespSubStrings: []string{`[+]serializable_read ok`},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
url: "/readyz?verbose=true",
|
||||||
|
expectedStatusCode: http.StatusOK,
|
||||||
|
expectedRespSubStrings: []string{
|
||||||
|
`[+]serializable_read ok`,
|
||||||
|
`[+]data_corruption ok`,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
func TestHTTPLivezReadyzHandler(t *testing.T) {
|
func TestHTTPLivezReadyzHandler(t *testing.T) {
|
||||||
e2e.BeforeTest(t)
|
e2e.BeforeTest(t)
|
||||||
client := &http.Client{}
|
client := &http.Client{}
|
||||||
tcs := []struct {
|
tcs := []struct {
|
||||||
name string
|
name string
|
||||||
injectFailure func(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster)
|
injectFailure injectFailure
|
||||||
clusterOptions []e2e.EPClusterOption
|
clusterOptions []e2e.EPClusterOption
|
||||||
healthChecks []healthCheckConfig
|
healthChecks []healthCheckConfig
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "no failures", // happy case
|
name: "no failures", // happy case
|
||||||
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1)},
|
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1)},
|
||||||
healthChecks: []healthCheckConfig{
|
healthChecks: defaultHealthCheckConfigs,
|
||||||
{
|
|
||||||
url: "/livez",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url: "/readyz",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "activated no space alarm",
|
name: "activated no space alarm",
|
||||||
injectFailure: triggerNoSpaceAlarm,
|
injectFailure: triggerNoSpaceAlarm,
|
||||||
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithQuotaBackendBytes(int64(13 * os.Getpagesize()))},
|
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithQuotaBackendBytes(int64(13 * os.Getpagesize()))},
|
||||||
healthChecks: []healthCheckConfig{
|
healthChecks: defaultHealthCheckConfigs,
|
||||||
{
|
|
||||||
url: "/livez",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url: "/readyz",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
|
// Readiness is not an indicator of performance. Slow response is not covered by readiness.
|
||||||
|
// refer to https://tinyurl.com/livez-readyz-design-doc or https://github.com/etcd-io/etcd/issues/16007#issuecomment-1726541091 in case tinyurl is down.
|
||||||
{
|
{
|
||||||
name: "overloaded server slow apply",
|
name: "overloaded server slow apply",
|
||||||
injectFailure: triggerSlowApply,
|
injectFailure: triggerSlowApply,
|
||||||
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)},
|
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithGoFailEnabled(true)},
|
||||||
healthChecks: []healthCheckConfig{
|
healthChecks: defaultHealthCheckConfigs,
|
||||||
{
|
|
||||||
url: "/livez",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url: "/readyz",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "network partitioned",
|
name: "network partitioned",
|
||||||
injectFailure: blackhole,
|
injectFailure: blackhole,
|
||||||
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithIsPeerTLS(true), e2e.WithPeerProxy(true)},
|
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithIsPeerTLS(true), e2e.WithPeerProxy(true)},
|
||||||
healthChecks: []healthCheckConfig{
|
// TODO expected behavior of readyz check should be 503 or timeout after ReadIndex check is implemented.
|
||||||
{
|
healthChecks: defaultHealthCheckConfigs,
|
||||||
url: "/livez",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url: "/readyz",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "raft loop deadlock",
|
name: "raft loop deadlock",
|
||||||
injectFailure: triggerRaftLoopDeadLock,
|
injectFailure: triggerRaftLoopDeadLock,
|
||||||
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithGoFailEnabled(true)},
|
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(1), e2e.WithGoFailEnabled(true)},
|
||||||
healthChecks: []healthCheckConfig{
|
// TODO expected behavior of livez check should be 503 or timeout after RaftLoopDeadLock check is implemented.
|
||||||
{
|
// TODO expected behavior of readyz check should be 503 or timeout after ReadIndex check is implemented.
|
||||||
// the current kubeadm etcd liveness check fails to detect a raft loop deadlock in steady state
|
healthChecks: defaultHealthCheckConfigs,
|
||||||
// ref. https://github.com/kubernetes/kubernetes/blob/master/cmd/kubeadm/app/phases/etcd/local.go#L225-L226
|
|
||||||
// the current liveness probe depends on the etcd /health check, which has a flaw that the new /livez check should resolve.
|
|
||||||
url: "/livez",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
url: "/readyz",
|
|
||||||
expectedStatusCode: http.StatusOK,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
// verify that auth enabled serializable read must go through mvcc
|
// verify that auth enabled serializable read must go through mvcc
|
||||||
{
|
{
|
||||||
@ -260,7 +255,7 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
url: "/readyz",
|
url: "/readyz",
|
||||||
expectedStatusCode: http.StatusOK,
|
expectedTimeoutError: true,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -270,12 +265,17 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
|
|||||||
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithCorruptCheckTime(time.Second)},
|
clusterOptions: []e2e.EPClusterOption{e2e.WithClusterSize(3), e2e.WithCorruptCheckTime(time.Second)},
|
||||||
healthChecks: []healthCheckConfig{
|
healthChecks: []healthCheckConfig{
|
||||||
{
|
{
|
||||||
url: "/livez",
|
url: "/livez?verbose=true",
|
||||||
expectedStatusCode: http.StatusOK,
|
expectedStatusCode: http.StatusOK,
|
||||||
|
expectedRespSubStrings: []string{`[+]serializable_read ok`},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
url: "/readyz",
|
url: "/readyz",
|
||||||
expectedStatusCode: http.StatusServiceUnavailable,
|
expectedStatusCode: http.StatusServiceUnavailable,
|
||||||
|
expectedRespSubStrings: []string{
|
||||||
|
`[+]serializable_read ok`,
|
||||||
|
`[-]data_corruption failed: alarm activated: CORRUPT`,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
@ -290,21 +290,23 @@ func TestHTTPLivezReadyzHandler(t *testing.T) {
|
|||||||
defer clus.Close()
|
defer clus.Close()
|
||||||
testutils.ExecuteUntil(ctx, t, func() {
|
testutils.ExecuteUntil(ctx, t, func() {
|
||||||
if tc.injectFailure != nil {
|
if tc.injectFailure != nil {
|
||||||
tc.injectFailure(ctx, t, clus)
|
// Guarantee that the failure point stays active until all the health checks have timed out.
|
||||||
|
duration := time.Duration(len(tc.healthChecks)+1) * healthCheckTimeout
|
||||||
|
tc.injectFailure(ctx, t, clus, duration)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, hc := range tc.healthChecks {
|
for _, hc := range tc.healthChecks {
|
||||||
requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
|
requestURL := clus.Procs[0].EndpointsHTTP()[0] + hc.url
|
||||||
t.Logf("health check URL is %s", requestURL)
|
t.Logf("health check URL is %s", requestURL)
|
||||||
doHealthCheckAndVerify(t, client, requestURL, hc.expectedStatusCode, hc.expectedTimeoutError)
|
doHealthCheckAndVerify(t, client, requestURL, hc.expectedTimeoutError, hc.expectedStatusCode, hc.expectedRespSubStrings)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expectStatusCode int, expectTimeoutError bool) {
|
func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expectTimeoutError bool, expectStatusCode int, expectRespSubStrings []string) {
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), healthCheckTimeout)
|
||||||
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
|
||||||
require.NoErrorf(t, err, "failed to creat request %+v", err)
|
require.NoErrorf(t, err, "failed to creat request %+v", err)
|
||||||
resp, herr := client.Do(req)
|
resp, herr := client.Do(req)
|
||||||
@ -321,11 +323,14 @@ func doHealthCheckAndVerify(t *testing.T, client *http.Client, url string, expec
|
|||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
require.NoErrorf(t, err, "failed to read response %+v", err)
|
require.NoErrorf(t, err, "failed to read response %+v", err)
|
||||||
|
|
||||||
t.Logf("health check response body is: %s", body)
|
t.Logf("health check response body is:\n%s", body)
|
||||||
require.Equal(t, expectStatusCode, resp.StatusCode)
|
require.Equal(t, expectStatusCode, resp.StatusCode)
|
||||||
|
for _, expectRespSubString := range expectRespSubStrings {
|
||||||
|
require.Contains(t, string(body), expectRespSubString)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
|
func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
|
||||||
buf := strings.Repeat("b", os.Getpagesize())
|
buf := strings.Repeat("b", os.Getpagesize())
|
||||||
etcdctl := clus.Etcdctl()
|
etcdctl := clus.Etcdctl()
|
||||||
for {
|
for {
|
||||||
@ -338,14 +343,14 @@ func triggerNoSpaceAlarm(ctx context.Context, t *testing.T, clus *e2e.EtcdProces
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func triggerSlowApply(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
|
func triggerSlowApply(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
|
||||||
// the following proposal will be blocked at applying stage
|
// the following proposal will be blocked at applying stage
|
||||||
// because when apply index < committed index, linearizable read would time out.
|
// because when apply index < committed index, linearizable read would time out.
|
||||||
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeApplyOneEntryNormal", `sleep("3s")`))
|
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeApplyOneEntryNormal", fmt.Sprintf(`sleep("%s")`, duration)))
|
||||||
require.NoError(t, clus.Procs[1].Etcdctl().Put(ctx, "foo", "bar", config.PutOptions{}))
|
require.NoError(t, clus.Procs[1].Etcdctl().Put(ctx, "foo", "bar", config.PutOptions{}))
|
||||||
}
|
}
|
||||||
|
|
||||||
func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
|
func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
|
||||||
member := clus.Procs[0]
|
member := clus.Procs[0]
|
||||||
proxy := member.PeerProxy()
|
proxy := member.PeerProxy()
|
||||||
t.Logf("Blackholing traffic from and to member %q", member.Config().Name)
|
t.Logf("Blackholing traffic from and to member %q", member.Config().Name)
|
||||||
@ -353,12 +358,12 @@ func blackhole(_ context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
|
|||||||
proxy.BlackholeRx()
|
proxy.BlackholeRx()
|
||||||
}
|
}
|
||||||
|
|
||||||
func triggerRaftLoopDeadLock(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
|
func triggerRaftLoopDeadLock(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
|
||||||
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "raftBeforeSave", `sleep("3s")`))
|
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "raftBeforeSave", fmt.Sprintf(`sleep("%s")`, duration)))
|
||||||
clus.Procs[0].Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{})
|
clus.Procs[0].Etcdctl().Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: putCommandTimeout})
|
||||||
}
|
}
|
||||||
|
|
||||||
func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
|
func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, duration time.Duration) {
|
||||||
etcdctl := clus.Etcdctl()
|
etcdctl := clus.Etcdctl()
|
||||||
_, err := etcdctl.UserAdd(ctx, "root", "root", config.UserAddOptions{})
|
_, err := etcdctl.UserAdd(ctx, "root", "root", config.UserAddOptions{})
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
@ -366,11 +371,11 @@ func triggerSlowBufferWriteBackWithAuth(ctx context.Context, t *testing.T, clus
|
|||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
require.NoError(t, etcdctl.AuthEnable(ctx))
|
require.NoError(t, etcdctl.AuthEnable(ctx))
|
||||||
|
|
||||||
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeWritebackBuf", `sleep("3s")`))
|
require.NoError(t, clus.Procs[0].Failpoints().SetupHTTP(ctx, "beforeWritebackBuf", fmt.Sprintf(`sleep("%s")`, duration)))
|
||||||
clus.Procs[0].Etcdctl(e2e.WithAuth("root", "root")).Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: 200 * time.Millisecond})
|
clus.Procs[0].Etcdctl(e2e.WithAuth("root", "root")).Put(context.Background(), "foo", "bar", config.PutOptions{Timeout: putCommandTimeout})
|
||||||
}
|
}
|
||||||
|
|
||||||
func triggerCorrupt(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster) {
|
func triggerCorrupt(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCluster, _ time.Duration) {
|
||||||
etcdctl := clus.Procs[0].Etcdctl()
|
etcdctl := clus.Procs[0].Etcdctl()
|
||||||
for i := 0; i < 10; i++ {
|
for i := 0; i < 10; i++ {
|
||||||
err := etcdctl.Put(ctx, "foo", "bar", config.PutOptions{})
|
err := etcdctl.Put(ctx, "foo", "bar", config.PutOptions{})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user