From f31d0eafb9f58c0e07df490068407d5c54d6d23f Mon Sep 17 00:00:00 2001
From: Chao Chen
Date: Mon, 8 May 2023 12:11:14 -0700
Subject: [PATCH] tests/e2e: add graceful shutdown test

Signed-off-by: Chao Chen
---
 server/etcdserver/server.go             |   6 +-
 tests/e2e/graceful_shutdown_test.go     | 132 ++++++++++++++++++++++++
 tests/integration/v3_leadership_test.go |   8 +-
 3 files changed, 139 insertions(+), 7 deletions(-)
 create mode 100644 tests/e2e/graceful_shutdown_test.go

diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go
index 8275bfae8..13fc5f5b3 100644
--- a/server/etcdserver/server.go
+++ b/server/etcdserver/server.go
@@ -1202,8 +1202,8 @@ func (s *EtcdServer) MoveLeader(ctx context.Context, lead, transferee uint64) er
 	return nil
 }
 
-// TransferLeadership transfers the leader to the chosen transferee.
-func (s *EtcdServer) TransferLeadership() error {
+// TryTransferLeadershipOnShutdown transfers the leader to the chosen transferee. It is only used in server graceful shutdown.
+func (s *EtcdServer) TryTransferLeadershipOnShutdown() error {
 	lg := s.Logger()
 	if !s.isLeader() {
 		lg.Info(
@@ -1253,7 +1253,7 @@ func (s *EtcdServer) HardStop() {
 // Do and Process cannot be called after Stop has been invoked.
 func (s *EtcdServer) Stop() {
 	lg := s.Logger()
-	if err := s.TransferLeadership(); err != nil {
+	if err := s.TryTransferLeadershipOnShutdown(); err != nil {
 		lg.Warn("leadership transfer failed", zap.String("local-member-id", s.MemberId().String()), zap.Error(err))
 	}
 	s.HardStop()
diff --git a/tests/e2e/graceful_shutdown_test.go b/tests/e2e/graceful_shutdown_test.go
new file mode 100644
index 000000000..dd10febfe
--- /dev/null
+++ b/tests/e2e/graceful_shutdown_test.go
@@ -0,0 +1,132 @@
+// Copyright 2023 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+
+	"go.etcd.io/raft/v3"
+
+	"go.etcd.io/etcd/tests/v3/framework/config"
+	"go.etcd.io/etcd/tests/v3/framework/e2e"
+	"go.etcd.io/etcd/tests/v3/framework/interfaces"
+)
+
+// TestGracefulShutdown starts clusters of 3 and 5 members and stops leaders one
+// after another, checking the assertions made by tryShutdownLeader on each stop.
+func TestGracefulShutdown(t *testing.T) {
+	tcs := []struct {
+		name        string
+		clusterSize int
+	}{
+		{
+			name:        "clusterSize3",
+			clusterSize: 3,
+		},
+		{
+			name:        "clusterSize5",
+			clusterSize: 5,
+		},
+	}
+
+	for _, tc := range tcs {
+		t.Run(tc.name, func(t *testing.T) {
+			testRunner := e2e.NewE2eRunner()
+			testRunner.BeforeTest(t)
+			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+			defer cancel()
+			clus := testRunner.NewCluster(ctx, t, config.WithClusterSize(tc.clusterSize))
+			// clean up orphaned resources like closing member client.
+			defer clus.Close()
+			// shutdown each etcd member process sequentially
+			// and start from old leader, (new leader), (follower)
+			tryShutdownLeader(ctx, t, clus.Members())
+		})
+	}
+}
+
+// tryShutdownLeader stops the current leader again and again until only a quorum
+// of the given members is left. For each stop it asserts that the stop itself takes
+// no longer than 1.5 seconds and that, 500ms after the stop was started, the first
+// remaining member reports a new, non-empty leader ID at the next raft term.
+func tryShutdownLeader(ctx context.Context, t *testing.T, members []interfaces.Member) {
+	quorum := len(members)/2 + 1
+	for len(members) > quorum {
+		leader, leaderID, term, followers := getLeader(ctx, t, members)
+		// getLeader returns a nil leader when no member reports itself as leader;
+		// fail here instead of panicking on leader.Stop() in the goroutine below.
+		require.NotNilf(t, leader, "no leader found among %d members", len(members))
+		stopped := make(chan error, 1)
+		go func() {
+			// each etcd server will wait up to 1 second to close all idle connections in peer handler.
+			start := time.Now()
+			leader.Stop()
+			took := time.Since(start)
+			if took > 1500*time.Millisecond {
+				stopped <- fmt.Errorf("leader stop took %v longer than 1.5 seconds", took)
+				return
+			}
+			stopped <- nil
+		}()
+
+		// etcd election timeout could range from 1s to 2s without explicit leadership transfer.
+		// assert leader ID has been changed within 500ms
+		time.Sleep(500 * time.Millisecond)
+		resps, err := followers[0].Client().Status(ctx)
+		require.NoError(t, err)
+		// the follower must report an elected leader, not merely a bumped term.
+		require.NotEqual(t, raft.None, resps[0].Leader)
+		require.Equal(t, term+1, resps[0].RaftTerm)
+		require.NotEqualf(t, resps[0].Leader, leaderID, "expect old leaderID %x changed to new leader ID %x", leaderID, resps[0].Leader)
+
+		err = <-stopped
+		require.NoError(t, err)
+
+		members = followers
+	}
+}
+
+// getLeader asks each member for its status and returns the first one that reports
+// itself as leader, together with its ID, its raft term and the remaining members
+// as followers. If no member reports itself as leader, it returns a nil leader,
+// zero leaderID and term, and the unmodified members slice.
+func getLeader(ctx context.Context, t *testing.T, members []interfaces.Member) (leader interfaces.Member, leaderID, term uint64, followers []interfaces.Member) {
+	leaderIdx := -1
+	for i, m := range members {
+		mc := m.Client()
+		sresps, err := mc.Status(ctx)
+		require.NoError(t, err)
+		if sresps[0].Leader == sresps[0].Header.MemberId {
+			leaderIdx = i
+			leaderID = sresps[0].Leader
+			term = sresps[0].RaftTerm
+			break
+		}
+	}
+	if leaderIdx == -1 {
+		return nil, 0, 0, members
+	}
+	// build followers in a fresh slice: appending to members[:leaderIdx] would
+	// overwrite elements of the caller's backing array.
+	followers = make([]interfaces.Member, 0, len(members)-1)
+	followers = append(followers, members[:leaderIdx]...)
+	followers = append(followers, members[leaderIdx+1:]...)
+	return members[leaderIdx], leaderID, term, followers
+}
diff --git a/tests/integration/v3_leadership_test.go b/tests/integration/v3_leadership_test.go
index 7956205c5..1e45f7172 100644
--- a/tests/integration/v3_leadership_test.go
+++ b/tests/integration/v3_leadership_test.go
@@ -58,7 +58,7 @@ func testMoveLeader(t *testing.T, auto bool) {
 
 	target := uint64(clus.Members[(oldLeadIdx+1)%3].Server.MemberId())
 	if auto {
-		err := clus.Members[oldLeadIdx].Server.TransferLeadership()
+		err := clus.Members[oldLeadIdx].Server.TryTransferLeadershipOnShutdown()
 		if err != nil {
 			t.Fatal(err)
 		}
@@ -149,7 +149,7 @@ func TestMoveLeaderToLearnerError(t *testing.T) {
 	}
 }
 
-// TestTransferLeadershipWithLearner ensures TransferLeadership does not timeout due to learner is
+// TestTransferLeadershipWithLearner ensures TryTransferLeadershipOnShutdown does not timeout due to learner is
 // automatically picked by leader as transferee.
 func TestTransferLeadershipWithLearner(t *testing.T) {
 	integration.BeforeTest(t)
@@ -170,9 +170,9 @@ func TestTransferLeadershipWithLearner(t *testing.T) {
 	leaderIdx := clus.WaitLeader(t)
 	errCh := make(chan error, 1)
 	go func() {
-		// note that this cluster has 1 leader and 1 learner. TransferLeadership should return nil.
+		// note that this cluster has 1 leader and 1 learner. TryTransferLeadershipOnShutdown should return nil.
 		// Leadership transfer is skipped in cluster with 1 voting member.
-		errCh <- clus.Members[leaderIdx].Server.TransferLeadership()
+		errCh <- clus.Members[leaderIdx].Server.TryTransferLeadershipOnShutdown()
 	}()
 	select {
 	case err := <-errCh: