From 9057253d8c05548aa688a7530daff6555b47e2c5 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 11 Apr 2018 01:21:09 -0700 Subject: [PATCH] functional: add "SIGQUIT_AND_REMOVE_ONE_FOLLOWER" Signed-off-by: Gyuho Lee --- functional.yaml | 1 + functional/tester/cluster.go | 18 +- functional/tester/cluster_read_config.go | 7 + functional/tester/cluster_test.go | 1 + .../tester/failure_case_sigquit_remove.go | 182 ++++++++++++++++++ 5 files changed, 201 insertions(+), 8 deletions(-) create mode 100644 functional/tester/failure_case_sigquit_remove.go diff --git a/functional.yaml b/functional.yaml index ea69f2443..ef9fcdc24 100644 --- a/functional.yaml +++ b/functional.yaml @@ -128,6 +128,7 @@ tester-config: - SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT - SIGTERM_QUORUM - SIGTERM_ALL + - SIGQUIT_AND_REMOVE_ONE_FOLLOWER - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER - BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT - BLACKHOLE_PEER_PORT_TX_RX_LEADER diff --git a/functional/tester/cluster.go b/functional/tester/cluster.go index 7e1a5be39..73fd4e6c5 100644 --- a/functional/tester/cluster.go +++ b/functional/tester/cluster.go @@ -161,6 +161,10 @@ func (clus *Cluster) updateFailures() { clus.failures = append(clus.failures, new_FailureCase_SIGTERM_ALL(clus)) + case "SIGQUIT_AND_REMOVE_ONE_FOLLOWER": + clus.failures = append(clus.failures, + new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus)) + case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER": clus.failures = append(clus.failures, new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus)) @@ -377,14 +381,12 @@ func (clus *Cluster) broadcast(op rpcpb.Operation) error { } func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error { - if op == rpcpb.Operation_INITIAL_START_ETCD { - clus.agentRequests[idx] = &rpcpb.Request{ - Operation: op, - Member: clus.Members[idx], - Tester: clus.Tester, - } - } else { - clus.agentRequests[idx].Operation = op + // maintain the initial member object + // throughout the test time + clus.agentRequests[idx] = &rpcpb.Request{ + Operation: op, + Member: clus.Members[idx], + Tester: clus.Tester, } err := clus.agentStreams[idx].Send(clus.agentRequests[idx]) diff --git a/functional/tester/cluster_read_config.go b/functional/tester/cluster_read_config.go index 35d618d88..d5c2ff2ab 100644 --- a/functional/tester/cluster_read_config.go +++ b/functional/tester/cluster_read_config.go @@ -68,6 +68,13 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) { clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal") } + switch mem.Etcd.InitialClusterState { + case "new": + case "existing": + default: + return nil, fmt.Errorf("'--initial-cluster-state' got %q", mem.Etcd.InitialClusterState) + } + if mem.Etcd.HeartbeatIntervalMs == 0 { return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd) } diff --git a/functional/tester/cluster_test.go b/functional/tester/cluster_test.go index a2be1f64f..75e6a2df0 100644 --- a/functional/tester/cluster_test.go +++ b/functional/tester/cluster_test.go @@ -162,6 +162,7 @@ func Test_read(t *testing.T) { "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT", "SIGTERM_QUORUM", "SIGTERM_ALL", + "SIGQUIT_AND_REMOVE_ONE_FOLLOWER", "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER", "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT", "BLACKHOLE_PEER_PORT_TX_RX_LEADER", diff --git a/functional/tester/failure_case_sigquit_remove.go b/functional/tester/failure_case_sigquit_remove.go new file mode 100644 index 000000000..4835ec745 --- /dev/null +++ b/functional/tester/failure_case_sigquit_remove.go @@ -0,0 +1,182 @@ +// Copyright 2018 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tester + +import ( + "context" + "fmt" + "sort" + "strings" + "time" + + "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/functional/rpcpb" + "go.uber.org/zap" +) + +func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error { + cli1, err := clus.Members[idx1].CreateEtcdClient() + if err != nil { + return err + } + defer cli1.Close() + + var mresp *clientv3.MemberListResponse + mresp, err = cli1.MemberList(context.Background()) + mss := []string{} + if err == nil && mresp != nil { + mss = describeMembers(mresp) + } + clus.lg.Info( + "member list before disastrous machine failure", + zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint), + zap.Strings("members", mss), + zap.Error(err), + ) + if err != nil { + return err + } + + sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint) + if serr != nil { + return serr + } + id1 := sresp.Header.MemberId + is1 := fmt.Sprintf("%016x", id1) + + err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA) + clus.lg.Info( + "disastrous machine failure", + zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), + zap.String("target-member-id", is1), + zap.Error(err), + ) + if err != nil { + return err + } + + time.Sleep(3 * time.Second) + + idx2 := (idx1 + 1) % len(clus.Members) + var cli2 *clientv3.Client + cli2, err = clus.Members[idx2].CreateEtcdClient() + if err != nil { + return err + } + defer cli2.Close() + + _, err = cli2.MemberRemove(context.Background(), id1) + clus.lg.Info( + "member remove after disaster", + zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), + zap.String("target-member-id", is1), + zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), + zap.Error(err), + ) + if err != nil { + return err + } + + time.Sleep(5 * time.Second) + + mresp, err = cli2.MemberList(context.Background()) + mss = []string{} + if err == nil && mresp != nil { + mss = describeMembers(mresp) + } + clus.lg.Info( + "member list after member remove", + zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), + zap.Strings("members", mss), + zap.Error(err), + ) + return err +} + +func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error { + idx2 := (idx1 + 1) % len(clus.Members) + cli2, err := clus.Members[idx2].CreateEtcdClient() + if err != nil { + return err + } + defer cli2.Close() + + _, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs) + clus.lg.Info( + "member add before fresh restart", + zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), + zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), + zap.Error(err), + ) + if err != nil { + return err + } + + time.Sleep(3 * time.Second) + + clus.Members[idx1].Etcd.InitialClusterState = "existing" + err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD) + clus.lg.Info( + "fresh restart after member add", + zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint), + zap.Error(err), + ) + if err != nil { + return err + } + + time.Sleep(3 * time.Second) + + var mresp *clientv3.MemberListResponse + mresp, err = cli2.MemberList(context.Background()) + mss := []string{} + if err == nil && mresp != nil { + mss = describeMembers(mresp) + } + clus.lg.Info( + "member list after member add", + zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint), + zap.Strings("members", mss), + zap.Error(err), + ) + return err +} + +func new_FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Failure { + ff := failureByFunc{ + failureCase: rpcpb.FailureCase_SIGQUIT_AND_REMOVE_ONE_FOLLOWER, + injectMember: inject_SIGQUIT_ETCD_AND_REMOVE_DATA, + recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA, + } + f := &failureFollower{ff, -1, -1} + return &failureDelay{ + Failure: f, + delayDuration: clus.GetFailureDelayDuration(), + } +} + +func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) { + ss = make([]string, len(mresp.Members)) + for i, m := range mresp.Members { + ss[i] = fmt.Sprintf("Name %s / ID %016x / ClientURLs %s / PeerURLs %s", + m.Name, + m.ID, + strings.Join(m.ClientURLs, ","), + strings.Join(m.PeerURLs, ","), + ) + } + sort.Strings(ss) + return ss +}