From f9127dba9c35cc3be3bb1c945683933b9bfbc8fe Mon Sep 17 00:00:00 2001
From: Gyuho Lee
Date: Wed, 11 Apr 2018 13:37:01 -0700
Subject: [PATCH] functional/tester: implement "new_FailureCase_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_SNAPSHOT_FROM_SCRATCH"

Signed-off-by: Gyuho Lee
---
 .../failure_case_sigquit_remove_quorum.go | 232 ++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 functional/tester/failure_case_sigquit_remove_quorum.go

diff --git a/functional/tester/failure_case_sigquit_remove_quorum.go b/functional/tester/failure_case_sigquit_remove_quorum.go
new file mode 100644
index 000000000..c0131eb4e
--- /dev/null
+++ b/functional/tester/failure_case_sigquit_remove_quorum.go
@@ -0,0 +1,232 @@
+// Copyright 2018 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tester
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/functional/rpcpb"
+
+	"go.uber.org/zap"
+)
+
+// 1. Download a snapshot from node C before destroying nodes A and B.
+// 2. Destroy nodes A and B, making the whole cluster inoperable.
+// 3. Node C can no longer serve requests either, since quorum is lost.
+// 4. SIGQUIT node C and remove its data directories.
+// 5. Restore a new seed member from node C's latest snapshot file.
+// 6. Add another member to establish a 2-node cluster.
+// 7. Add another member to establish a 3-node cluster.
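+
+// As an illustration of step 1 (a hedged sketch, not part of this change),
+// a snapshot can be streamed from a live member via the clientv3
+// Maintenance API; the client variable and file path here are hypothetical:
+//
+//	rc, err := cli.Snapshot(context.Background()) // returns io.ReadCloser
+//	if err != nil {
+//		return err
+//	}
+//	defer rc.Close()
+//	f, ferr := os.Create("/tmp/node-c.snapshot.db") // hypothetical path
+//	if ferr != nil {
+//		return ferr
+//	}
+//	defer f.Close()
+//	_, err = io.Copy(f, rc)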
+
+type fetchSnapshotAndFailureQuorum struct {
+	failureByFunc
+	injected map[int]struct{}
+}
+
+func (f *fetchSnapshotAndFailureQuorum) Inject(clus *Cluster) error {
+	// download the snapshot first, before destroying quorum
+	f.injected = pickQuorum(len(clus.Members))
+	for idx := range f.injected {
+		if err := f.injectMember(clus, idx); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (f *fetchSnapshotAndFailureQuorum) Recover(clus *Cluster) error {
+	for idx := range f.injected {
+		if err := f.recoverMember(clus, idx); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (f *fetchSnapshotAndFailureQuorum) Desc() string {
+	if f.desc != "" {
+		return f.desc
+	}
+	return f.failureCase.String()
+}
+
+func (f *fetchSnapshotAndFailureQuorum) FailureCase() rpcpb.FailureCase {
+	return f.failureCase
+}
+
+func inject_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_SNAPSHOT_FROM_SCRATCH(clus *Cluster, idx1 int) error {
+	cli1, err := clus.Members[idx1].CreateEtcdClient()
+	if err != nil {
+		return err
+	}
+	defer cli1.Close()
+
+	var mresp *clientv3.MemberListResponse
+	mresp, err = cli1.MemberList(context.Background())
+	mss := []string{}
+	if err == nil && mresp != nil {
+		mss = describeMembers(mresp)
+	}
+	clus.lg.Info(
+		"member list before disastrous machine failure",
+		zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint),
+		zap.Strings("members", mss),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	sresp, serr := cli1.Status(context.Background(), clus.Members[idx1].EtcdClientEndpoint)
+	if serr != nil {
+		return serr
+	}
+	id1 := sresp.Header.MemberId
+	is1 := fmt.Sprintf("%016x", id1)
+
+	clus.lg.Info(
+		"disastrous machine failure START",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("target-member-id", is1),
+		zap.Error(err),
+	)
+	err = clus.sendOp(idx1, rpcpb.Operation_FETCH_SNAPSHOT)
+	clus.lg.Info(
+		"disastrous machine failure END",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("target-member-id", is1),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(2 * time.Second)
+
+	idx2 := (idx1 + 1) % len(clus.Members)
+	var cli2 *clientv3.Client
+	cli2, err = clus.Members[idx2].CreateEtcdClient()
+	if err != nil {
+		return err
+	}
+	defer cli2.Close()
+
+	// FIXME(bug): the member-remove call below may block forever during
+	// "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT".
+	// Is the new leader too busy with snapshotting?
+	// Is the raft proposal dropped?
+	// Should client keepalive be enabled for failover?
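+	//
+	// One possible answer to the keepalive question above (a hedged
+	// sketch, not enabled in this patch): construct the client with
+	// gRPC dial keepalive options so a dead endpoint is detected and
+	// the call can fail over; the durations here are illustrative:
+	//
+	//	cli2, err = clientv3.New(clientv3.Config{
+	//		Endpoints:            []string{clus.Members[idx2].EtcdClientEndpoint},
+	//		DialTimeout:          5 * time.Second,
+	//		DialKeepAliveTime:    5 * time.Second,
+	//		DialKeepAliveTimeout: 3 * time.Second,
+	//	})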
+	clus.lg.Info(
+		"member remove after disaster START",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("target-member-id", is1),
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+	)
+	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+	_, err = cli2.MemberRemove(ctx, id1)
+	cancel()
+	clus.lg.Info(
+		"member remove after disaster END",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("target-member-id", is1),
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(2 * time.Second)
+
+	mresp, err = cli2.MemberList(context.Background())
+	mss = []string{}
+	if err == nil && mresp != nil {
+		mss = describeMembers(mresp)
+	}
+	clus.lg.Info(
+		"member list after member remove",
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Strings("members", mss),
+		zap.Error(err),
+	)
+	return err
+}
+
+func recover_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_SNAPSHOT_FROM_SCRATCH(clus *Cluster, idx1 int) error {
+	idx2 := (idx1 + 1) % len(clus.Members)
+	cli2, err := clus.Members[idx2].CreateEtcdClient()
+	if err != nil {
+		return err
+	}
+	defer cli2.Close()
+
+	_, err = cli2.MemberAdd(context.Background(), clus.Members[idx1].Etcd.AdvertisePeerURLs)
+	clus.lg.Info(
+		"member add before fresh restart",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(2 * time.Second)
+
+	clus.Members[idx1].Etcd.InitialClusterState = "existing"
+	err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD)
+	clus.lg.Info(
+		"fresh restart after member add",
+		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
+		zap.Error(err),
+	)
+	if err != nil {
+		return err
+	}
+
+	time.Sleep(2 * time.Second)
+
+	var mresp *clientv3.MemberListResponse
+	mresp, err = cli2.MemberList(context.Background())
+	mss := []string{}
+	if err == nil && mresp != nil {
+		mss = describeMembers(mresp)
+	}
+	clus.lg.Info(
+		"member list after member add",
+		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
+		zap.Strings("members", mss),
+		zap.Error(err),
+	)
+	return err
+}
+
+func new_FailureCase_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_SNAPSHOT_FROM_SCRATCH(clus *Cluster) Failure {
+	f := &fetchSnapshotAndFailureQuorum{
+		failureByFunc: failureByFunc{
+			failureCase:   rpcpb.FailureCase_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_SNAPSHOT_FROM_SCRATCH,
+			injectMember:  inject_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_SNAPSHOT_FROM_SCRATCH,
+			recoverMember: recover_SIGQUIT_AND_REMOVE_QUORUM_AND_RESTORE_SNAPSHOT_FROM_SCRATCH,
+		},
+		injected: make(map[int]struct{}),
+	}
+	return &failureDelay{
+		Failure:       f,
+		delayDuration: clus.GetFailureDelayDuration(),
+	}
+}
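
Note: the recover path above relies on the standard etcd rejoin sequence:
MemberAdd against a live member first, then restart the rejoining member with
initial-cluster-state set to "existing". A minimal standalone sketch of that
sequence (the function name, endpoint, and peer URL are placeholders, not
from this patch; the clientv3 calls are the real API):

	// rejoinSketch registers a member's peer URL with a live cluster.
	// The member process must then be started with
	// --initial-cluster-state=existing so it joins the existing cluster
	// rather than bootstrapping a new one.
	func rejoinSketch(liveEndpoint, rejoinPeerURL string) error {
		cli, err := clientv3.New(clientv3.Config{
			Endpoints: []string{liveEndpoint}, // e.g. "http://127.0.0.1:2379"
		})
		if err != nil {
			return err
		}
		defer cli.Close()
		_, err = cli.MemberAdd(context.Background(), []string{rejoinPeerURL})
		return err
	}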