From c1851dfca133a7c447cdcbb01a7a7c0e794299cb Mon Sep 17 00:00:00 2001 From: Gyu-Ho Lee Date: Thu, 11 Feb 2016 15:16:14 -0800 Subject: [PATCH] etcd-tester: add leader failure cases --- .../functional-tester/etcd-tester/cluster.go | 35 +++++++-- .../functional-tester/etcd-tester/failure.go | 72 +++++++++++++++++++ tools/functional-tester/etcd-tester/main.go | 2 + 3 files changed, 105 insertions(+), 4 deletions(-) diff --git a/tools/functional-tester/etcd-tester/cluster.go b/tools/functional-tester/etcd-tester/cluster.go index 6355a5790..e63cd980d 100644 --- a/tools/functional-tester/etcd-tester/cluster.go +++ b/tools/functional-tester/etcd-tester/cluster.go @@ -25,7 +25,8 @@ import ( "github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context" "github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc" - clientV2 "github.com/coreos/etcd/client" + clientv2 "github.com/coreos/etcd/client" + "github.com/coreos/etcd/clientv3" pb "github.com/coreos/etcd/etcdserver/etcdserverpb" "github.com/coreos/etcd/tools/functional-tester/etcd-agent/client" ) @@ -183,6 +184,32 @@ func (c *cluster) WaitHealth() error { return err } +// GetLeader returns the index in c.Names of the current leader (always 0, without querying, when c.v2Only is set), and an error if any. +func (c *cluster) GetLeader() (int, error) { + if c.v2Only { + return 0, nil + } + cli, err := clientv3.New(clientv3.Config{ + Endpoints: c.GRPCURLs, + DialTimeout: 5 * time.Second, + }) + if err != nil { + return 0, err + } + defer cli.Close() + clus := clientv3.NewCluster(cli) + mem, err := clus.MemberLeader(context.Background()) + if err != nil { + return 0, err + } + for i, name := range c.Names { + if name == mem.Name { + return i, nil + } + } + return 0, fmt.Errorf("no leader found") +} + func (c *cluster) Report() (success, failure int) { for _, stress := range c.Stressers { s, f := stress.Report() @@ -253,15 +280,15 @@ func setHealthKey(us []string) error { // setHealthKeyV2 sets health key on all given urls. 
func setHealthKeyV2(us []string) error { for _, u := range us { - cfg := clientV2.Config{ + cfg := clientv2.Config{ Endpoints: []string{u}, } - c, err := clientV2.New(cfg) + c, err := clientv2.New(cfg) if err != nil { return err } ctx, cancel := context.WithTimeout(context.Background(), time.Second) - kapi := clientV2.NewKeysAPI(c) + kapi := clientv2.NewKeysAPI(c) _, err = kapi.Set(ctx, "health", "good", nil) cancel() if err != nil { diff --git a/tools/functional-tester/etcd-tester/failure.go b/tools/functional-tester/etcd-tester/failure.go index 3f4fb5159..b9775cb4c 100644 --- a/tools/functional-tester/etcd-tester/failure.go +++ b/tools/functional-tester/etcd-tester/failure.go @@ -128,6 +128,33 @@ func (f *failureKillOne) Recover(c *cluster, round int) error { return c.WaitHealth() } +type failureKillLeader struct { + description + idx int +} + +func newFailureKillLeader() *failureKillLeader { + return &failureKillLeader{ + description: "kill leader member", + } +} + +func (f *failureKillLeader) Inject(c *cluster, round int) error { + idx, err := c.GetLeader() + if err != nil { + return err + } + f.idx = idx + return c.Agents[idx].Stop() +} + +func (f *failureKillLeader) Recover(c *cluster, round int) error { + if _, err := c.Agents[f.idx].Restart(); err != nil { + return err + } + return c.WaitHealth() +} + // failureKillOneForLongTime kills one member for long time, and restart // after a snapshot is required. type failureKillOneForLongTime struct { @@ -173,6 +200,51 @@ func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error { return c.WaitHealth() } +// failureKillLeaderForLongTime kills the leader for a long time, and restarts it +// after a snapshot is required. 
+type failureKillLeaderForLongTime struct { + description + idx int +} + +func newFailureKillLeaderForLongTime() *failureKillLeaderForLongTime { + return &failureKillLeaderForLongTime{ + description: "kill the leader for long time and expect it to recover from incoming snapshot", + } +} + +func (f *failureKillLeaderForLongTime) Inject(c *cluster, round int) error { + idx, err := c.GetLeader() + if err != nil { + return err + } + f.idx = idx + if err := c.Agents[idx].Stop(); err != nil { + return err + } + if c.Size >= 3 { + start, _ := c.Report() + var end int + retry := snapshotCount / 1000 * 3 + for j := 0; j < retry; j++ { + end, _ = c.Report() + if end-start > snapshotCount { + return nil + } + time.Sleep(time.Second) + } + return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry) + } + return nil +} + +func (f *failureKillLeaderForLongTime) Recover(c *cluster, round int) error { + if _, err := c.Agents[f.idx].Restart(); err != nil { + return err + } + return c.WaitHealth() +} + type failureIsolate struct { description } diff --git a/tools/functional-tester/etcd-tester/main.go b/tools/functional-tester/etcd-tester/main.go index acd918fa2..43693c29e 100644 --- a/tools/functional-tester/etcd-tester/main.go +++ b/tools/functional-tester/etcd-tester/main.go @@ -42,7 +42,9 @@ func main() { newFailureKillAll(), newFailureKillMajority(), newFailureKillOne(), + newFailureKillLeader(), newFailureKillOneForLongTime(), + newFailureKillLeaderForLongTime(), newFailureIsolate(), newFailureIsolateAll(), },