etcd-tester: add leader failure cases

This commit is contained in:
Gyu-Ho Lee 2016-02-11 15:16:14 -08:00
parent e2146e2080
commit c1851dfca1
3 changed files with 105 additions and 4 deletions

View File

@ -25,7 +25,8 @@ import (
"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
"github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc"
clientV2 "github.com/coreos/etcd/client"
clientv2 "github.com/coreos/etcd/client"
"github.com/coreos/etcd/clientv3"
pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
"github.com/coreos/etcd/tools/functional-tester/etcd-agent/client"
)
@ -183,6 +184,32 @@ func (c *cluster) WaitHealth() error {
return err
}
// GetLeader returns the index (into c.Names) of the current leader
// member. In v2-only mode leader lookup is not supported, so it
// returns index 0 with a nil error. It returns a non-nil error if the
// client cannot be created, the leader RPC fails, or the reported
// leader name does not match any member in c.Names.
func (c *cluster) GetLeader() (int, error) {
	if c.v2Only {
		return 0, nil
	}
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   c.GRPCURLs,
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		return 0, err
	}
	defer cli.Close()

	clus := clientv3.NewCluster(cli)
	// Bound the RPC so a hung or partitioned member cannot stall the
	// tester indefinitely (context.Background() alone never times out).
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	mem, err := clus.MemberLeader(ctx)
	cancel()
	if err != nil {
		return 0, err
	}
	for i, name := range c.Names {
		if name == mem.Name {
			return i, nil
		}
	}
	return 0, fmt.Errorf("no leader found")
}
func (c *cluster) Report() (success, failure int) {
for _, stress := range c.Stressers {
s, f := stress.Report()
@ -253,15 +280,15 @@ func setHealthKey(us []string) error {
// setHealthKeyV2 sets health key on all given urls.
func setHealthKeyV2(us []string) error {
for _, u := range us {
cfg := clientV2.Config{
cfg := clientv2.Config{
Endpoints: []string{u},
}
c, err := clientV2.New(cfg)
c, err := clientv2.New(cfg)
if err != nil {
return err
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
kapi := clientV2.NewKeysAPI(c)
kapi := clientv2.NewKeysAPI(c)
_, err = kapi.Set(ctx, "health", "good", nil)
cancel()
if err != nil {

View File

@ -128,6 +128,33 @@ func (f *failureKillOne) Recover(c *cluster, round int) error {
return c.WaitHealth()
}
// failureKillLeader stops the member that is currently the raft
// leader and restarts the same member on recovery.
type failureKillLeader struct {
	description
	// idx records the leader's index at Inject time so that Recover
	// restarts the same member that was stopped.
	idx int
}

// newFailureKillLeader returns a failure case that kills the leader
// member of the cluster.
func newFailureKillLeader() *failureKillLeader {
	return &failureKillLeader{
		description: "kill leader member",
	}
}
// Inject looks up the current leader, remembers its index for Recover,
// and stops that member's agent.
func (f *failureKillLeader) Inject(c *cluster, round int) error {
	leader, err := c.GetLeader()
	if err != nil {
		return err
	}
	f.idx = leader
	return c.Agents[leader].Stop()
}
// Recover restarts the member stopped by Inject and then waits for the
// whole cluster to report healthy again.
func (f *failureKillLeader) Recover(c *cluster, round int) error {
	_, err := c.Agents[f.idx].Restart()
	if err != nil {
		return err
	}
	return c.WaitHealth()
}
// failureKillOneForLongTime kills one member for long time, and restart
// after a snapshot is required.
type failureKillOneForLongTime struct {
@ -173,6 +200,51 @@ func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
return c.WaitHealth()
}
// failureKillLeaderForLongTime kills the leader for long time, and restart
// after a snapshot is required.
type failureKillLeaderForLongTime struct {
	description
	// idx records the leader's index at Inject time so that Recover
	// restarts the same member that was stopped.
	idx int
}

// newFailureKillLeaderForLongTime returns a failure case that keeps
// the leader down until the rest of the cluster has committed enough
// entries that the restarted member must catch up via snapshot.
func newFailureKillLeaderForLongTime() *failureKillLeaderForLongTime {
	return &failureKillLeaderForLongTime{
		description: "kill the leader for long time and expect it to recover from incoming snapshot",
	}
}
// Inject stops the current leader and, on clusters of three or more
// members, waits until the survivors have committed more than
// snapshotCount requests — enough that the stopped member will need a
// snapshot to rejoin. It fails if the cluster commits too slowly.
func (f *failureKillLeaderForLongTime) Inject(c *cluster, round int) error {
	leader, err := c.GetLeader()
	if err != nil {
		return err
	}
	f.idx = leader
	if err = c.Agents[leader].Stop(); err != nil {
		return err
	}
	// With fewer than three members the remainder cannot make
	// progress, so there is nothing to wait for.
	if c.Size < 3 {
		return nil
	}
	start, _ := c.Report()
	maxWaitSec := snapshotCount / 1000 * 3
	committed := 0
	for sec := 0; sec < maxWaitSec; sec++ {
		end, _ := c.Report()
		committed = end - start
		if committed > snapshotCount {
			return nil
		}
		time.Sleep(time.Second)
	}
	return fmt.Errorf("cluster too slow: only commit %d requests in %ds", committed, maxWaitSec)
}
// Recover restarts the long-dead leader (which is expected to catch up
// from a snapshot) and waits for the cluster to become healthy.
func (f *failureKillLeaderForLongTime) Recover(c *cluster, round int) error {
	_, err := c.Agents[f.idx].Restart()
	if err != nil {
		return err
	}
	return c.WaitHealth()
}
type failureIsolate struct {
description
}

View File

@ -42,7 +42,9 @@ func main() {
newFailureKillAll(),
newFailureKillMajority(),
newFailureKillOne(),
newFailureKillLeader(),
newFailureKillOneForLongTime(),
newFailureKillLeaderForLongTime(),
newFailureIsolate(),
newFailureIsolateAll(),
},