Merge pull request #4486 from gyuho/f1

etcd-tester: failures aware of leader/non-leader

commit 89ca5ccccd
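The commit teaches the functional tester's failure cases to distinguish the leader from ordinary members: a new cluster.GetLeader helper looks up the current leader over the v3 gRPC API, and two new failures (failureKillLeader and failureKillLeaderForLongTime) use it to kill specifically the leader. The new types plug into the tester's existing failure machinery; as a rough sketch for orientation (the interface below is assumed for illustration and is not part of this diff), each failure satisfies something like:

type failure interface {
	// Inject applies the failure to a healthy cluster for the given round.
	Inject(c *cluster, round int) error
	// Recover undoes the injected failure and waits for the cluster to become healthy again.
	Recover(c *cluster, round int) error
	// Desc returns a human-readable description of the failure.
	Desc() string
}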
@@ -25,7 +25,8 @@ import (
 	"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
 	"github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc"
 
-	clientV2 "github.com/coreos/etcd/client"
+	clientv2 "github.com/coreos/etcd/client"
+	"github.com/coreos/etcd/clientv3"
 	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
 	"github.com/coreos/etcd/tools/functional-tester/etcd-agent/client"
 )
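The added clientv3 import backs the new GetLeader helper below, which queries cluster membership over gRPC; the v2 client alias is lowercased from clientV2 to clientv2 at the same time, which is what the paired rename lines in this hunk and in the setHealthKeyV2 hunk reflect.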
@@ -183,6 +184,32 @@ func (c *cluster) WaitHealth() error {
 	return err
 }
 
+// GetLeader returns the index of leader and error if any.
+func (c *cluster) GetLeader() (int, error) {
+	if c.v2Only {
+		return 0, nil
+	}
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   c.GRPCURLs,
+		DialTimeout: 5 * time.Second,
+	})
+	if err != nil {
+		return 0, err
+	}
+	defer cli.Close()
+	clus := clientv3.NewCluster(cli)
+	mem, err := clus.MemberLeader(context.Background())
+	if err != nil {
+		return 0, err
+	}
+	for i, name := range c.Names {
+		if name == mem.Name {
+			return i, nil
+		}
+	}
+	return 0, fmt.Errorf("no leader found")
+}
+
 func (c *cluster) Report() (success, failure int) {
 	for _, stress := range c.Stressers {
 		s, f := stress.Report()
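GetLeader resolves the leader to an index into c.Names, which lines up positionally with c.Agents, so callers can stop or restart the leader's agent directly (as failureKillLeader.Inject below does). In v2-only mode there is no gRPC endpoint to query, so it simply falls back to index 0.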
@@ -253,15 +280,15 @@ func setHealthKey(us []string) error {
 // setHealthKeyV2 sets health key on all given urls.
 func setHealthKeyV2(us []string) error {
 	for _, u := range us {
-		cfg := clientV2.Config{
+		cfg := clientv2.Config{
 			Endpoints: []string{u},
 		}
-		c, err := clientV2.New(cfg)
+		c, err := clientv2.New(cfg)
 		if err != nil {
 			return err
 		}
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		kapi := clientV2.NewKeysAPI(c)
+		kapi := clientv2.NewKeysAPI(c)
 		_, err = kapi.Set(ctx, "health", "good", nil)
 		cancel()
 		if err != nil {
@@ -128,6 +128,33 @@ func (f *failureKillOne) Recover(c *cluster, round int) error {
 	return c.WaitHealth()
 }
 
+type failureKillLeader struct {
+	description
+	idx int
+}
+
+func newFailureKillLeader() *failureKillLeader {
+	return &failureKillLeader{
+		description: "kill leader member",
+	}
+}
+
+func (f *failureKillLeader) Inject(c *cluster, round int) error {
+	idx, err := c.GetLeader()
+	if err != nil {
+		return err
+	}
+	f.idx = idx
+	return c.Agents[idx].Stop()
+}
+
+func (f *failureKillLeader) Recover(c *cluster, round int) error {
+	if _, err := c.Agents[f.idx].Restart(); err != nil {
+		return err
+	}
+	return c.WaitHealth()
+}
+
 // failureKillOneForLongTime kills one member for long time, and restart
 // after a snapshot is required.
 type failureKillOneForLongTime struct {
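For context, the tester drives every registered failure in each round, injecting it into a healthy cluster and then recovering it while the stressers keep writing. A simplified sketch with assumed names (the actual loop lives elsewhere in the tester and is not part of this diff):

for _, f := range failures {
	// Inject expects a healthy cluster and applies the failure.
	if err := f.Inject(cluster, round); err != nil {
		log.Printf("injection of %q failed: %v", f.Desc(), err)
		continue
	}
	// Recover undoes the failure and waits for the cluster to heal.
	if err := f.Recover(cluster, round); err != nil {
		log.Printf("recovery of %q failed: %v", f.Desc(), err)
	}
}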
@@ -173,6 +200,51 @@ func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error {
 	return c.WaitHealth()
 }
 
+// failureKillLeaderForLongTime kills the leader for long time, and restart
+// after a snapshot is required.
+type failureKillLeaderForLongTime struct {
+	description
+	idx int
+}
+
+func newFailureKillLeaderForLongTime() *failureKillLeaderForLongTime {
+	return &failureKillLeaderForLongTime{
+		description: "kill the leader for long time and expect it to recover from incoming snapshot",
+	}
+}
+
+func (f *failureKillLeaderForLongTime) Inject(c *cluster, round int) error {
+	idx, err := c.GetLeader()
+	if err != nil {
+		return err
+	}
+	f.idx = idx
+	if err := c.Agents[idx].Stop(); err != nil {
+		return err
+	}
+	if c.Size >= 3 {
+		start, _ := c.Report()
+		var end int
+		retry := snapshotCount / 1000 * 3
+		for j := 0; j < retry; j++ {
+			end, _ = c.Report()
+			if end-start > snapshotCount {
+				return nil
+			}
+			time.Sleep(time.Second)
+		}
+		return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry)
+	}
+	return nil
+}
+
+func (f *failureKillLeaderForLongTime) Recover(c *cluster, round int) error {
+	if _, err := c.Agents[f.idx].Restart(); err != nil {
+		return err
+	}
+	return c.WaitHealth()
+}
+
 type failureIsolate struct {
 	description
 }
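The Inject above keeps the leader down until the rest of the cluster has committed more than snapshotCount additional requests, so the restarted member has to catch up from an incoming snapshot rather than from the log. It polls Report once per second for up to snapshotCount / 1000 * 3 seconds; if snapshotCount were 10000, for example, that is a 30-second budget before Inject gives up and reports the cluster as too slow. Clusters smaller than three members skip the wait entirely, since losing a member there costs quorum and no further requests can be committed.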
@@ -42,7 +42,9 @@ func main() {
 		newFailureKillAll(),
 		newFailureKillMajority(),
 		newFailureKillOne(),
+		newFailureKillLeader(),
 		newFailureKillOneForLongTime(),
+		newFailureKillLeaderForLongTime(),
 		newFailureIsolate(),
 		newFailureIsolateAll(),
 	},
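With these two constructors added to the failure list, the leader-targeted kills run in the same rotation as the existing random-member failures; no other wiring is needed because both new types satisfy the same failure interface sketched above.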