From b3fea7ed5385d71ce2eb1946eacec342757012de Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 4 Apr 2018 13:03:57 -0700 Subject: [PATCH] functional-tester/tester: refactor "Failure" to support liveness mode Signed-off-by: Gyuho Lee --- tools/functional-tester/tester/failure.go | 95 +++++++++++++------ .../tester/failure_case_external.go | 19 +++- .../tester/failure_case_failpoints.go | 27 ++++-- .../tester/failure_case_kill.go | 18 ++-- .../tester/failure_case_network_blackhole.go | 6 +- .../tester/failure_case_network_slow.go | 10 +- .../tester/failure_case_no_op.go | 19 +++- 7 files changed, 137 insertions(+), 57 deletions(-) diff --git a/tools/functional-tester/tester/failure.go b/tools/functional-tester/tester/failure.go index b0fbb0b86..b40f61245 100644 --- a/tools/functional-tester/tester/failure.go +++ b/tools/functional-tester/tester/failure.go @@ -18,6 +18,8 @@ import ( "fmt" "math/rand" "time" + + "github.com/coreos/etcd/tools/functional-tester/rpcpb" ) // Failure defines failure injection interface. @@ -33,28 +35,32 @@ type Failure interface { Recover(clus *Cluster) error // Desc returns a description of the failure Desc() string + // FailureCase returns "rpcpb.FailureCase" enum type. + FailureCase() rpcpb.FailureCase } -type description string - -func (d description) Desc() string { return string(d) } - type injectMemberFunc func(*Cluster, int) error type recoverMemberFunc func(*Cluster, int) error type failureByFunc struct { - description + desc + failureCase rpcpb.FailureCase injectMember injectMemberFunc recoverMember recoverMemberFunc } -type failureFollower struct { - failureByFunc - last int - lead int +func (f *failureByFunc) Desc() string { + if string(f.desc) != "" { + return string(f.desc) + } + return f.failureCase.String() } -type failureLeader struct { +func (f *failureByFunc) FailureCase() rpcpb.FailureCase { + return f.failureCase +} + +type failureFollower struct { failureByFunc last int lead int @@ -82,22 +88,6 @@ func (f *failureFollower) updateIndex(clus *Cluster) error { return nil } -func (f *failureLeader) updateIndex(clus *Cluster) error { - idx, err := clus.GetLeader() - if err != nil { - return err - } - f.lead = idx - f.last = idx - return nil -} - -type failureQuorum failureByFunc -type failureAll failureByFunc - -// failureUntilSnapshot injects a failure and waits for a snapshot event -type failureUntilSnapshot struct{ Failure } - func (f *failureFollower) Inject(clus *Cluster) error { if err := f.updateIndex(clus); err != nil { return err @@ -109,6 +99,24 @@ func (f *failureFollower) Recover(clus *Cluster) error { return f.recoverMember(clus, f.last) } +func (f *failureFollower) FailureCase() rpcpb.FailureCase { return f.failureCase } + +type failureLeader struct { + failureByFunc + last int + lead int +} + +func (f *failureLeader) updateIndex(clus *Cluster) error { + idx, err := clus.GetLeader() + if err != nil { + return err + } + f.lead = idx + f.last = idx + return nil +} + func (f *failureLeader) Inject(clus *Cluster) error { if err := f.updateIndex(clus); err != nil { return err @@ -120,6 +128,12 @@ func (f *failureLeader) Recover(clus *Cluster) error { return f.recoverMember(clus, f.last) } +func (f *failureLeader) FailureCase() rpcpb.FailureCase { + return f.failureCase +} + +type failureQuorum failureByFunc + func (f *failureQuorum) Inject(clus *Cluster) error { for i := range killMap(len(clus.Members), clus.rd) { if err := f.injectMember(clus, i); err != nil { @@ -138,6 +152,10 @@ func (f *failureQuorum) Recover(clus *Cluster) error { return nil } +func (f *failureQuorum) FailureCase() rpcpb.FailureCase { return f.failureCase } + +type failureAll failureByFunc + func (f *failureAll) Inject(clus *Cluster) error { for i := range clus.Members { if err := f.injectMember(clus, i); err != nil { @@ -156,6 +174,18 @@ func (f *failureAll) Recover(clus *Cluster) error { return nil } +func (f *failureAll) FailureCase() rpcpb.FailureCase { + return f.failureCase +} + +// failureUntilSnapshot injects a failure and waits for a snapshot event +type failureUntilSnapshot struct { + desc desc + failureCase rpcpb.FailureCase + + Failure +} + const snapshotCount = 10000 func (f *failureUntilSnapshot) Inject(clus *Cluster) error { @@ -190,7 +220,14 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error { } func (f *failureUntilSnapshot) Desc() string { - return f.Failure.Desc() + " for a long time and expect it to recover from an incoming snapshot" + if f.desc.Desc() != "" { + return f.desc.Desc() + } + return f.failureCase.String() + " (to trigger snapshot)" +} + +func (f *failureUntilSnapshot) FailureCase() rpcpb.FailureCase { + return f.failureCase } func killMap(size int, seed int) map[int]bool { @@ -204,3 +241,7 @@ func killMap(size int, seed int) map[int]bool { } } } + +type desc string + +func (d desc) Desc() string { return string(d) } diff --git a/tools/functional-tester/tester/failure_case_external.go b/tools/functional-tester/tester/failure_case_external.go index e5b3b6a18..0d73422b1 100644 --- a/tools/functional-tester/tester/failure_case_external.go +++ b/tools/functional-tester/tester/failure_case_external.go @@ -17,13 +17,17 @@ package tester import ( "fmt" "os/exec" + + "github.com/coreos/etcd/tools/functional-tester/rpcpb" ) type failureExternal struct { Failure - description string - scriptPath string + desc string + failureCase rpcpb.FailureCase + + scriptPath string } func (f *failureExternal) Inject(clus *Cluster) error { @@ -34,11 +38,18 @@ func (f *failureExternal) Recover(clus *Cluster) error { return exec.Command(f.scriptPath, "disable", fmt.Sprintf("%d", clus.rd)).Run() } -func (f *failureExternal) Desc() string { return f.description } +func (f *failureExternal) Desc() string { + return f.desc +} + +func (f *failureExternal) FailureCase() rpcpb.FailureCase { + return f.failureCase +} func newFailureExternal(scriptPath string) Failure { return &failureExternal{ - description: fmt.Sprintf("external fault injector (script: %q)", scriptPath), + desc: fmt.Sprintf("external fault injector (script: %q)", scriptPath), + failureCase: rpcpb.FailureCase_EXTERNAL, scriptPath: scriptPath, } } diff --git a/tools/functional-tester/tester/failure_case_failpoints.go b/tools/functional-tester/tester/failure_case_failpoints.go index ede60916a..f5df8138b 100644 --- a/tools/functional-tester/tester/failure_case_failpoints.go +++ b/tools/functional-tester/tester/failure_case_failpoints.go @@ -21,6 +21,8 @@ import ( "strings" "sync" "time" + + "github.com/coreos/etcd/tools/functional-tester/rpcpb" ) type failpointStats struct { @@ -42,14 +44,23 @@ func failpointFailures(clus *Cluster) (ret []Failure, err error) { if len(fp) == 0 { continue } + fpFails := failuresFromFailpoint(fp, clus.Tester.FailpointCommands) + // wrap in delays so failpoint has time to trigger for i, fpf := range fpFails { if strings.Contains(fp, "Snap") { // hack to trigger snapshot failpoints - fpFails[i] = &failureUntilSnapshot{fpf} + fpFails[i] = &failureUntilSnapshot{ + desc: desc(fpf.Desc()), + failureCase: rpcpb.FailureCase_FAILPOINTS, + Failure: fpf, + } } else { - fpFails[i] = &failureDelay{fpf, 3 * time.Second} + fpFails[i] = &failureDelay{ + Failure: fpf, + delayDuration: 3 * time.Second, + } } } ret = append(ret, fpFails...) @@ -85,7 +96,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure) fs = append(fs, []Failure{ &failureFollower{ failureByFunc: failureByFunc{ - description: description(fmt.Sprintf("failpoint %s (one: %s)", fp, fcmd)), + desc: desc(fmt.Sprintf("failpoint %q (one: %q)", fp, fcmd)), + failureCase: rpcpb.FailureCase_FAILPOINTS, injectMember: inject, recoverMember: recov, }, @@ -94,7 +106,8 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure) }, &failureLeader{ failureByFunc: failureByFunc{ - description: description(fmt.Sprintf("failpoint %s (leader: %s)", fp, fcmd)), + desc: desc(fmt.Sprintf("failpoint %q (leader: %q)", fp, fcmd)), + failureCase: rpcpb.FailureCase_FAILPOINTS, injectMember: inject, recoverMember: recov, }, @@ -102,12 +115,14 @@ func failuresFromFailpoint(fp string, failpointCommands []string) (fs []Failure) lead: -1, }, &failureQuorum{ - description: description(fmt.Sprintf("failpoint %s (quorum: %s)", fp, fcmd)), + desc: desc(fmt.Sprintf("failpoint %q (quorum: %q)", fp, fcmd)), + failureCase: rpcpb.FailureCase_FAILPOINTS, injectMember: inject, recoverMember: recov, }, &failureAll{ - description: description(fmt.Sprintf("failpoint %s (all: %s)", fp, fcmd)), + desc: desc(fmt.Sprintf("failpoint %q (all: %q)", fp, fcmd)), + failureCase: rpcpb.FailureCase_FAILPOINTS, injectMember: inject, recoverMember: recov, }, diff --git a/tools/functional-tester/tester/failure_case_kill.go b/tools/functional-tester/tester/failure_case_kill.go index d59fddf8f..9e88efe03 100644 --- a/tools/functional-tester/tester/failure_case_kill.go +++ b/tools/functional-tester/tester/failure_case_kill.go @@ -26,7 +26,7 @@ func recoverKill(clus *Cluster, idx int) error { func newFailureKillOneFollower() Failure { ff := failureByFunc{ - description: "kill one follower", + failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER, injectMember: injectKill, recoverMember: recoverKill, } @@ -35,7 +35,7 @@ func newFailureKillOneFollower() Failure { func newFailureKillLeader() Failure { ff := failureByFunc{ - description: "kill leader", + failureCase: rpcpb.FailureCase_KILL_LEADER, injectMember: injectKill, recoverMember: recoverKill, } @@ -44,7 +44,7 @@ func newFailureKillLeader() Failure { func newFailureKillQuorum() Failure { return &failureQuorum{ - description: "kill quorum", + failureCase: rpcpb.FailureCase_KILL_QUORUM, injectMember: injectKill, recoverMember: recoverKill, } @@ -52,16 +52,22 @@ func newFailureKillQuorum() Failure { func newFailureKillAll() Failure { return &failureAll{ - description: "kill all", + failureCase: rpcpb.FailureCase_KILL_ALL, injectMember: injectKill, recoverMember: recoverKill, } } func newFailureKillOneFollowerForLongTime() Failure { - return &failureUntilSnapshot{newFailureKillOneFollower()} + return &failureUntilSnapshot{ + failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_FOR_LONG, + Failure: newFailureKillOneFollower(), + } } func newFailureKillLeaderForLongTime() Failure { - return &failureUntilSnapshot{newFailureKillLeader()} + return &failureUntilSnapshot{ + failureCase: rpcpb.FailureCase_KILL_LEADER_FOR_LONG, + Failure: newFailureKillLeader(), + } } diff --git a/tools/functional-tester/tester/failure_case_network_blackhole.go b/tools/functional-tester/tester/failure_case_network_blackhole.go index 6951c892f..f57c20477 100644 --- a/tools/functional-tester/tester/failure_case_network_blackhole.go +++ b/tools/functional-tester/tester/failure_case_network_blackhole.go @@ -26,7 +26,7 @@ func recoverBlackholePeerPortTxRx(clus *Cluster, idx int) error { func newFailureBlackholePeerPortTxRxOneFollower() Failure { ff := failureByFunc{ - description: "blackhole peer port on one follower", + failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER, injectMember: injectBlackholePeerPortTxRx, recoverMember: recoverBlackholePeerPortTxRx, } @@ -39,7 +39,7 @@ func newFailureBlackholePeerPortTxRxOneFollower() Failure { func newFailureBlackholePeerPortTxRxLeader() Failure { ff := failureByFunc{ - description: "blackhole peer port on leader", + failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER, injectMember: injectBlackholePeerPortTxRx, recoverMember: recoverBlackholePeerPortTxRx, } @@ -52,7 +52,7 @@ func newFailureBlackholePeerPortTxRxLeader() Failure { func newFailureBlackholePeerPortTxRxAll() Failure { f := &failureAll{ - description: "blackhole peer port on all", + failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL, injectMember: injectBlackholePeerPortTxRx, recoverMember: recoverBlackholePeerPortTxRx, } diff --git a/tools/functional-tester/tester/failure_case_network_slow.go b/tools/functional-tester/tester/failure_case_network_slow.go index 840d2101d..9fadab67c 100644 --- a/tools/functional-tester/tester/failure_case_network_slow.go +++ b/tools/functional-tester/tester/failure_case_network_slow.go @@ -15,7 +15,6 @@ package tester import ( - "fmt" "time" "github.com/coreos/etcd/tools/functional-tester/rpcpb" @@ -43,9 +42,8 @@ func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error { } func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure { - desc := fmt.Sprintf("delay follower peer port by %d ms", clus.Tester.DelayLatencyMs) ff := failureByFunc{ - description: description(desc), + failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } @@ -57,9 +55,8 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure { } func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure { - desc := fmt.Sprintf("delay leader peer port by %d ms", clus.Tester.DelayLatencyMs) ff := failureByFunc{ - description: description(desc), + failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } @@ -71,9 +68,8 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure { } func newFailureDelayPeerPortTxRxAll(clus *Cluster) Failure { - desc := fmt.Sprintf("delay all peer port by %d ms", clus.Tester.DelayLatencyMs) f := &failureAll{ - description: description(desc), + failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } diff --git a/tools/functional-tester/tester/failure_case_no_op.go b/tools/functional-tester/tester/failure_case_no_op.go index fdcddd306..21512214b 100644 --- a/tools/functional-tester/tester/failure_case_no_op.go +++ b/tools/functional-tester/tester/failure_case_no_op.go @@ -14,13 +14,24 @@ package tester +import ( + "time" + + "github.com/coreos/etcd/tools/functional-tester/rpcpb" +) + type failureNoOp failureByFunc -func (f *failureNoOp) Inject(clus *Cluster) error { return nil } -func (f *failureNoOp) Recover(clus *Cluster) error { return nil } +func (f *failureNoOp) Inject(clus *Cluster) error { return nil } +func (f *failureNoOp) Recover(clus *Cluster) error { return nil } +func (f *failureNoOp) FailureCase() rpcpb.FailureCase { return f.failureCase } func newFailureNoOp() Failure { - return &failureNoOp{ - description: "no failure", + f := &failureNoOp{ + failureCase: rpcpb.FailureCase_NO_FAIL_WITH_STRESS, + } + return &failureDelay{ + Failure: f, + delayDuration: 5 * time.Second, } }