functional/tester: delay after injecting "kill" to trigger election

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
This commit is contained in:
Gyuho Lee
2018-04-09 13:40:00 -07:00
parent 68adc6e300
commit d8a2d3a209
6 changed files with 90 additions and 57 deletions

View File

@@ -143,69 +143,97 @@ func (clus *Cluster) updateFailures() {
for _, cs := range clus.Tester.FailureCases {
switch cs {
case "KILL_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureKillOneFollower())
clus.failures = append(clus.failures,
newFailureKillOneFollower(clus))
case "KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureKillOneFollowerUntilTriggerSnapshot())
clus.failures = append(clus.failures,
newFailureKillOneFollowerUntilTriggerSnapshot(clus))
case "KILL_LEADER":
clus.failures = append(clus.failures, newFailureKillLeader())
clus.failures = append(clus.failures,
newFailureKillLeader(clus))
case "KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureKillLeaderUntilTriggerSnapshot())
clus.failures = append(clus.failures,
newFailureKillLeaderUntilTriggerSnapshot(clus))
case "KILL_QUORUM":
clus.failures = append(clus.failures, newFailureKillQuorum())
clus.failures = append(clus.failures,
newFailureKillQuorum(clus))
case "KILL_ALL":
clus.failures = append(clus.failures, newFailureKillAll())
clus.failures = append(clus.failures,
newFailureKillAll(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollower(clus))
clus.failures = append(clus.failures,
newFailureBlackholePeerPortTxRxOneFollower(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot())
clus.failures = append(clus.failures,
newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot())
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeader(clus))
clus.failures = append(clus.failures,
newFailureBlackholePeerPortTxRxLeader(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot())
clus.failures = append(clus.failures,
newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot())
case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxQuorum(clus))
clus.failures = append(clus.failures,
newFailureBlackholePeerPortTxRxQuorum(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxAll(clus))
clus.failures = append(clus.failures,
newFailureBlackholePeerPortTxRxAll(clus))
case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, false))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxOneFollower(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, true))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxOneFollower(clus, true))
case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, false))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxLeader(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, true))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxLeader(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true))
case "DELAY_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, false))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxQuorum(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, true))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxQuorum(clus, true))
case "DELAY_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, false))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxAll(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, true))
clus.failures = append(clus.failures,
newFailureDelayPeerPortTxRxAll(clus, true))
case "NO_FAIL_WITH_STRESS":
clus.failures = append(clus.failures, newFailureNoFailWithStress(clus))
clus.failures = append(clus.failures,
newFailureNoFailWithStress(clus))
case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS":
clus.failures = append(clus.failures, newFailureNoFailWithNoStressForLiveness(clus))
clus.failures = append(clus.failures,
newFailureNoFailWithNoStressForLiveness(clus))
case "EXTERNAL":
clus.failures = append(clus.failures, newFailureExternal(clus.Tester.ExternalExecPath))
clus.failures = append(clus.failures,
newFailureExternal(clus.Tester.ExternalExecPath))
case "FAILPOINTS":
fpFailures, fperr := failpointFailures(clus)
if len(fpFailures) == 0 {
clus.lg.Info("no failpoints found!", zap.Error(fperr))
}
clus.failures = append(clus.failures, fpFailures...)
clus.failures = append(clus.failures,
fpFailures...)
}
}
}

View File

@@ -40,6 +40,10 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
return nil, err
}
if len(clus.Members) < 3 {
return nil, fmt.Errorf("len(clus.Members) expects at least 3, got %d", len(clus.Members))
}
for i, mem := range clus.Members {
if mem.BaseDir == "" {
return nil, fmt.Errorf("BaseDir cannot be empty (got %q)", mem.BaseDir)

View File

@@ -242,9 +242,6 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
if err := f.Failure.Inject(clus); err != nil {
return err
}
if len(clus.Members) < 3 {
return nil
}
snapshotCount := clus.Members[0].Etcd.SnapshotCount

View File

@@ -31,9 +31,9 @@ func (f *failureDelay) Inject(clus *Cluster) error {
}
if f.delayDuration > 0 {
clus.lg.Info(
"sleeping in failureDelay",
"wait after inject",
zap.Duration("delay", f.delayDuration),
zap.String("case", f.Failure.Desc()),
zap.String("desc", f.Failure.Desc()),
)
time.Sleep(f.delayDuration)
}

View File

@@ -24,50 +24,66 @@ func recoverKill(clus *Cluster, idx int) error {
return clus.sendOperation(idx, rpcpb.Operation_RestartEtcd)
}
// newFailureKillOneFollower builds the KILL_ONE_FOLLOWER failure case, using
// injectKill to inject the fault and recoverKill to recover from it.
//
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped, so the two signatures and the two failureFollower constructions
// below are presumably the before/after forms of the same lines — confirm
// against the repository before treating this text as compilable Go.
//
// In the form that takes clus, the failureFollower is wrapped in a
// failureDelay whose delayDuration is clus.GetFailureDelayDuration().
func newFailureKillOneFollower() Failure {
func newFailureKillOneFollower(clus *Cluster) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER,
injectMember: injectKill,
recoverMember: recoverKill,
}
// The two -1 values fill the remaining failureFollower fields; their
// meaning is not visible in this listing.
return &failureFollower{ff, -1, -1}
f := &failureFollower{ff, -1, -1}
// Per the commit title, the delay wrapper exists so the tester waits after
// injecting the kill, giving the cluster time to trigger an election.
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),
}
}
// newFailureKillLeader builds the KILL_LEADER failure case, using injectKill
// to inject the fault and recoverKill to recover from it.
//
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped, so the two signatures and the two failureLeader constructions
// below are presumably the before/after forms of the same lines — confirm
// against the repository before treating this text as compilable Go.
//
// In the form that takes clus, the failureLeader is wrapped in a failureDelay
// whose delayDuration is clus.GetFailureDelayDuration().
func newFailureKillLeader() Failure {
func newFailureKillLeader(clus *Cluster) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_KILL_LEADER,
injectMember: injectKill,
recoverMember: recoverKill,
}
// The two -1 values fill the remaining failureLeader fields; their meaning
// is not visible in this listing.
return &failureLeader{ff, -1, -1}
f := &failureLeader{ff, -1, -1}
// Per the commit title, the delay wrapper exists so the tester waits after
// injecting the kill, giving the cluster time to trigger an election.
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),
}
}
// newFailureKillQuorum builds the KILL_QUORUM failure case: a failureQuorum
// that injects with injectKill and recovers with recoverKill.
//
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped, so each adjacent pair of near-identical lines below is presumably
// the before/after form of one line — confirm against the repository before
// treating this text as compilable Go.
//
// In the form that takes clus, the failureQuorum is no longer returned
// directly; it is wrapped in a failureDelay whose delayDuration is
// clus.GetFailureDelayDuration().
func newFailureKillQuorum() Failure {
return &failureQuorum{
func newFailureKillQuorum(clus *Cluster) Failure {
f := &failureQuorum{
failureCase: rpcpb.FailureCase_KILL_QUORUM,
injectMember: injectKill,
recoverMember: recoverKill,
}
// Per the commit title, the delay wrapper exists so the tester waits after
// injecting the kill, giving the cluster time to trigger an election.
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),
}
}
// newFailureKillAll builds the KILL_ALL failure case: a failureAll that
// injects with injectKill and recovers with recoverKill.
//
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped, so each adjacent pair of near-identical lines below is presumably
// the before/after form of one line — confirm against the repository before
// treating this text as compilable Go.
//
// In the form that takes clus, the failureAll is no longer returned directly;
// it is wrapped in a failureDelay whose delayDuration is
// clus.GetFailureDelayDuration().
func newFailureKillAll() Failure {
return &failureAll{
func newFailureKillAll(clus *Cluster) Failure {
f := &failureAll{
failureCase: rpcpb.FailureCase_KILL_ALL,
injectMember: injectKill,
recoverMember: recoverKill,
}
// Per the commit title, the delay wrapper exists so the tester waits after
// injecting the kill, giving the cluster time to trigger an election.
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),
}
}
// newFailureKillOneFollowerUntilTriggerSnapshot returns a failureUntilSnapshot
// tagged KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT whose inner Failure is the
// one produced by newFailureKillOneFollower.
//
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped, so the paired signature lines and paired Failure fields are
// presumably before/after forms — confirm against the repository.
// In the form that takes clus, clus is only forwarded to
// newFailureKillOneFollower; nothing else here reads it.
func newFailureKillOneFollowerUntilTriggerSnapshot() Failure {
func newFailureKillOneFollowerUntilTriggerSnapshot(clus *Cluster) Failure {
return &failureUntilSnapshot{
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
Failure: newFailureKillOneFollower(),
Failure: newFailureKillOneFollower(clus),
}
}
// newFailureKillLeaderUntilTriggerSnapshot returns a failureUntilSnapshot
// tagged KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT whose inner Failure is the one
// produced by newFailureKillLeader.
//
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped, so the paired signature lines and paired Failure fields are
// presumably before/after forms — confirm against the repository.
// In the form that takes clus, clus is only forwarded to
// newFailureKillLeader; nothing else here reads it.
func newFailureKillLeaderUntilTriggerSnapshot() Failure {
func newFailureKillLeaderUntilTriggerSnapshot(clus *Cluster) Failure {
return &failureUntilSnapshot{
failureCase: rpcpb.FailureCase_KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT,
Failure: newFailureKillLeader(clus),
}
}

View File

@@ -52,13 +52,11 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
}
f := &failureFollower{ff, -1, -1}
return &failureDelay{
Failure: f,
@@ -72,13 +70,11 @@ func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, r
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
}
f := &failureFollower{ff, -1, -1}
return &failureUntilSnapshot{
failureCase: ff.failureCase,
@@ -92,13 +88,11 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure {
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
}
f := &failureLeader{ff, -1, -1}
return &failureDelay{
Failure: f,
@@ -112,13 +106,11 @@ func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
}
f := &failureLeader{ff, -1, -1}
return &failureUntilSnapshot{
failureCase: ff.failureCase,
@@ -132,13 +124,11 @@ func newFailureDelayPeerPortTxRxQuorum(clus *Cluster, random bool) Failure {
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
}
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),
@@ -151,13 +141,11 @@ func newFailureDelayPeerPortTxRxAll(clus *Cluster, random bool) Failure {
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL
}
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),