diff --git a/tests/robustness/failpoints.go b/tests/robustness/failpoints.go
index 33c920a02..0239267ac 100644
--- a/tests/robustness/failpoints.go
+++ b/tests/robustness/failpoints.go
@@ -67,7 +67,7 @@ var (
 	RaftAfterWALReleasePanic               Failpoint = goPanicFailpoint{"raftAfterWALRelease", triggerBlackhole{waitTillSnapshot: true}, Follower}
 	RaftBeforeSaveSnapPanic                Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
 	RaftAfterSaveSnapPanic                 Failpoint = goPanicFailpoint{"raftAfterSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
-	RandomFailpoint                        Failpoint = randomFailpoint{[]Failpoint{
+	allFailpoints                                    = []Failpoint{
 		KillFailpoint, BeforeCommitPanic, AfterCommitPanic, RaftBeforeSavePanic,
 		RaftAfterSavePanic, DefragBeforeCopyPanic, DefragBeforeRenamePanic, BackendBeforePreCommitHookPanic,
 		BackendAfterPreCommitHookPanic, BackendBeforeStartDBTxnPanic, BackendAfterStartDBTxnPanic, BackendBeforeWritebackBufPanic,
@@ -76,22 +76,40 @@ var (
 		CompactAfterCommitBatchPanic, RaftBeforeLeaderSendPanic, BlackholePeerNetwork, DelayPeerNetwork,
 		RaftBeforeFollowerSendPanic, RaftBeforeApplySnapPanic, RaftAfterApplySnapPanic,
 		RaftAfterWALReleasePanic, RaftBeforeSaveSnapPanic, RaftAfterSaveSnapPanic, BlackholeUntilSnapshot,
-	}}
+	}
 )
 
+func pickRandomFailpoint(t *testing.T, clus *e2e.EtcdProcessCluster) Failpoint {
+	availableFailpoints := make([]Failpoint, 0, len(allFailpoints))
+	for _, failpoint := range allFailpoints {
+		err := validateFailpoint(clus, failpoint)
+		if err != nil {
+			continue
+		}
+		availableFailpoints = append(availableFailpoints, failpoint)
+	}
+	if len(availableFailpoints) == 0 {
+		t.Errorf("No available failpoints")
+		return nil
+	}
+	return availableFailpoints[rand.Int()%len(availableFailpoints)]
+}
+
+func validateFailpoint(clus *e2e.EtcdProcessCluster, failpoint Failpoint) error {
+	for _, proc := range clus.Procs {
+		if !failpoint.Available(*clus.Cfg, proc) {
+			return fmt.Errorf("failpoint %q not available on %s", failpoint.Name(), proc.Config().Name)
+		}
+	}
+	return nil
+}
+
 func injectFailpoints(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, failpoint Failpoint) {
 	ctx, cancel := context.WithTimeout(ctx, triggerTimeout)
 	defer cancel()
 
-	var err error
 	successes := 0
 	failures := 0
-	for _, proc := range clus.Procs {
-		if !failpoint.Available(*clus.Cfg, proc) {
-			t.Errorf("Failpoint %q not available on %s", failpoint.Name(), proc.Config().Name)
-			return
-		}
-	}
 
 	for successes < failpointInjectionsCount && failures < failpointInjectionsRetries {
 		time.Sleep(waitBetweenFailpointTriggers)
@@ -230,10 +248,16 @@ func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logg
 	member := f.pickMember(t, clus)
 
 	for member.IsRunning() {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
 		lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name()))
 		err := member.Failpoints().Setup(ctx, f.failpoint, "panic")
 		if err != nil {
 			lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err))
+			continue
 		}
 		if !member.IsRunning() {
 			// TODO: Check member logs that etcd not running is caused panic caused by proper gofailpoint.
@@ -255,11 +279,7 @@ func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logg
 		lg.Info("Member exited as expected", zap.String("member", member.Config().Name))
 	}
 
-	err := member.Start(ctx)
-	if err != nil {
-		return err
-	}
-	return nil
+	return member.Start(ctx)
 }
 
 func (f goPanicFailpoint) pickMember(t *testing.T, clus *e2e.EtcdProcessCluster) e2e.EtcdProcess {
@@ -347,40 +367,6 @@ func (t triggerCompact) Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess)
 	return true
 }
 
-type randomFailpoint struct {
-	failpoints []Failpoint
-}
-
-func (f randomFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
-	availableFailpoints := make([]Failpoint, 0, len(f.failpoints))
-	for _, failpoint := range f.failpoints {
-		count := 0
-		for _, proc := range clus.Procs {
-			if failpoint.Available(*clus.Cfg, proc) {
-				count++
-			}
-		}
-		if count == len(clus.Procs) {
-			availableFailpoints = append(availableFailpoints, failpoint)
-		}
-	}
-	if len(availableFailpoints) == 0 {
-		t.Errorf("No available failpoints")
-		return nil
-	}
-	failpoint := availableFailpoints[rand.Int()%len(availableFailpoints)]
-	lg.Info("Triggering failpoint\n", zap.String("failpoint", failpoint.Name()))
-	return failpoint.Inject(ctx, t, lg, clus)
-}
-
-func (f randomFailpoint) Name() string {
-	return "Random"
-}
-
-func (f randomFailpoint) Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess) bool {
-	return true
-}
-
 type blackholePeerNetworkFailpoint struct {
 	triggerBlackhole
 }
diff --git a/tests/robustness/linearizability_test.go b/tests/robustness/linearizability_test.go
index 51b512ef6..3e1101266 100644
--- a/tests/robustness/linearizability_test.go
+++ b/tests/robustness/linearizability_test.go
@@ -39,9 +39,8 @@ func TestRobustness(t *testing.T) {
 	scenarios := []testScenario{}
 	for _, traffic := range []traffic.Config{traffic.LowTraffic, traffic.HighTraffic, traffic.KubernetesTraffic} {
 		scenarios = append(scenarios, testScenario{
-			name:      traffic.Name + "ClusterOfSize1",
-			failpoint: RandomFailpoint,
-			traffic:   traffic,
+			name:    traffic.Name + "ClusterOfSize1",
+			traffic: traffic,
 			cluster: *e2e.NewConfig(
 				e2e.WithClusterSize(1),
 				e2e.WithSnapshotCount(100),
@@ -62,10 +61,9 @@ func TestRobustness(t *testing.T) {
 			clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithSnapshotCatchUpEntries(100))
 		}
 		scenarios = append(scenarios, testScenario{
-			name:      traffic.Name + "ClusterOfSize3",
-			failpoint: RandomFailpoint,
-			traffic:   traffic,
-			cluster:   *e2e.NewConfig(clusterOfSize3Options...),
+			name:    traffic.Name + "ClusterOfSize3",
+			traffic: traffic,
+			cluster: *e2e.NewConfig(clusterOfSize3Options...),
 		})
 	}
 	scenarios = append(scenarios, testScenario{
@@ -93,8 +91,7 @@ func TestRobustness(t *testing.T) {
 		),
 	})
 	scenarios = append(scenarios, testScenario{
-		name:      "Issue15220",
-		failpoint: RandomFailpoint,
+		name: "Issue15220",
 		watch: watchConfig{
 			requestProgress: true,
 		},
@@ -145,6 +142,15 @@ func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, s testSce
 	}
 	defer r.clus.Close()
 
+	if s.failpoint == nil {
+		s.failpoint = pickRandomFailpoint(t, r.clus)
+	} else {
+		err = validateFailpoint(r.clus, s.failpoint)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
 	// t.Failed() returns false during panicking. We need to forcibly
 	// save data on panicking.
 	// Refer to: https://github.com/golang/go/issues/49929