tests/robustness: Retry injecting failpoint
Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>
commit 43b2477c28
parent 90cbadc660
@@ -67,7 +67,7 @@ var (
 	RaftAfterWALReleasePanic Failpoint = goPanicFailpoint{"raftAfterWALRelease", triggerBlackhole{waitTillSnapshot: true}, Follower}
 	RaftBeforeSaveSnapPanic Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
 	RaftAfterSaveSnapPanic Failpoint = goPanicFailpoint{"raftAfterSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
-	RandomFailpoint Failpoint = randomFailpoint{[]Failpoint{
+	allFailpoints = []Failpoint{
 		KillFailpoint, BeforeCommitPanic, AfterCommitPanic, RaftBeforeSavePanic, RaftAfterSavePanic,
 		DefragBeforeCopyPanic, DefragBeforeRenamePanic, BackendBeforePreCommitHookPanic, BackendAfterPreCommitHookPanic,
 		BackendBeforeStartDBTxnPanic, BackendAfterStartDBTxnPanic, BackendBeforeWritebackBufPanic,
@@ -76,22 +76,40 @@ var (
 		CompactAfterCommitBatchPanic, RaftBeforeLeaderSendPanic, BlackholePeerNetwork, DelayPeerNetwork,
 		RaftBeforeFollowerSendPanic, RaftBeforeApplySnapPanic, RaftAfterApplySnapPanic, RaftAfterWALReleasePanic,
 		RaftBeforeSaveSnapPanic, RaftAfterSaveSnapPanic, BlackholeUntilSnapshot,
-	}}
+	}
 )

+func pickRandomFailpoint(t *testing.T, clus *e2e.EtcdProcessCluster) Failpoint {
+	availableFailpoints := make([]Failpoint, 0, len(allFailpoints))
+	for _, failpoint := range allFailpoints {
+		err := validateFailpoint(clus, failpoint)
+		if err != nil {
+			continue
+		}
+		availableFailpoints = append(availableFailpoints, failpoint)
+	}
+	if len(availableFailpoints) == 0 {
+		t.Errorf("No available failpoints")
+		return nil
+	}
+	return availableFailpoints[rand.Int()%len(availableFailpoints)]
+}
+
+func validateFailpoint(clus *e2e.EtcdProcessCluster, failpoint Failpoint) error {
+	for _, proc := range clus.Procs {
+		if !failpoint.Available(*clus.Cfg, proc) {
+			return fmt.Errorf("failpoint %q not available on %s", failpoint.Name(), proc.Config().Name)
+		}
+	}
+	return nil
+}
+
 func injectFailpoints(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, failpoint Failpoint) {
 	ctx, cancel := context.WithTimeout(ctx, triggerTimeout)
 	defer cancel()

 	var err error
 	successes := 0
 	failures := 0
-	for _, proc := range clus.Procs {
-		if !failpoint.Available(*clus.Cfg, proc) {
-			t.Errorf("Failpoint %q not available on %s", failpoint.Name(), proc.Config().Name)
-			return
-		}
-	}
 	for successes < failpointInjectionsCount && failures < failpointInjectionsRetries {
 		time.Sleep(waitBetweenFailpointTriggers)
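The two helpers added above split failpoint selection into validation and choice: validateFailpoint reports the first cluster member a failpoint is not available on, and pickRandomFailpoint keeps only the failpoints that validate against every member before picking one at random. A minimal, self-contained sketch of the same filter-then-pick shape follows; the Candidate and Target types and the helper names are illustrative stand-ins, not the real e2e types.

package main

import (
	"errors"
	"fmt"
	"math/rand"
)

// Target and Candidate are hypothetical stand-ins for EtcdProcess and Failpoint.
type Target struct{ name string }

type Candidate struct {
	name string
	ok   func(Target) bool
}

// validate returns an error naming the first target the candidate is unavailable on.
func validate(c Candidate, targets []Target) error {
	for _, tg := range targets {
		if !c.ok(tg) {
			return fmt.Errorf("candidate %q not available on %s", c.name, tg.name)
		}
	}
	return nil
}

// pickRandom keeps only candidates that validate on every target, then picks one at random.
func pickRandom(cands []Candidate, targets []Target) (Candidate, error) {
	available := make([]Candidate, 0, len(cands))
	for _, c := range cands {
		if validate(c, targets) == nil {
			available = append(available, c)
		}
	}
	if len(available) == 0 {
		return Candidate{}, errors.New("no available candidates")
	}
	return available[rand.Intn(len(available))], nil
}

func main() {
	targets := []Target{{"member-0"}, {"member-1"}, {"member-2"}}
	cands := []Candidate{
		{name: "alwaysAvailable", ok: func(Target) bool { return true }},
		{name: "neverAvailable", ok: func(Target) bool { return false }},
	}
	picked, err := pickRandom(cands, targets)
	fmt.Println(picked.name, err) // only "alwaysAvailable" can ever be picked
}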
@@ -230,10 +248,16 @@ func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logg
 	member := f.pickMember(t, clus)

+	for member.IsRunning() {
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+		lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name()))
+		err := member.Failpoints().Setup(ctx, f.failpoint, "panic")
+		if err != nil {
+			lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err))
+			continue
+		}
+		if !member.IsRunning() {
+			// TODO: Check member logs that etcd not running is caused panic caused by proper gofailpoint.
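This loop is the retry named in the commit title: rather than returning on the first Setup error, the failpoint is re-armed for as long as the member keeps running, and the loop gives up only once the surrounding context is cancelled. A stripped-down sketch of that check-context, try, log-and-continue shape follows; setupOnce and retrySetup are hypothetical stand-ins for member.Failpoints().Setup and the surrounding Inject logic.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// setupOnce stands in for a failpoint Setup call that may fail transiently (hypothetical).
func setupOnce(attempt int) error {
	if attempt < 3 {
		return errors.New("transient setup failure")
	}
	return nil
}

// retrySetup keeps retrying until setup succeeds or the context is cancelled,
// mirroring the "check ctx, try, log and continue on error" shape of the diff.
func retrySetup(ctx context.Context) error {
	attempt := 0
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		attempt++
		if err := setupOnce(attempt); err != nil {
			fmt.Println("setup failed, retrying:", err)
			time.Sleep(10 * time.Millisecond)
			continue
		}
		return nil
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(retrySetup(ctx)) // succeeds on the third attempt
}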
@@ -255,11 +279,7 @@ func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logg
 		lg.Info("Member exited as expected", zap.String("member", member.Config().Name))
 	}

-	err := member.Start(ctx)
-	if err != nil {
-		return err
-	}
-	return nil
+	return member.Start(ctx)
 }

 func (f goPanicFailpoint) pickMember(t *testing.T, clus *e2e.EtcdProcessCluster) e2e.EtcdProcess {
@@ -347,40 +367,6 @@ func (t triggerCompact) Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess)
 	return true
 }

-type randomFailpoint struct {
-	failpoints []Failpoint
-}
-
-func (f randomFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
-	availableFailpoints := make([]Failpoint, 0, len(f.failpoints))
-	for _, failpoint := range f.failpoints {
-		count := 0
-		for _, proc := range clus.Procs {
-			if failpoint.Available(*clus.Cfg, proc) {
-				count++
-			}
-		}
-		if count == len(clus.Procs) {
-			availableFailpoints = append(availableFailpoints, failpoint)
-		}
-	}
-	if len(availableFailpoints) == 0 {
-		t.Errorf("No available failpoints")
-		return nil
-	}
-	failpoint := availableFailpoints[rand.Int()%len(availableFailpoints)]
-	lg.Info("Triggering failpoint\n", zap.String("failpoint", failpoint.Name()))
-	return failpoint.Inject(ctx, t, lg, clus)
-}
-
-func (f randomFailpoint) Name() string {
-	return "Random"
-}
-
-func (f randomFailpoint) Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess) bool {
-	return true
-}
-
 type blackholePeerNetworkFailpoint struct {
 	triggerBlackhole
 }
@@ -39,9 +39,8 @@ func TestRobustness(t *testing.T) {
 	scenarios := []testScenario{}
 	for _, traffic := range []traffic.Config{traffic.LowTraffic, traffic.HighTraffic, traffic.KubernetesTraffic} {
 		scenarios = append(scenarios, testScenario{
-			name:      traffic.Name + "ClusterOfSize1",
-			failpoint: RandomFailpoint,
-			traffic:   traffic,
+			name:    traffic.Name + "ClusterOfSize1",
+			traffic: traffic,
 			cluster: *e2e.NewConfig(
 				e2e.WithClusterSize(1),
 				e2e.WithSnapshotCount(100),
@@ -62,10 +61,9 @@ func TestRobustness(t *testing.T) {
 			clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithSnapshotCatchUpEntries(100))
 		}
 		scenarios = append(scenarios, testScenario{
-			name:      traffic.Name + "ClusterOfSize3",
-			failpoint: RandomFailpoint,
-			traffic:   traffic,
-			cluster:   *e2e.NewConfig(clusterOfSize3Options...),
+			name:    traffic.Name + "ClusterOfSize3",
+			traffic: traffic,
+			cluster: *e2e.NewConfig(clusterOfSize3Options...),
 		})
 	}
 	scenarios = append(scenarios, testScenario{
@@ -93,8 +91,7 @@ func TestRobustness(t *testing.T) {
 		),
 	})
 	scenarios = append(scenarios, testScenario{
-		name:      "Issue15220",
-		failpoint: RandomFailpoint,
+		name: "Issue15220",
 		watch: watchConfig{
 			requestProgress: true,
 		},
@@ -145,6 +142,15 @@ func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, s testSce
 	}
 	defer r.clus.Close()

+	if s.failpoint == nil {
+		s.failpoint = pickRandomFailpoint(t, r.clus)
+	} else {
+		err = validateFailpoint(r.clus, s.failpoint)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
 	// t.Failed() returns false during panicking. We need to forcibly
 	// save data on panicking.
 	// Refer to: https://github.com/golang/go/issues/49929
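Because the scenarios above no longer hard-code RandomFailpoint, testRobustness falls back to pickRandomFailpoint only when a scenario leaves failpoint unset, and validates an explicitly configured failpoint before running. A small sketch of that explicit-value-or-validated-fallback shape follows; the scenario struct and resolve function are hypothetical, not part of the etcd test code.

package main

import (
	"errors"
	"fmt"
)

// scenario is a hypothetical stand-in for testScenario: a nil failpoint
// means "let the runner pick one", a non-nil one must validate first.
type scenario struct {
	name      string
	failpoint *string
}

// resolve returns the configured failpoint if it validates, otherwise a randomly picked one.
func resolve(s scenario, pick func() string, validate func(string) error) (string, error) {
	if s.failpoint == nil {
		return pick(), nil
	}
	if err := validate(*s.failpoint); err != nil {
		return "", err
	}
	return *s.failpoint, nil
}

func main() {
	pick := func() string { return "randomlyPicked" }
	validate := func(fp string) error {
		if fp == "unavailable" {
			return errors.New("failpoint not available on all members")
		}
		return nil
	}

	fmt.Println(resolve(scenario{name: "default"}, pick, validate))
	explicit := "unavailable"
	fmt.Println(resolve(scenario{name: "explicit", failpoint: &explicit}, pick, validate))
}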