tests/robustness: Retry injecting failpoint

Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>
This commit is contained in:
Marek Siarkowicz 2023-06-15 21:25:11 +02:00
parent 90cbadc660
commit 43b2477c28
2 changed files with 49 additions and 57 deletions

View File

@ -67,7 +67,7 @@ var (
RaftAfterWALReleasePanic Failpoint = goPanicFailpoint{"raftAfterWALRelease", triggerBlackhole{waitTillSnapshot: true}, Follower} RaftAfterWALReleasePanic Failpoint = goPanicFailpoint{"raftAfterWALRelease", triggerBlackhole{waitTillSnapshot: true}, Follower}
RaftBeforeSaveSnapPanic Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower} RaftBeforeSaveSnapPanic Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
RaftAfterSaveSnapPanic Failpoint = goPanicFailpoint{"raftAfterSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower} RaftAfterSaveSnapPanic Failpoint = goPanicFailpoint{"raftAfterSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
RandomFailpoint Failpoint = randomFailpoint{[]Failpoint{ allFailpoints = []Failpoint{
KillFailpoint, BeforeCommitPanic, AfterCommitPanic, RaftBeforeSavePanic, RaftAfterSavePanic, KillFailpoint, BeforeCommitPanic, AfterCommitPanic, RaftBeforeSavePanic, RaftAfterSavePanic,
DefragBeforeCopyPanic, DefragBeforeRenamePanic, BackendBeforePreCommitHookPanic, BackendAfterPreCommitHookPanic, DefragBeforeCopyPanic, DefragBeforeRenamePanic, BackendBeforePreCommitHookPanic, BackendAfterPreCommitHookPanic,
BackendBeforeStartDBTxnPanic, BackendAfterStartDBTxnPanic, BackendBeforeWritebackBufPanic, BackendBeforeStartDBTxnPanic, BackendAfterStartDBTxnPanic, BackendBeforeWritebackBufPanic,
@ -76,22 +76,40 @@ var (
CompactAfterCommitBatchPanic, RaftBeforeLeaderSendPanic, BlackholePeerNetwork, DelayPeerNetwork, CompactAfterCommitBatchPanic, RaftBeforeLeaderSendPanic, BlackholePeerNetwork, DelayPeerNetwork,
RaftBeforeFollowerSendPanic, RaftBeforeApplySnapPanic, RaftAfterApplySnapPanic, RaftAfterWALReleasePanic, RaftBeforeFollowerSendPanic, RaftBeforeApplySnapPanic, RaftAfterApplySnapPanic, RaftAfterWALReleasePanic,
RaftBeforeSaveSnapPanic, RaftAfterSaveSnapPanic, BlackholeUntilSnapshot, RaftBeforeSaveSnapPanic, RaftAfterSaveSnapPanic, BlackholeUntilSnapshot,
}} }
) )
// pickRandomFailpoint returns a failpoint chosen uniformly at random from
// allFailpoints, restricted to those available on every member of the
// cluster. It fails the test (via t.Errorf) and returns nil when no
// failpoint is available on all members.
func pickRandomFailpoint(t *testing.T, clus *e2e.EtcdProcessCluster) Failpoint {
	availableFailpoints := make([]Failpoint, 0, len(allFailpoints))
	for _, failpoint := range allFailpoints {
		if err := validateFailpoint(clus, failpoint); err != nil {
			// Not supported by this cluster configuration; skip it.
			continue
		}
		availableFailpoints = append(availableFailpoints, failpoint)
	}
	if len(availableFailpoints) == 0 {
		t.Errorf("No available failpoints")
		return nil
	}
	// rand.Intn is the idiomatic choice and avoids the slight modulo
	// bias of rand.Int()%n.
	return availableFailpoints[rand.Intn(len(availableFailpoints))]
}
// validateFailpoint reports whether the given failpoint can be injected
// on this cluster: it returns nil when the failpoint is available on
// every member, or an error naming the first member on which it is not.
func validateFailpoint(clus *e2e.EtcdProcessCluster, failpoint Failpoint) error {
	for i := range clus.Procs {
		member := clus.Procs[i]
		if failpoint.Available(*clus.Cfg, member) {
			continue
		}
		return fmt.Errorf("failpoint %q not available on %s", failpoint.Name(), member.Config().Name)
	}
	return nil
}
func injectFailpoints(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, failpoint Failpoint) { func injectFailpoints(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, failpoint Failpoint) {
ctx, cancel := context.WithTimeout(ctx, triggerTimeout) ctx, cancel := context.WithTimeout(ctx, triggerTimeout)
defer cancel() defer cancel()
var err error var err error
successes := 0 successes := 0
failures := 0 failures := 0
for _, proc := range clus.Procs {
if !failpoint.Available(*clus.Cfg, proc) {
t.Errorf("Failpoint %q not available on %s", failpoint.Name(), proc.Config().Name)
return
}
}
for successes < failpointInjectionsCount && failures < failpointInjectionsRetries { for successes < failpointInjectionsCount && failures < failpointInjectionsRetries {
time.Sleep(waitBetweenFailpointTriggers) time.Sleep(waitBetweenFailpointTriggers)
@ -230,10 +248,16 @@ func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logg
member := f.pickMember(t, clus) member := f.pickMember(t, clus)
for member.IsRunning() { for member.IsRunning() {
select {
case <-ctx.Done():
return ctx.Err()
default:
}
lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name())) lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name()))
err := member.Failpoints().Setup(ctx, f.failpoint, "panic") err := member.Failpoints().Setup(ctx, f.failpoint, "panic")
if err != nil { if err != nil {
lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err)) lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err))
continue
} }
if !member.IsRunning() { if !member.IsRunning() {
// TODO: Check member logs that etcd not running is caused panic caused by proper gofailpoint. // TODO: Check member logs that etcd not running is caused panic caused by proper gofailpoint.
@ -255,11 +279,7 @@ func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logg
lg.Info("Member exited as expected", zap.String("member", member.Config().Name)) lg.Info("Member exited as expected", zap.String("member", member.Config().Name))
} }
err := member.Start(ctx) return member.Start(ctx)
if err != nil {
return err
}
return nil
} }
func (f goPanicFailpoint) pickMember(t *testing.T, clus *e2e.EtcdProcessCluster) e2e.EtcdProcess { func (f goPanicFailpoint) pickMember(t *testing.T, clus *e2e.EtcdProcessCluster) e2e.EtcdProcess {
@ -347,40 +367,6 @@ func (t triggerCompact) Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess)
return true return true
} }
// randomFailpoint wraps a list of candidate failpoints and, on Inject,
// triggers one of them picked at random (see Inject for the
// availability filtering).
type randomFailpoint struct {
	// failpoints is the pool of candidates to choose from.
	failpoints []Failpoint
}
// Inject picks a failpoint at random from f.failpoints, restricted to
// those available on every member of the cluster, and injects it.
// It fails the test (via t.Errorf, returning nil) when no candidate is
// available on all members.
func (f randomFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
	availableFailpoints := make([]Failpoint, 0, len(f.failpoints))
	for _, failpoint := range f.failpoints {
		available := true
		for _, proc := range clus.Procs {
			if !failpoint.Available(*clus.Cfg, proc) {
				// One unavailable member disqualifies the candidate;
				// no need to check the remaining members.
				available = false
				break
			}
		}
		if available {
			availableFailpoints = append(availableFailpoints, failpoint)
		}
	}
	if len(availableFailpoints) == 0 {
		t.Errorf("No available failpoints")
		return nil
	}
	// rand.Intn avoids the modulo bias of rand.Int()%n.
	failpoint := availableFailpoints[rand.Intn(len(availableFailpoints))]
	// zap appends its own newline; the message must not carry a trailing "\n".
	lg.Info("Triggering failpoint", zap.String("failpoint", failpoint.Name()))
	return failpoint.Inject(ctx, t, lg, clus)
}
// Name implements Failpoint; the random chooser reports itself as "Random".
func (f randomFailpoint) Name() string {
	return "Random"
}
// Available implements Failpoint. Always true: per-candidate availability
// is checked inside Inject instead, so the chooser itself is never filtered out.
func (f randomFailpoint) Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess) bool {
	return true
}
type blackholePeerNetworkFailpoint struct { type blackholePeerNetworkFailpoint struct {
triggerBlackhole triggerBlackhole
} }

View File

@ -39,9 +39,8 @@ func TestRobustness(t *testing.T) {
scenarios := []testScenario{} scenarios := []testScenario{}
for _, traffic := range []traffic.Config{traffic.LowTraffic, traffic.HighTraffic, traffic.KubernetesTraffic} { for _, traffic := range []traffic.Config{traffic.LowTraffic, traffic.HighTraffic, traffic.KubernetesTraffic} {
scenarios = append(scenarios, testScenario{ scenarios = append(scenarios, testScenario{
name: traffic.Name + "ClusterOfSize1", name: traffic.Name + "ClusterOfSize1",
failpoint: RandomFailpoint, traffic: traffic,
traffic: traffic,
cluster: *e2e.NewConfig( cluster: *e2e.NewConfig(
e2e.WithClusterSize(1), e2e.WithClusterSize(1),
e2e.WithSnapshotCount(100), e2e.WithSnapshotCount(100),
@ -62,10 +61,9 @@ func TestRobustness(t *testing.T) {
clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithSnapshotCatchUpEntries(100)) clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithSnapshotCatchUpEntries(100))
} }
scenarios = append(scenarios, testScenario{ scenarios = append(scenarios, testScenario{
name: traffic.Name + "ClusterOfSize3", name: traffic.Name + "ClusterOfSize3",
failpoint: RandomFailpoint, traffic: traffic,
traffic: traffic, cluster: *e2e.NewConfig(clusterOfSize3Options...),
cluster: *e2e.NewConfig(clusterOfSize3Options...),
}) })
} }
scenarios = append(scenarios, testScenario{ scenarios = append(scenarios, testScenario{
@ -93,8 +91,7 @@ func TestRobustness(t *testing.T) {
), ),
}) })
scenarios = append(scenarios, testScenario{ scenarios = append(scenarios, testScenario{
name: "Issue15220", name: "Issue15220",
failpoint: RandomFailpoint,
watch: watchConfig{ watch: watchConfig{
requestProgress: true, requestProgress: true,
}, },
@ -145,6 +142,15 @@ func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, s testSce
} }
defer r.clus.Close() defer r.clus.Close()
if s.failpoint == nil {
s.failpoint = pickRandomFailpoint(t, r.clus)
} else {
err = validateFailpoint(r.clus, s.failpoint)
if err != nil {
t.Fatal(err)
}
}
// t.Failed() returns false during panicking. We need to forcibly // t.Failed() returns false during panicking. We need to forcibly
// save data on panicking. // save data on panicking.
// Refer to: https://github.com/golang/go/issues/49929 // Refer to: https://github.com/golang/go/issues/49929