From d8bb19327b6fc5a4eba57005de0a7bbeaab4ec02 Mon Sep 17 00:00:00 2001
From: Marek Siarkowicz
Date: Mon, 13 May 2024 10:43:02 +0200
Subject: [PATCH] Prevent picking a failpoint that waits till snapshot on
 clusters that don't support lowering snapshot catchup entries, but still
 allow reproducing issue #15271

Signed-off-by: Marek Siarkowicz
---
 tests/robustness/failpoint/network.go | 15 ++++++++++++---
 tests/robustness/main_test.go         |  4 ----
 tests/robustness/scenarios.go         |  8 +++++---
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/tests/robustness/failpoint/network.go b/tests/robustness/failpoint/network.go
index 51e72cfae..d202454bc 100644
--- a/tests/robustness/failpoint/network.go
+++ b/tests/robustness/failpoint/network.go
@@ -57,6 +57,11 @@ func (tb triggerBlackhole) Trigger(ctx context.Context, t *testing.T, member e2e
 }
 
 func (tb triggerBlackhole) Available(config e2e.EtcdProcessClusterConfig, process e2e.EtcdProcess) bool {
+	// Avoid triggering the failpoint if waiting for the snapshot would take too long to fit into the timeout.
+	// The number of entries required to guarantee a snapshot depends on the etcd configuration.
+	if tb.waitTillSnapshot && entriesToGuaranteeSnapshot(config) > 200 {
+		return false
+	}
 	return config.ClusterSize > 1 && process.PeerProxy() != nil
 }
 
@@ -127,9 +132,7 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl
 		}
 		t.Logf("clusterRevision: %d, blackholedMemberRevision: %d", clusterRevision, blackholedMemberRevision)
 		// Blackholed member has to be sufficiently behind to trigger snapshot transfer.
-		// Need to make sure leader compacted latest revBlackholedMem inside EtcdServer.snapshot.
-		// That's why we wait for clus.Cfg.SnapshotCount (to trigger snapshot) + clus.Cfg.SnapshotCatchUpEntries (EtcdServer.snapshot compaction offset)
-		if clusterRevision-blackholedMemberRevision > int64(clus.Cfg.ServerConfig.SnapshotCount+clus.Cfg.ServerConfig.SnapshotCatchUpEntries) {
+		if clusterRevision-blackholedMemberRevision > int64(entriesToGuaranteeSnapshot(*clus.Cfg)) {
 			break
 		}
 		time.Sleep(100 * time.Millisecond)
@@ -137,6 +140,12 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl
 	return nil
 }
 
+func entriesToGuaranteeSnapshot(config e2e.EtcdProcessClusterConfig) uint64 {
+	// Need to make sure the leader compacted the latest revBlackholedMem inside EtcdServer.snapshot.
+	// That's why we wait for SnapshotCount entries (to trigger a snapshot) + SnapshotCatchUpEntries (the EtcdServer.snapshot compaction offset).
+	return config.ServerConfig.SnapshotCount + config.ServerConfig.SnapshotCatchUpEntries
+}
+
 // latestRevisionForEndpoint gets latest revision of the first endpoint in Client.Endpoints list
 func latestRevisionForEndpoint(ctx context.Context, c *clientv3.Client) (int64, error) {
 	resp, err := c.Status(ctx, c.Endpoints()[0])
diff --git a/tests/robustness/main_test.go b/tests/robustness/main_test.go
index 3e59e5b3c..1d078574f 100644
--- a/tests/robustness/main_test.go
+++ b/tests/robustness/main_test.go
@@ -78,10 +78,6 @@ func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, s testSce
 			t.Fatal(err)
 		}
 	}
-	err = failpoint.Validate(r.Cluster, s.failpoint)
-	if err != nil {
-		t.Fatal(err)
-	}
 
 	// t.Failed() returns false during panicking. We need to forcibly
 	// save data on panicking.
diff --git a/tests/robustness/scenarios.go b/tests/robustness/scenarios.go
index e5923f873..7cd121b6e 100644
--- a/tests/robustness/scenarios.go
+++ b/tests/robustness/scenarios.go
@@ -81,6 +81,11 @@ func exploratoryScenarios(t *testing.T) []testScenario {
 		e2e.WithCompactionBatchLimit(100),
 		e2e.WithWatchProcessNotifyInterval(100 * time.Millisecond),
 	}
+	// snapshot-catchup-entries flag was backported in https://github.com/etcd-io/etcd/pull/17808
+	v3_5_13 := semver.Version{Major: 3, Minor: 5, Patch: 13}
+	if v.Compare(v3_5_13) >= 0 {
+		baseOptions = append(baseOptions, e2e.WithSnapshotCatchUpEntries(100))
+	}
 	scenarios := []testScenario{}
 	for _, tp := range trafficProfiles {
 		name := filepath.Join(tp.Traffic.Name(), tp.Profile.Name, "ClusterOfSize1")
@@ -104,9 +109,6 @@ func exploratoryScenarios(t *testing.T) []testScenario {
 		clusterOfSize3Options := baseOptions
 		clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithIsPeerTLS(true))
 		clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithPeerProxy(true))
-		if !v.LessThan(version.V3_6) {
-			clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithSnapshotCatchUpEntries(100))
-		}
 		scenarios = append(scenarios, testScenario{
 			name: name,
 			traffic: tp.Traffic,
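
For illustration, a minimal, self-contained sketch of the gating logic the network.go hunks introduce, assuming a simplified configuration struct. serverConfig and availableForSnapshotWait are stand-in names invented here (the real types are e2e.EtcdProcessClusterConfig and the triggerBlackhole.Available method); only the 200-entry budget and the SnapshotCount + SnapshotCatchUpEntries arithmetic come from the patch itself.

package main

import "fmt"

// serverConfig is a simplified stand-in for the server settings embedded in
// e2e.EtcdProcessClusterConfig; only the two fields the patch relies on are kept.
type serverConfig struct {
	SnapshotCount          uint64
	SnapshotCatchUpEntries uint64
}

// entriesToGuaranteeSnapshot mirrors the helper added by the patch: the leader
// needs SnapshotCount entries to trigger a snapshot, plus SnapshotCatchUpEntries
// before it compacts away the index the blackholed member is stuck at.
func entriesToGuaranteeSnapshot(cfg serverConfig) uint64 {
	return cfg.SnapshotCount + cfg.SnapshotCatchUpEntries
}

// availableForSnapshotWait mimics the new guard in Available: skip the
// failpoint when waiting for a snapshot would not fit into the test timeout.
func availableForSnapshotWait(cfg serverConfig, waitTillSnapshot bool) bool {
	if waitTillSnapshot && entriesToGuaranteeSnapshot(cfg) > 200 {
		return false
	}
	return true
}

func main() {
	lowered := serverConfig{SnapshotCount: 50, SnapshotCatchUpEntries: 100}      // illustrative lowered settings
	unlowered := serverConfig{SnapshotCount: 10000, SnapshotCatchUpEntries: 5000} // far above the 200-entry budget
	fmt.Println(availableForSnapshotWait(lowered, true))   // true: 150 entries fit into the timeout
	fmt.Println(availableForSnapshotWait(unlowered, true)) // false: 15000 entries would take too long
}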
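
The scenarios.go hunks key WithSnapshotCatchUpEntries off v3.5.13, the first 3.5 release carrying the backported snapshot-catchup-entries flag, instead of gating on v3.6. Below is a small sketch of that version check using the same go-semver package the patch imports; supportsSnapshotCatchUpEntries is a hypothetical helper name, the patch performs the comparison inline.

package main

import (
	"fmt"

	"github.com/coreos/go-semver/semver"
)

// supportsSnapshotCatchUpEntries reports whether the binary under test accepts
// the backported snapshot-catchup-entries flag: v3.5.13 and anything newer
// (which includes v3.6) compare >= 0 against the cutoff version.
func supportsSnapshotCatchUpEntries(v semver.Version) bool {
	v3_5_13 := semver.Version{Major: 3, Minor: 5, Patch: 13}
	return v.Compare(v3_5_13) >= 0
}

func main() {
	for _, s := range []string{"3.5.12", "3.5.13", "3.6.0"} {
		fmt.Printf("%s supported: %v\n", s, supportsSnapshotCatchUpEntries(*semver.New(s)))
	}
	// Output:
	// 3.5.12 supported: false
	// 3.5.13 supported: true
	// 3.6.0 supported: true
}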