// Copyright 2022 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package robustness

import (
	"context"
	"path/filepath"
	"testing"
	"time"

	"go.uber.org/zap"
	"go.uber.org/zap/zaptest"
	"golang.org/x/sync/errgroup"

	"go.etcd.io/etcd/api/v3/version"
	"go.etcd.io/etcd/tests/v3/framework/e2e"
	"go.etcd.io/etcd/tests/v3/robustness/identity"
	"go.etcd.io/etcd/tests/v3/robustness/model"
	"go.etcd.io/etcd/tests/v3/robustness/report"
	"go.etcd.io/etcd/tests/v3/robustness/traffic"
	"go.etcd.io/etcd/tests/v3/robustness/validate"
)

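// TrafficProfile pairs a traffic generator with the load profile used to drive it.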
type TrafficProfile struct {
	Traffic traffic.Traffic
	Profile traffic.Profile
}

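// trafficProfiles lists the traffic/profile combinations exercised against both
// the single-node and three-node clusters in TestRobustness.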
var trafficProfiles = []TrafficProfile{
	{
		Traffic: traffic.EtcdPut,
		Profile: traffic.HighTrafficProfile,
	},
	{
		Traffic: traffic.EtcdPutDeleteLease,
		Profile: traffic.LowTraffic,
	},
	{
		Traffic: traffic.Kubernetes,
		Profile: traffic.HighTrafficProfile,
	},
	{
		Traffic: traffic.Kubernetes,
		Profile: traffic.LowTraffic,
	},
}

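// TestRobustness builds the scenario matrix (every traffic profile against clusters
// of size 1 and 3, plus regression scenarios for known issues) and runs each scenario
// as a subtest.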
func TestRobustness(t *testing.T) {
	testRunner.BeforeTest(t)
	v, err := e2e.GetVersionFromBinary(e2e.BinPath.Etcd)
	if err != nil {
		t.Fatalf("Failed checking etcd version binary, binary: %q, err: %v", e2e.BinPath.Etcd, err)
	}
	enableLazyFS := e2e.BinPath.LazyFSAvailable()
	baseOptions := []e2e.EPClusterOption{
		e2e.WithSnapshotCount(100),
		e2e.WithGoFailEnabled(true),
		e2e.WithCompactionBatchLimit(100),
		e2e.WithWatchProcessNotifyInterval(100 * time.Millisecond),
	}
	scenarios := []testScenario{}
	for _, tp := range trafficProfiles {
		name := filepath.Join(tp.Traffic.Name(), tp.Profile.Name, "ClusterOfSize1")
		clusterOfSize1Options := baseOptions
		clusterOfSize1Options = append(clusterOfSize1Options, e2e.WithClusterSize(1))
		// Enable LazyFS only for traffic profiles with lower QPS, because LazyFS
		// is CPU-intensive and would lower the achievable QPS.
		if enableLazyFS && tp.Profile.MinimalQPS <= 100 {
			clusterOfSize1Options = append(clusterOfSize1Options, e2e.WithLazyFSEnabled(true))
			name = filepath.Join(name, "LazyFS")
		}
		scenarios = append(scenarios, testScenario{
			name:    name,
			traffic: tp.Traffic,
			profile: tp.Profile,
			cluster: *e2e.NewConfig(clusterOfSize1Options...),
		})
	}

	for _, tp := range trafficProfiles {
		name := filepath.Join(tp.Traffic.Name(), tp.Profile.Name, "ClusterOfSize3")
		clusterOfSize3Options := baseOptions
		clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithIsPeerTLS(true))
		clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithPeerProxy(true))
		if !v.LessThan(version.V3_6) {
			clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithSnapshotCatchUpEntries(100))
		}
		scenarios = append(scenarios, testScenario{
			name:    name,
			traffic: tp.Traffic,
			profile: tp.Profile,
			cluster: *e2e.NewConfig(clusterOfSize3Options...),
		})
	}
	scenarios = append(scenarios, testScenario{
		name:      "Issue14370",
		failpoint: RaftBeforeSavePanic,
		profile:   traffic.LowTraffic,
		traffic:   traffic.EtcdPutDeleteLease,
		cluster: *e2e.NewConfig(
			e2e.WithClusterSize(1),
			e2e.WithGoFailEnabled(true),
		),
	})
	scenarios = append(scenarios, testScenario{
		name:      "Issue14685",
		failpoint: DefragBeforeCopyPanic,
		profile:   traffic.LowTraffic,
		traffic:   traffic.EtcdPutDeleteLease,
		cluster: *e2e.NewConfig(
			e2e.WithClusterSize(1),
			e2e.WithGoFailEnabled(true),
		),
	})
	scenarios = append(scenarios, testScenario{
		name:      "Issue13766",
		failpoint: KillFailpoint,
		profile:   traffic.HighTrafficProfile,
		traffic:   traffic.EtcdPut,
		cluster: *e2e.NewConfig(
			e2e.WithSnapshotCount(100),
		),
	})
	scenarios = append(scenarios, testScenario{
		name: "Issue15220",
		watch: watchConfig{
			requestProgress: true,
		},
		profile: traffic.LowTraffic,
		traffic: traffic.EtcdPutDeleteLease,
		cluster: *e2e.NewConfig(
			e2e.WithClusterSize(1),
		),
	})
	// TODO: Deflake waiting until snapshot for etcd versions that don't support setting snapshot catchup entries.
	if v.Compare(version.V3_6) >= 0 {
		scenarios = append(scenarios, testScenario{
			name:      "Issue15271",
			failpoint: BlackholeUntilSnapshot,
			profile:   traffic.HighTrafficProfile,
			traffic:   traffic.EtcdPut,
			cluster: *e2e.NewConfig(
				e2e.WithSnapshotCatchUpEntries(100),
				e2e.WithSnapshotCount(100),
				e2e.WithPeerProxy(true),
				e2e.WithIsPeerTLS(true),
			),
		})
	}
	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			lg := zaptest.NewLogger(t)
			scenario.cluster.Logger = lg
			ctx := context.Background()
			testRobustness(ctx, t, lg, scenario)
		})
	}
}

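// testScenario describes a single robustness run: the cluster configuration, the
// traffic and profile to generate, the failpoint to inject (picked at random when
// left nil), and the watch configuration.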
type testScenario struct {
	name      string
	failpoint Failpoint
	cluster   e2e.EtcdProcessClusterConfig
	traffic   traffic.Traffic
	profile   traffic.Profile
	watch     watchConfig
}

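// testRobustness starts the cluster, runs the scenario's traffic and watches while
// injecting its failpoint, force-stops the cluster, and validates the collected
// client reports. The report is persisted even if the test panics.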
func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, s testScenario) {
	report := report.TestReport{Logger: lg}
	var err error
	report.Cluster, err = e2e.NewEtcdProcessCluster(ctx, t, e2e.WithConfig(&s.cluster))
	if err != nil {
		t.Fatal(err)
	}
	defer report.Cluster.Close()

	if s.failpoint == nil {
		s.failpoint = pickRandomFailpoint(t, report.Cluster)
	} else {
		err = validateFailpoint(report.Cluster, s.failpoint)
		if err != nil {
			t.Fatal(err)
		}
	}

	// t.Failed() returns false during panicking. We need to forcibly
	// save data on panicking.
	// Refer to: https://github.com/golang/go/issues/49929
	panicked := true
	defer func() {
		report.Report(t, panicked)
	}()
	report.Client = s.run(ctx, t, lg, report.Cluster)
	forcestopCluster(report.Cluster)

	watchProgressNotifyEnabled := report.Cluster.Cfg.ServerConfig.ExperimentalWatchProgressNotifyInterval != 0
	validateGotAtLeastOneProgressNotify(t, report.Client, s.watch.requestProgress || watchProgressNotifyEnabled)
	validateConfig := validate.Config{ExpectRevisionUnique: s.traffic.ExpectUniqueRevision()}
	report.Visualize = validate.ValidateAndReturnVisualize(t, lg, validateConfig, report.Client)

	panicked = false
}

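// run drives the scenario concurrently: one goroutine injects failpoints, one
// generates traffic until failpoint injection finishes, and one collects watch
// events up to the maximum revision observed by the traffic goroutine.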
func (s testScenario) run(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) (reports []report.ClientReport) {
	g := errgroup.Group{}
	var operationReport, watchReport []report.ClientReport
	finishTraffic := make(chan struct{})

	// Capture baseTime with a time-measuring operation so that durations are computed
	// from a monotonic clock reading.
	// See https://github.com/golang/go/blob/master/src/time/time.go#L17
	baseTime := time.Now()
	ids := identity.NewIdProvider()
	g.Go(func() error {
		defer close(finishTraffic)
		injectFailpoints(ctx, t, lg, clus, s.failpoint)
		time.Sleep(time.Second)
		return nil
	})
	maxRevisionChan := make(chan int64, 1)
	g.Go(func() error {
		defer close(maxRevisionChan)
		operationReport = traffic.SimulateTraffic(ctx, t, lg, clus, s.profile, s.traffic, finishTraffic, baseTime, ids)
		maxRevisionChan <- operationsMaxRevision(operationReport)
		return nil
	})
	g.Go(func() error {
		watchReport = collectClusterWatchEvents(ctx, t, clus, maxRevisionChan, s.watch, baseTime, ids)
		return nil
	})
	g.Wait()
	return append(operationReport, watchReport...)
}

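// operationsMaxRevision returns the highest revision observed across all key-value
// operation responses in the given client reports.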
func operationsMaxRevision(reports []report.ClientReport) int64 {
	var maxRevision int64
	for _, r := range reports {
		for _, op := range r.KeyValue {
			resp := op.Output.(model.MaybeEtcdResponse)
			if resp.Revision > maxRevision {
				maxRevision = resp.Revision
			}
		}
	}
	return maxRevision
}

// forcestopCluster stops all etcd cluster members with a kill signal.
func forcestopCluster(clus *e2e.EtcdProcessCluster) error {
	for _, member := range clus.Procs {
		member.Kill()
	}
	return clus.ConcurrentStop()
}