mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00

This issue is somewhat easily reproduced simply by bombarding the server with requests for progress notifications, which eventually leads to one being delivered ahead of the payload message. This is then caught by the watch response validation code previously added by Marek Siarkowicz. Signed-off-by: Peter Wortmann <peter.wortmann@skao.int>
272 lines
7.7 KiB
Go
272 lines
7.7 KiB
Go
// Copyright 2022 The etcd Authors
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package robustness
|
|
|
|
import (
|
|
"context"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/anishathalye/porcupine"
|
|
"go.uber.org/zap"
|
|
"go.uber.org/zap/zaptest"
|
|
"golang.org/x/sync/errgroup"
|
|
|
|
"go.etcd.io/etcd/api/v3/version"
|
|
"go.etcd.io/etcd/tests/v3/framework/e2e"
|
|
"go.etcd.io/etcd/tests/v3/robustness/model"
|
|
)
|
|
|
|
// waitBetweenFailpointTriggers is the duration TestRobustness passes as
// FailpointConfig.waitBetweenTriggers for every scenario it runs.
const waitBetweenFailpointTriggers = time.Second
|
|
|
|
var (
|
|
LowTraffic = trafficConfig{
|
|
name: "LowTraffic",
|
|
minimalQPS: 100,
|
|
maximalQPS: 200,
|
|
clientCount: 8,
|
|
requestProgress: false,
|
|
traffic: traffic{
|
|
keyCount: 10,
|
|
leaseTTL: DefaultLeaseTTL,
|
|
largePutSize: 32769,
|
|
writes: []requestChance{
|
|
{operation: Put, chance: 45},
|
|
{operation: LargePut, chance: 5},
|
|
{operation: Delete, chance: 10},
|
|
{operation: MultiOpTxn, chance: 10},
|
|
{operation: PutWithLease, chance: 10},
|
|
{operation: LeaseRevoke, chance: 10},
|
|
{operation: CompareAndSet, chance: 10},
|
|
},
|
|
},
|
|
}
|
|
HighTraffic = trafficConfig{
|
|
name: "HighTraffic",
|
|
minimalQPS: 200,
|
|
maximalQPS: 1000,
|
|
clientCount: 12,
|
|
requestProgress: false,
|
|
traffic: traffic{
|
|
keyCount: 10,
|
|
largePutSize: 32769,
|
|
leaseTTL: DefaultLeaseTTL,
|
|
writes: []requestChance{
|
|
{operation: Put, chance: 85},
|
|
{operation: MultiOpTxn, chance: 10},
|
|
{operation: LargePut, chance: 5},
|
|
},
|
|
},
|
|
}
|
|
ReqProgTraffic = trafficConfig{
|
|
name: "RequestProgressTraffic",
|
|
minimalQPS: 200,
|
|
maximalQPS: 1000,
|
|
clientCount: 12,
|
|
requestProgress: true,
|
|
traffic: traffic{
|
|
keyCount: 10,
|
|
largePutSize: 8196,
|
|
leaseTTL: DefaultLeaseTTL,
|
|
writes: []requestChance{
|
|
{operation: Put, chance: 95},
|
|
{operation: LargePut, chance: 5},
|
|
},
|
|
},
|
|
}
|
|
defaultTraffic = LowTraffic
|
|
trafficList = []trafficConfig{
|
|
LowTraffic, HighTraffic,
|
|
}
|
|
)
|
|
|
|
func TestRobustness(t *testing.T) {
|
|
testRunner.BeforeTest(t)
|
|
v, err := e2e.GetVersionFromBinary(e2e.BinPath.Etcd)
|
|
if err != nil {
|
|
t.Fatalf("Failed checking etcd version binary, binary: %q, err: %v", e2e.BinPath.Etcd, err)
|
|
}
|
|
type scenario struct {
|
|
name string
|
|
failpoint Failpoint
|
|
config e2e.EtcdProcessClusterConfig
|
|
traffic *trafficConfig
|
|
}
|
|
scenarios := []scenario{}
|
|
for _, traffic := range trafficList {
|
|
scenarios = append(scenarios, scenario{
|
|
name: "ClusterOfSize1/" + traffic.name,
|
|
failpoint: RandomOneNodeClusterFailpoint,
|
|
traffic: &traffic,
|
|
config: *e2e.NewConfig(
|
|
e2e.WithClusterSize(1),
|
|
e2e.WithSnapshotCount(100),
|
|
e2e.WithGoFailEnabled(true),
|
|
e2e.WithCompactionBatchLimit(100), // required for compactBeforeCommitBatch and compactAfterCommitBatch failpoints
|
|
e2e.WithWatchProcessNotifyInterval(100*time.Millisecond),
|
|
),
|
|
})
|
|
scenarios = append(scenarios, scenario{
|
|
name: "ClusterOfSize3/" + traffic.name,
|
|
failpoint: RandomMultiNodeClusterFailpoint,
|
|
traffic: &traffic,
|
|
config: *e2e.NewConfig(
|
|
e2e.WithIsPeerTLS(true),
|
|
e2e.WithSnapshotCount(100),
|
|
e2e.WithPeerProxy(true),
|
|
e2e.WithGoFailEnabled(true),
|
|
e2e.WithCompactionBatchLimit(100), // required for compactBeforeCommitBatch and compactAfterCommitBatch failpoints
|
|
e2e.WithWatchProcessNotifyInterval(100*time.Millisecond),
|
|
),
|
|
})
|
|
}
|
|
scenarios = append(scenarios, scenario{
|
|
name: "Issue14370",
|
|
failpoint: RaftBeforeSavePanic,
|
|
config: *e2e.NewConfig(
|
|
e2e.WithClusterSize(1),
|
|
e2e.WithGoFailEnabled(true),
|
|
),
|
|
})
|
|
scenarios = append(scenarios, scenario{
|
|
name: "Issue14685",
|
|
failpoint: DefragBeforeCopyPanic,
|
|
config: *e2e.NewConfig(
|
|
e2e.WithClusterSize(1),
|
|
e2e.WithGoFailEnabled(true),
|
|
),
|
|
})
|
|
scenarios = append(scenarios, scenario{
|
|
name: "Issue13766",
|
|
failpoint: KillFailpoint,
|
|
traffic: &HighTraffic,
|
|
config: *e2e.NewConfig(
|
|
e2e.WithSnapshotCount(100),
|
|
),
|
|
})
|
|
scenarios = append(scenarios, scenario{
|
|
name: "Issue15220",
|
|
failpoint: RandomOneNodeClusterFailpoint,
|
|
traffic: &ReqProgTraffic,
|
|
config: *e2e.NewConfig(
|
|
e2e.WithClusterSize(1),
|
|
),
|
|
})
|
|
snapshotOptions := []e2e.EPClusterOption{
|
|
e2e.WithGoFailEnabled(true),
|
|
e2e.WithSnapshotCount(100),
|
|
e2e.WithPeerProxy(true),
|
|
e2e.WithIsPeerTLS(true),
|
|
}
|
|
if v.Compare(version.V3_6) >= 0 {
|
|
snapshotOptions = append(snapshotOptions, e2e.WithSnapshotCatchUpEntries(100))
|
|
}
|
|
scenarios = append(scenarios, scenario{
|
|
name: "Snapshot",
|
|
failpoint: RandomSnapshotFailpoint,
|
|
traffic: &HighTraffic,
|
|
config: *e2e.NewConfig(snapshotOptions...),
|
|
})
|
|
for _, scenario := range scenarios {
|
|
if scenario.traffic == nil {
|
|
scenario.traffic = &defaultTraffic
|
|
}
|
|
|
|
t.Run(scenario.name, func(t *testing.T) {
|
|
lg := zaptest.NewLogger(t)
|
|
scenario.config.Logger = lg
|
|
ctx := context.Background()
|
|
testRobustness(ctx, t, lg, scenario.config, scenario.traffic, FailpointConfig{
|
|
failpoint: scenario.failpoint,
|
|
count: 1,
|
|
retries: 3,
|
|
waitBetweenTriggers: waitBetweenFailpointTriggers,
|
|
})
|
|
})
|
|
}
|
|
}
|
|
|
|
func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, config e2e.EtcdProcessClusterConfig, traffic *trafficConfig, failpoint FailpointConfig) {
|
|
r := report{lg: lg}
|
|
var err error
|
|
r.clus, err = e2e.NewEtcdProcessCluster(ctx, t, e2e.WithConfig(&config))
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.clus.Close()
|
|
|
|
defer func() {
|
|
r.Report(t)
|
|
}()
|
|
r.operations, r.responses = runScenario(ctx, t, lg, r.clus, *traffic, failpoint)
|
|
forcestopCluster(r.clus)
|
|
|
|
watchProgressNotifyEnabled := r.clus.Cfg.WatchProcessNotifyInterval != 0
|
|
validateWatchResponses(t, r.responses, traffic.requestProgress || watchProgressNotifyEnabled)
|
|
|
|
r.events = watchEvents(r.responses)
|
|
validateEventsMatch(t, r.events)
|
|
|
|
r.patchedOperations = patchOperationBasedOnWatchEvents(r.operations, longestHistory(r.events))
|
|
r.visualizeHistory = model.ValidateOperationHistoryAndReturnVisualize(t, lg, r.patchedOperations)
|
|
}
|
|
|
|
func runScenario(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, traffic trafficConfig, failpoint FailpointConfig) (operations []porcupine.Operation, responses [][]watchResponse) {
|
|
g := errgroup.Group{}
|
|
finishTraffic := make(chan struct{})
|
|
|
|
g.Go(func() error {
|
|
defer close(finishTraffic)
|
|
triggerFailpoints(ctx, t, lg, clus, failpoint)
|
|
time.Sleep(time.Second)
|
|
return nil
|
|
})
|
|
maxRevisionChan := make(chan int64, 1)
|
|
g.Go(func() error {
|
|
defer close(maxRevisionChan)
|
|
operations = simulateTraffic(ctx, t, lg, clus, traffic, finishTraffic)
|
|
maxRevisionChan <- operationsMaxRevision(operations)
|
|
return nil
|
|
})
|
|
g.Go(func() error {
|
|
responses = collectClusterWatchEvents(ctx, t, clus, maxRevisionChan, traffic.requestProgress)
|
|
return nil
|
|
})
|
|
g.Wait()
|
|
return operations, responses
|
|
}
|
|
|
|
func operationsMaxRevision(operations []porcupine.Operation) int64 {
|
|
var maxRevision int64
|
|
for _, op := range operations {
|
|
revision := op.Output.(model.EtcdResponse).Revision
|
|
if revision > maxRevision {
|
|
maxRevision = revision
|
|
}
|
|
}
|
|
return maxRevision
|
|
}
|
|
|
|
// forcestopCluster stops the etcd member with signal kill.
|
|
func forcestopCluster(clus *e2e.EtcdProcessCluster) error {
|
|
for _, member := range clus.Procs {
|
|
member.Kill()
|
|
}
|
|
return clus.Stop()
|
|
}
|