// Copyright 2022 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package robustness

import (
	"context"
	"testing"
	"time"

	"github.com/anishathalye/porcupine"
	"go.uber.org/zap"
	"go.uber.org/zap/zaptest"
	"golang.org/x/sync/errgroup"

	"go.etcd.io/etcd/api/v3/version"
	"go.etcd.io/etcd/tests/v3/framework/e2e"
	"go.etcd.io/etcd/tests/v3/robustness/model"
)

const (
	// waitBetweenFailpointTriggers is the pause between consecutive failpoint injections.
	waitBetweenFailpointTriggers = time.Second
)
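
// Traffic profiles used by the scenarios below. Each profile bounds the QPS the
// test expects to sustain and defines the mix of request types sent to the cluster.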
var (
	LowTraffic = trafficConfig{
		name:            "LowTraffic",
		minimalQPS:      100,
		maximalQPS:      200,
		clientCount:     8,
		requestProgress: false,
		traffic: traffic{
			keyCount:     10,
			leaseTTL:     DefaultLeaseTTL,
			largePutSize: 32769,
			writes: []requestChance{
				{operation: Put, chance: 45},
				{operation: LargePut, chance: 5},
				{operation: Delete, chance: 10},
				{operation: MultiOpTxn, chance: 10},
				{operation: PutWithLease, chance: 10},
				{operation: LeaseRevoke, chance: 10},
				{operation: CompareAndSet, chance: 10},
			},
		},
	}
	HighTraffic = trafficConfig{
		name:            "HighTraffic",
		minimalQPS:      200,
		maximalQPS:      1000,
		clientCount:     12,
		requestProgress: false,
		traffic: traffic{
			keyCount:     10,
			largePutSize: 32769,
			leaseTTL:     DefaultLeaseTTL,
			writes: []requestChance{
				{operation: Put, chance: 85},
				{operation: MultiOpTxn, chance: 10},
				{operation: LargePut, chance: 5},
			},
		},
	}
	ReqProgTraffic = trafficConfig{
		name:            "RequestProgressTraffic",
		minimalQPS:      200,
		maximalQPS:      1000,
		clientCount:     12,
		requestProgress: true,
		traffic: traffic{
			keyCount:     10,
			largePutSize: 8196,
			leaseTTL:     DefaultLeaseTTL,
			writes: []requestChance{
				{operation: Put, chance: 95},
				{operation: LargePut, chance: 5},
			},
		},
	}
	defaultTraffic = LowTraffic
	trafficList    = []trafficConfig{
		LowTraffic, HighTraffic,
	}
)
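
// TestRobustness runs every scenario: it starts an etcd cluster, generates client
// traffic while a failpoint is triggered, and afterwards validates the recorded
// watch responses and checks the operation history for linearizability.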
func TestRobustness(t *testing.T) {
	testRunner.BeforeTest(t)
	v, err := e2e.GetVersionFromBinary(e2e.BinPath.Etcd)
	if err != nil {
		t.Fatalf("Failed to read etcd version from binary %q: %v", e2e.BinPath.Etcd, err)
	}
	type scenario struct {
		name      string
		failpoint Failpoint
		config    e2e.EtcdProcessClusterConfig
		traffic   *trafficConfig
	}
	scenarios := []scenario{}
	for _, traffic := range trafficList {
		traffic := traffic // shadow the loop variable so each scenario keeps its own pointer
		scenarios = append(scenarios, scenario{
			name:      "ClusterOfSize1/" + traffic.name,
			failpoint: RandomOneNodeClusterFailpoint,
			traffic:   &traffic,
			config: *e2e.NewConfig(
				e2e.WithClusterSize(1),
				e2e.WithSnapshotCount(100),
				e2e.WithGoFailEnabled(true),
				e2e.WithCompactionBatchLimit(100), // required for compactBeforeCommitBatch and compactAfterCommitBatch failpoints
				e2e.WithWatchProcessNotifyInterval(100*time.Millisecond),
			),
		})
		scenarios = append(scenarios, scenario{
			name:      "ClusterOfSize3/" + traffic.name,
			failpoint: RandomMultiNodeClusterFailpoint,
			traffic:   &traffic,
			config: *e2e.NewConfig(
				e2e.WithIsPeerTLS(true),
				e2e.WithSnapshotCount(100),
				e2e.WithPeerProxy(true),
				e2e.WithGoFailEnabled(true),
				e2e.WithCompactionBatchLimit(100), // required for compactBeforeCommitBatch and compactAfterCommitBatch failpoints
				e2e.WithWatchProcessNotifyInterval(100*time.Millisecond),
			),
		})
	}
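	// Regression scenario for https://github.com/etcd-io/etcd/issues/14370.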
	scenarios = append(scenarios, scenario{
		name:      "Issue14370",
		failpoint: RaftBeforeSavePanic,
		config: *e2e.NewConfig(
			e2e.WithClusterSize(1),
			e2e.WithGoFailEnabled(true),
		),
	})
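	// Regression scenario for https://github.com/etcd-io/etcd/issues/14685.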
	scenarios = append(scenarios, scenario{
		name:      "Issue14685",
		failpoint: DefragBeforeCopyPanic,
		config: *e2e.NewConfig(
			e2e.WithClusterSize(1),
			e2e.WithGoFailEnabled(true),
		),
	})
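	// Regression scenario for https://github.com/etcd-io/etcd/issues/13766.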
	scenarios = append(scenarios, scenario{
		name:      "Issue13766",
		failpoint: KillFailpoint,
		traffic:   &HighTraffic,
		config: *e2e.NewConfig(
			e2e.WithSnapshotCount(100),
		),
	})
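	// Regression scenario for https://github.com/etcd-io/etcd/issues/15220: bombarding
	// the server with progress-notification requests eventually causes a notification
	// to be delivered ahead of the payload message, which the watch response
	// validation then catches.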
	scenarios = append(scenarios, scenario{
		name:      "Issue15220",
		failpoint: RandomOneNodeClusterFailpoint,
		traffic:   &ReqProgTraffic,
		config: *e2e.NewConfig(
			e2e.WithClusterSize(1),
		),
	})
	snapshotOptions := []e2e.EPClusterOption{
		e2e.WithGoFailEnabled(true),
		e2e.WithSnapshotCount(100),
		e2e.WithPeerProxy(true),
		e2e.WithIsPeerTLS(true),
	}
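	// The snapshot catch-up entry count is only configurable on etcd v3.6 and later.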
	if v.Compare(version.V3_6) >= 0 {
		snapshotOptions = append(snapshotOptions, e2e.WithSnapshotCatchUpEntries(100))
	}
	scenarios = append(scenarios, scenario{
		name:      "Snapshot",
		failpoint: RandomSnapshotFailpoint,
		traffic:   &HighTraffic,
		config:    *e2e.NewConfig(snapshotOptions...),
	})
	for _, scenario := range scenarios {
		if scenario.traffic == nil {
			scenario.traffic = &defaultTraffic
		}
		t.Run(scenario.name, func(t *testing.T) {
			lg := zaptest.NewLogger(t)
			scenario.config.Logger = lg
			ctx := context.Background()
			testRobustness(ctx, t, lg, scenario.config, scenario.traffic, FailpointConfig{
				failpoint:           scenario.failpoint,
				count:               1,
				retries:             3,
				waitBetweenTriggers: waitBetweenFailpointTriggers,
			})
		})
	}
}
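
// testRobustness runs a single scenario: it brings up a cluster with the given
// configuration, drives traffic and failpoints through runScenario, and then
// validates watch responses, event consistency, and operation linearizability.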
func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, config e2e.EtcdProcessClusterConfig, traffic *trafficConfig, failpoint FailpointConfig) {
	r := report{lg: lg}
	var err error
	r.clus, err = e2e.NewEtcdProcessCluster(ctx, t, e2e.WithConfig(&config))
	if err != nil {
		t.Fatal(err)
	}
	defer r.clus.Close()
	defer func() {
		r.Report(t)
	}()
	r.operations, r.responses = runScenario(ctx, t, lg, r.clus, *traffic, failpoint)
	forcestopCluster(r.clus)

	watchProgressNotifyEnabled := r.clus.Cfg.WatchProcessNotifyInterval != 0
	validateWatchResponses(t, r.responses, traffic.requestProgress || watchProgressNotifyEnabled)

	r.events = watchEvents(r.responses)
	validateEventsMatch(t, r.events)

	r.patchedOperations = patchOperationBasedOnWatchEvents(r.operations, longestHistory(r.events))
	r.visualizeHistory = model.ValidateOperationHistoryAndReturnVisualize(t, lg, r.patchedOperations)
}
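
// runScenario coordinates three goroutines: one triggers failpoints and closes
// finishTraffic when done, one generates traffic until finishTraffic closes and
// publishes the highest observed revision, and one collects watch events from the
// cluster until that revision has been seen.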
func runScenario(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, traffic trafficConfig, failpoint FailpointConfig) (operations []porcupine.Operation, responses [][]watchResponse) {
	g := errgroup.Group{}
	finishTraffic := make(chan struct{})
	g.Go(func() error {
		defer close(finishTraffic)
		triggerFailpoints(ctx, t, lg, clus, failpoint)
		time.Sleep(time.Second)
		return nil
	})
	maxRevisionChan := make(chan int64, 1)
	g.Go(func() error {
		defer close(maxRevisionChan)
		operations = simulateTraffic(ctx, t, lg, clus, traffic, finishTraffic)
		maxRevisionChan <- operationsMaxRevision(operations)
		return nil
	})
	g.Go(func() error {
		responses = collectClusterWatchEvents(ctx, t, clus, maxRevisionChan, traffic.requestProgress)
		return nil
	})
	g.Wait()
	return operations, responses
}
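
// operationsMaxRevision returns the highest revision observed in any recorded
// response, which tells the watch collectors how far they must read before stopping.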
func operationsMaxRevision(operations []porcupine.Operation) int64 {
	var maxRevision int64
	for _, op := range operations {
		revision := op.Output.(model.EtcdResponse).Revision
		if revision > maxRevision {
			maxRevision = revision
		}
	}
	return maxRevision
}

// forcestopCluster stops every etcd member with SIGKILL, then releases the
// cluster's resources.
func forcestopCluster(clus *e2e.EtcdProcessCluster) error {
	for _, member := range clus.Procs {
		member.Kill()
	}
	return clus.Stop()
}