etcd/tests/linearizability/failpoints.go
Marek Siarkowicz 0c004a6ce4 tests: Propagage logger through linearizability tests
Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>
2023-01-30 14:30:21 +01:00

320 lines
12 KiB
Go

// Copyright 2022 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package linearizability
import (
"context"
"fmt"
"math/rand"
"strings"
"testing"
"time"
"go.uber.org/zap"
clientv3 "go.etcd.io/etcd/client/v3"
"go.etcd.io/etcd/tests/v3/framework/e2e"
)
const (
triggerTimeout = 2 * time.Second
)
var (
KillFailpoint Failpoint = killFailpoint{}
DefragBeforeCopyPanic Failpoint = goPanicFailpoint{"defragBeforeCopy", triggerDefrag, AnyMember}
DefragBeforeRenamePanic Failpoint = goPanicFailpoint{"defragBeforeRename", triggerDefrag, AnyMember}
BeforeCommitPanic Failpoint = goPanicFailpoint{"beforeCommit", nil, AnyMember}
AfterCommitPanic Failpoint = goPanicFailpoint{"afterCommit", nil, AnyMember}
RaftBeforeSavePanic Failpoint = goPanicFailpoint{"raftBeforeSave", nil, AnyMember}
RaftAfterSavePanic Failpoint = goPanicFailpoint{"raftAfterSave", nil, AnyMember}
BackendBeforePreCommitHookPanic Failpoint = goPanicFailpoint{"commitBeforePreCommitHook", nil, AnyMember}
BackendAfterPreCommitHookPanic Failpoint = goPanicFailpoint{"commitAfterPreCommitHook", nil, AnyMember}
BackendBeforeStartDBTxnPanic Failpoint = goPanicFailpoint{"beforeStartDBTxn", nil, AnyMember}
BackendAfterStartDBTxnPanic Failpoint = goPanicFailpoint{"afterStartDBTxn", nil, AnyMember}
BackendBeforeWritebackBufPanic Failpoint = goPanicFailpoint{"beforeWritebackBuf", nil, AnyMember}
BackendAfterWritebackBufPanic Failpoint = goPanicFailpoint{"afterWritebackBuf", nil, AnyMember}
CompactBeforeCommitScheduledCompactPanic Failpoint = goPanicFailpoint{"compactBeforeCommitScheduledCompact", triggerCompact, AnyMember}
CompactAfterCommitScheduledCompactPanic Failpoint = goPanicFailpoint{"compactAfterCommitScheduledCompact", triggerCompact, AnyMember}
CompactBeforeSetFinishedCompactPanic Failpoint = goPanicFailpoint{"compactBeforeSetFinishedCompact", triggerCompact, AnyMember}
CompactAfterSetFinishedCompactPanic Failpoint = goPanicFailpoint{"compactAfterSetFinishedCompact", triggerCompact, AnyMember}
CompactBeforeCommitBatchPanic Failpoint = goPanicFailpoint{"compactBeforeCommitBatch", triggerCompact, AnyMember}
CompactAfterCommitBatchPanic Failpoint = goPanicFailpoint{"compactAfterCommitBatch", triggerCompact, AnyMember}
RaftBeforeLeaderSendPanic Failpoint = goPanicFailpoint{"raftBeforeLeaderSend", nil, Leader}
BlackholePeerNetwork Failpoint = blackholePeerNetworkFailpoint{duration: time.Second}
DelayPeerNetwork Failpoint = delayPeerNetworkFailpoint{duration: time.Second, baseLatency: 75 * time.Millisecond, randomizedLatency: 50 * time.Millisecond}
RandomFailpoint Failpoint = randomFailpoint{[]Failpoint{
KillFailpoint, BeforeCommitPanic, AfterCommitPanic, RaftBeforeSavePanic,
RaftAfterSavePanic, DefragBeforeCopyPanic, DefragBeforeRenamePanic,
BackendBeforePreCommitHookPanic, BackendAfterPreCommitHookPanic,
BackendBeforeStartDBTxnPanic, BackendAfterStartDBTxnPanic,
BackendBeforeWritebackBufPanic, BackendAfterWritebackBufPanic,
CompactBeforeCommitScheduledCompactPanic, CompactAfterCommitScheduledCompactPanic,
CompactBeforeSetFinishedCompactPanic, CompactAfterSetFinishedCompactPanic,
CompactBeforeCommitBatchPanic, CompactAfterCommitBatchPanic,
RaftBeforeLeaderSendPanic,
BlackholePeerNetwork,
DelayPeerNetwork,
}}
// TODO: Figure out how to reliably trigger below failpoints and add them to RandomFailpoint
raftBeforeApplySnapPanic Failpoint = goPanicFailpoint{"raftBeforeApplySnap", nil, AnyMember}
raftAfterApplySnapPanic Failpoint = goPanicFailpoint{"raftAfterApplySnap", nil, AnyMember}
raftAfterWALReleasePanic Failpoint = goPanicFailpoint{"raftAfterWALRelease", nil, AnyMember}
raftBeforeFollowerSendPanic Failpoint = goPanicFailpoint{"raftBeforeFollowerSend", nil, AnyMember}
raftBeforeSaveSnapPanic Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", nil, AnyMember}
raftAfterSaveSnapPanic Failpoint = goPanicFailpoint{"raftAfterSaveSnap", nil, AnyMember}
)
type Failpoint interface {
Trigger(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error
Name() string
Available(e2e.EtcdProcess) bool
}
type killFailpoint struct{}
func (f killFailpoint) Trigger(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
member := clus.Procs[rand.Int()%len(clus.Procs)]
killCtx, cancel := context.WithTimeout(ctx, triggerTimeout)
defer cancel()
for member.IsRunning() {
err := member.Kill()
if err != nil {
lg.Info("Sending kill signal failed", zap.Error(err))
}
err = member.Wait(killCtx)
if err != nil && !strings.Contains(err.Error(), "unexpected exit code") {
lg.Info("Failed to kill the process", zap.Error(err))
return fmt.Errorf("failed to kill the process within %s, err: %w", triggerTimeout, err)
}
}
err := member.Start(ctx)
if err != nil {
return err
}
return nil
}
func (f killFailpoint) Name() string {
return "Kill"
}
func (f killFailpoint) Available(e2e.EtcdProcess) bool {
return true
}
type goPanicFailpoint struct {
failpoint string
trigger func(ctx context.Context, member e2e.EtcdProcess) error
target failpointTarget
}
type failpointTarget string
const (
AnyMember failpointTarget = "AnyMember"
Leader failpointTarget = "Leader"
)
func (f goPanicFailpoint) Trigger(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
member := f.pickMember(t, clus)
triggerCtx, cancel := context.WithTimeout(ctx, triggerTimeout)
defer cancel()
for member.IsRunning() {
lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name()))
err := member.Failpoints().Setup(triggerCtx, f.failpoint, "panic")
if err != nil {
lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err))
}
if !member.IsRunning() {
// TODO: Check member logs that etcd not running is caused panic caused by proper gofailpoint.
break
}
if f.trigger != nil {
lg.Info("Triggering gofailpoint", zap.String("failpoint", f.Name()))
err = f.trigger(triggerCtx, member)
if err != nil {
lg.Info("gofailpoint trigger failed", zap.String("failpoint", f.Name()), zap.Error(err))
}
}
lg.Info("Waiting for member to exist", zap.String("member", member.Config().Name))
err = member.Wait(triggerCtx)
if err != nil && !strings.Contains(err.Error(), "unexpected exit code") {
lg.Info("Member didn't exit as expected", zap.String("member", member.Config().Name), zap.Error(err))
return fmt.Errorf("member didn't exit as expected: %v", err)
}
lg.Info("Member existed as expected", zap.String("member", member.Config().Name))
}
err := member.Start(ctx)
if err != nil {
return err
}
return nil
}
func (f goPanicFailpoint) pickMember(t *testing.T, clus *e2e.EtcdProcessCluster) e2e.EtcdProcess {
switch f.target {
case AnyMember:
return clus.Procs[rand.Int()%len(clus.Procs)]
case Leader:
return clus.Procs[clus.WaitLeader(t)]
default:
panic("unknown target")
}
}
func (f goPanicFailpoint) Available(member e2e.EtcdProcess) bool {
memberFailpoints := member.Failpoints()
if memberFailpoints == nil {
return false
}
available := memberFailpoints.Available()
_, found := available[f.failpoint]
return found
}
func (f goPanicFailpoint) Name() string {
return f.failpoint
}
func triggerDefrag(ctx context.Context, member e2e.EtcdProcess) error {
cc, err := clientv3.New(clientv3.Config{
Endpoints: member.EndpointsV3(),
Logger: zap.NewNop(),
DialKeepAliveTime: 1 * time.Millisecond,
DialKeepAliveTimeout: 5 * time.Millisecond,
})
if err != nil {
return fmt.Errorf("failed creating client: %w", err)
}
defer cc.Close()
_, err = cc.Defragment(ctx, member.EndpointsV3()[0])
if err != nil && !strings.Contains(err.Error(), "error reading from server: EOF") {
return err
}
return nil
}
func triggerCompact(ctx context.Context, member e2e.EtcdProcess) error {
cc, err := clientv3.New(clientv3.Config{
Endpoints: member.EndpointsV3(),
Logger: zap.NewNop(),
DialKeepAliveTime: 1 * time.Millisecond,
DialKeepAliveTimeout: 5 * time.Millisecond,
})
if err != nil {
return fmt.Errorf("failed creating client: %w", err)
}
defer cc.Close()
resp, err := cc.Get(ctx, "/")
if err != nil {
return err
}
_, err = cc.Compact(ctx, resp.Header.Revision)
if err != nil && !strings.Contains(err.Error(), "error reading from server: EOF") {
return err
}
return nil
}
type randomFailpoint struct {
failpoints []Failpoint
}
func (f randomFailpoint) Trigger(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
availableFailpoints := make([]Failpoint, 0, len(f.failpoints))
for _, failpoint := range f.failpoints {
count := 0
for _, proc := range clus.Procs {
if failpoint.Available(proc) {
count++
}
}
if count == len(clus.Procs) {
availableFailpoints = append(availableFailpoints, failpoint)
}
}
failpoint := availableFailpoints[rand.Int()%len(availableFailpoints)]
lg.Info("Triggering failpoint\n", zap.String("failpoint", failpoint.Name()))
return failpoint.Trigger(ctx, t, lg, clus)
}
func (f randomFailpoint) Name() string {
return "Random"
}
func (f randomFailpoint) Available(e2e.EtcdProcess) bool {
return true
}
type blackholePeerNetworkFailpoint struct {
duration time.Duration
}
func (f blackholePeerNetworkFailpoint) Trigger(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
member := clus.Procs[rand.Int()%len(clus.Procs)]
proxy := member.PeerProxy()
proxy.BlackholeTx()
proxy.BlackholeRx()
lg.Info("Blackholing traffic from and to member", zap.String("member", member.Config().Name))
time.Sleep(f.duration)
lg.Info("Traffic restored from and to member", zap.String("member", member.Config().Name))
proxy.UnblackholeTx()
proxy.UnblackholeRx()
return nil
}
func (f blackholePeerNetworkFailpoint) Name() string {
return "blackhole"
}
func (f blackholePeerNetworkFailpoint) Available(clus e2e.EtcdProcess) bool {
return clus.PeerProxy() != nil
}
type delayPeerNetworkFailpoint struct {
duration time.Duration
baseLatency time.Duration
randomizedLatency time.Duration
}
func (f delayPeerNetworkFailpoint) Trigger(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
member := clus.Procs[rand.Int()%len(clus.Procs)]
proxy := member.PeerProxy()
proxy.DelayRx(f.baseLatency, f.randomizedLatency)
proxy.DelayTx(f.baseLatency, f.randomizedLatency)
lg.Info("Delaying traffic from and to member", zap.String("member", member.Config().Name), zap.Duration("baseLatency", f.baseLatency), zap.Duration("randomizedLatency", f.randomizedLatency))
time.Sleep(f.duration)
lg.Info("Traffic delay removed", zap.String("member", member.Config().Name))
proxy.UndelayRx()
proxy.UndelayTx()
return nil
}
func (f delayPeerNetworkFailpoint) Name() string {
return "delay"
}
func (f delayPeerNetworkFailpoint) Available(clus e2e.EtcdProcess) bool {
return clus.PeerProxy() != nil
}