Marek Siarkowicz 3471ef133d Add an e2e test and robustness failpoint around recovering from snapshot backend
Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>
2024-01-04 15:25:24 +01:00

236 lines
9.6 KiB
Go

// Copyright 2023 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package failpoint
import (
"context"
"fmt"
"math/rand"
"strings"
"testing"
"time"
"go.uber.org/zap"
"go.etcd.io/etcd/tests/v3/framework/e2e"
)
var (
DefragBeforeCopyPanic Failpoint = goPanicFailpoint{"defragBeforeCopy", triggerDefrag{}, AnyMember}
DefragBeforeRenamePanic Failpoint = goPanicFailpoint{"defragBeforeRename", triggerDefrag{}, AnyMember}
BeforeCommitPanic Failpoint = goPanicFailpoint{"beforeCommit", nil, AnyMember}
AfterCommitPanic Failpoint = goPanicFailpoint{"afterCommit", nil, AnyMember}
RaftBeforeSavePanic Failpoint = goPanicFailpoint{"raftBeforeSave", nil, AnyMember}
RaftAfterSavePanic Failpoint = goPanicFailpoint{"raftAfterSave", nil, AnyMember}
BackendBeforePreCommitHookPanic Failpoint = goPanicFailpoint{"commitBeforePreCommitHook", nil, AnyMember}
BackendAfterPreCommitHookPanic Failpoint = goPanicFailpoint{"commitAfterPreCommitHook", nil, AnyMember}
BackendBeforeStartDBTxnPanic Failpoint = goPanicFailpoint{"beforeStartDBTxn", nil, AnyMember}
BackendAfterStartDBTxnPanic Failpoint = goPanicFailpoint{"afterStartDBTxn", nil, AnyMember}
BackendBeforeWritebackBufPanic Failpoint = goPanicFailpoint{"beforeWritebackBuf", nil, AnyMember}
BackendAfterWritebackBufPanic Failpoint = goPanicFailpoint{"afterWritebackBuf", nil, AnyMember}
CompactBeforeCommitScheduledCompactPanic Failpoint = goPanicFailpoint{"compactBeforeCommitScheduledCompact", triggerCompact{}, AnyMember}
CompactAfterCommitScheduledCompactPanic Failpoint = goPanicFailpoint{"compactAfterCommitScheduledCompact", triggerCompact{}, AnyMember}
CompactBeforeSetFinishedCompactPanic Failpoint = goPanicFailpoint{"compactBeforeSetFinishedCompact", triggerCompact{}, AnyMember}
CompactAfterSetFinishedCompactPanic Failpoint = goPanicFailpoint{"compactAfterSetFinishedCompact", triggerCompact{}, AnyMember}
CompactBeforeCommitBatchPanic Failpoint = goPanicFailpoint{"compactBeforeCommitBatch", triggerCompact{multiBatchCompaction: true}, AnyMember}
CompactAfterCommitBatchPanic Failpoint = goPanicFailpoint{"compactAfterCommitBatch", triggerCompact{multiBatchCompaction: true}, AnyMember}
RaftBeforeLeaderSendPanic Failpoint = goPanicFailpoint{"raftBeforeLeaderSend", nil, Leader}
RaftBeforeFollowerSendPanic Failpoint = goPanicFailpoint{"raftBeforeFollowerSend", nil, Follower}
RaftBeforeApplySnapPanic Failpoint = goPanicFailpoint{"raftBeforeApplySnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
RaftAfterApplySnapPanic Failpoint = goPanicFailpoint{"raftAfterApplySnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
RaftAfterWALReleasePanic Failpoint = goPanicFailpoint{"raftAfterWALRelease", triggerBlackhole{waitTillSnapshot: true}, Follower}
RaftBeforeSaveSnapPanic Failpoint = goPanicFailpoint{"raftBeforeSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
RaftAfterSaveSnapPanic Failpoint = goPanicFailpoint{"raftAfterSaveSnap", triggerBlackhole{waitTillSnapshot: true}, Follower}
ApplyBeforeOpenSnapshot Failpoint = goPanicFailpoint{"applyBeforeOpenSnapshot", triggerBlackhole{waitTillSnapshot: true}, Follower}
BeforeApplyOneConfChangeSleep Failpoint = killAndGofailSleep{"beforeApplyOneConfChange", time.Second}
RaftBeforeSaveSleep Failpoint = gofailSleepAndDeactivate{"raftBeforeSave", time.Second}
RaftAfterSaveSleep Failpoint = gofailSleepAndDeactivate{"raftAfterSave", time.Second}
)
type goPanicFailpoint struct {
failpoint string
trigger trigger
target failpointTarget
}
type failpointTarget string
const (
AnyMember failpointTarget = "AnyMember"
Leader failpointTarget = "Leader"
Follower failpointTarget = "Follower"
)
func (f goPanicFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
member := f.pickMember(t, clus)
for member.IsRunning() {
select {
case <-ctx.Done():
return ctx.Err()
default:
}
lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name()))
err := member.Failpoints().SetupHTTP(ctx, f.failpoint, "panic")
if err != nil {
lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err))
continue
}
if !member.IsRunning() {
// TODO: Check member logs that etcd not running is caused panic caused by proper gofailpoint.
break
}
if f.trigger != nil {
lg.Info("Triggering gofailpoint", zap.String("failpoint", f.Name()))
err = f.trigger.Trigger(ctx, t, member, clus)
if err != nil {
lg.Info("gofailpoint trigger failed", zap.String("failpoint", f.Name()), zap.Error(err))
}
}
lg.Info("Waiting for member to exit", zap.String("member", member.Config().Name))
err = member.Wait(ctx)
if err != nil && !strings.Contains(err.Error(), "unexpected exit code") {
lg.Info("Member didn't exit as expected", zap.String("member", member.Config().Name), zap.Error(err))
return fmt.Errorf("member didn't exit as expected: %v", err)
}
lg.Info("Member exited as expected", zap.String("member", member.Config().Name))
}
if lazyfs := member.LazyFS(); lazyfs != nil {
lg.Info("Removing data that was not fsynced")
err := lazyfs.ClearCache(ctx)
if err != nil {
return err
}
}
return member.Start(ctx)
}
func (f goPanicFailpoint) pickMember(t *testing.T, clus *e2e.EtcdProcessCluster) e2e.EtcdProcess {
switch f.target {
case AnyMember:
return clus.Procs[rand.Int()%len(clus.Procs)]
case Leader:
return clus.Procs[clus.WaitLeader(t)]
case Follower:
return clus.Procs[(clus.WaitLeader(t)+1)%len(clus.Procs)]
default:
panic("unknown target")
}
}
func (f goPanicFailpoint) Available(config e2e.EtcdProcessClusterConfig, member e2e.EtcdProcess) bool {
if f.target == Follower && config.ClusterSize == 1 {
return false
}
if f.trigger != nil && !f.trigger.Available(config, member) {
return false
}
memberFailpoints := member.Failpoints()
if memberFailpoints == nil {
return false
}
return memberFailpoints.Available(f.failpoint)
}
func (f goPanicFailpoint) Name() string {
return fmt.Sprintf("%s=panic()", f.failpoint)
}
type killAndGofailSleep struct {
failpoint string
time time.Duration
}
func (f killAndGofailSleep) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
member := clus.Procs[rand.Int()%len(clus.Procs)]
for member.IsRunning() {
err := member.Kill()
if err != nil {
lg.Info("Sending kill signal failed", zap.Error(err))
}
err = member.Wait(ctx)
if err != nil && !strings.Contains(err.Error(), "unexpected exit code") {
lg.Info("Failed to kill the process", zap.Error(err))
return fmt.Errorf("failed to kill the process within %s, err: %w", triggerTimeout, err)
}
}
lg.Info("Setting up goFailpoint", zap.String("failpoint", f.Name()))
err := member.Failpoints().SetupEnv(f.failpoint, fmt.Sprintf(`sleep(%q)`, f.time))
if err != nil {
return err
}
err = member.Start(ctx)
if err != nil {
return err
}
// TODO: Check gofail status (https://github.com/etcd-io/gofail/pull/47) and wait for sleep to beis executed at least once.
return nil
}
func (f killAndGofailSleep) Name() string {
return fmt.Sprintf("%s=sleep(%s)", f.failpoint, f.time)
}
func (f killAndGofailSleep) Available(config e2e.EtcdProcessClusterConfig, member e2e.EtcdProcess) bool {
if config.ClusterSize == 1 {
return false
}
memberFailpoints := member.Failpoints()
if memberFailpoints == nil {
return false
}
return memberFailpoints.Available(f.failpoint)
}
type gofailSleepAndDeactivate struct {
failpoint string
time time.Duration
}
func (f gofailSleepAndDeactivate) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
member := clus.Procs[rand.Int()%len(clus.Procs)]
lg.Info("Setting up gofailpoint", zap.String("failpoint", f.Name()))
err := member.Failpoints().SetupHTTP(ctx, f.failpoint, fmt.Sprintf(`sleep(%q)`, f.time))
if err != nil {
lg.Info("goFailpoint setup failed", zap.String("failpoint", f.Name()), zap.Error(err))
return fmt.Errorf("goFailpoint %s setup failed, err:%w", f.Name(), err)
}
time.Sleep(f.time)
lg.Info("Deactivating gofailpoint", zap.String("failpoint", f.Name()))
err = member.Failpoints().DeactivateHTTP(ctx, f.failpoint)
if err != nil {
lg.Info("goFailpoint deactivate failed", zap.String("failpoint", f.Name()), zap.Error(err))
return fmt.Errorf("goFailpoint %s deactivate failed, err: %w", f.Name(), err)
}
return nil
}
func (f gofailSleepAndDeactivate) Name() string {
return fmt.Sprintf("%s=sleep(%s)", f.failpoint, f.time)
}
func (f gofailSleepAndDeactivate) Available(config e2e.EtcdProcessClusterConfig, member e2e.EtcdProcess) bool {
if config.ClusterSize == 1 {
return false
}
memberFailpoints := member.Failpoints()
if memberFailpoints == nil {
return false
}
return memberFailpoints.Available(f.failpoint)
}