// Copyright 2022 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package failpoint

import (
	"context"
	"fmt"
	"math/rand"
	"testing"
	"time"

	"go.uber.org/zap"
	healthpb "google.golang.org/grpc/health/grpc_health_v1"

	clientv3 "go.etcd.io/etcd/client/v3"
	"go.etcd.io/etcd/tests/v3/framework/e2e"
)

const (
	// triggerTimeout bounds a whole Inject call, including health checks.
	triggerTimeout = time.Minute
	// waitBetweenFailpointTriggers is the pause before each injection attempt.
	waitBetweenFailpointTriggers = time.Second
	// failpointInjectionsCount is how many successful injections Inject requires.
	failpointInjectionsCount = 1
	// failpointInjectionsRetries caps failed injection attempts before giving up.
	failpointInjectionsRetries = 3
)

var (
	// allFailpoints is the pool that PickRandom draws from; it is filtered
	// down to the failpoints available on every member of the cluster.
	allFailpoints = []Failpoint{
		KillFailpoint, BeforeCommitPanic, AfterCommitPanic, RaftBeforeSavePanic, RaftAfterSavePanic,
		DefragBeforeCopyPanic, DefragBeforeRenamePanic, BackendBeforePreCommitHookPanic, BackendAfterPreCommitHookPanic,
		BackendBeforeStartDBTxnPanic, BackendAfterStartDBTxnPanic, BackendBeforeWritebackBufPanic,
		BackendAfterWritebackBufPanic, CompactBeforeCommitScheduledCompactPanic, CompactAfterCommitScheduledCompactPanic,
		CompactBeforeSetFinishedCompactPanic, CompactAfterSetFinishedCompactPanic, CompactBeforeCommitBatchPanic,
		CompactAfterCommitBatchPanic, RaftBeforeLeaderSendPanic, BlackholePeerNetwork, DelayPeerNetwork,
		RaftBeforeFollowerSendPanic, RaftBeforeApplySnapPanic, RaftAfterApplySnapPanic, RaftAfterWALReleasePanic,
		RaftBeforeSaveSnapPanic, RaftAfterSaveSnapPanic, BlackholeUntilSnapshot,
		BeforeApplyOneConfChangeSleep,
		MemberReplace,
		DropPeerNetwork,
		RaftBeforeSaveSleep,
		RaftAfterSaveSleep,
	}
)

// PickRandom returns a random failpoint that is available on every member of
// the cluster. It reports a test error and returns nil if no failpoint
// qualifies.
func PickRandom(t *testing.T, clus *e2e.EtcdProcessCluster) Failpoint {
	availableFailpoints := make([]Failpoint, 0, len(allFailpoints))
	for _, failpoint := range allFailpoints {
		err := Validate(clus, failpoint)
		if err != nil {
			continue
		}
		availableFailpoints = append(availableFailpoints, failpoint)
	}
	if len(availableFailpoints) == 0 {
		t.Errorf("No available failpoints")
		return nil
	}
	return availableFailpoints[rand.Int()%len(availableFailpoints)]
}

// Validate checks that the given failpoint is available on every process in
// the cluster.
func Validate(clus *e2e.EtcdProcessCluster, failpoint Failpoint) error {
	for _, proc := range clus.Procs {
		if !failpoint.Available(*clus.Cfg, proc) {
			return fmt.Errorf("failpoint %q not available on %s", failpoint.Name(), proc.Config().Name)
		}
	}
	return nil
}

// Inject triggers the failpoint on the cluster, verifying cluster health
// before and after each attempt. Failed attempts are retried up to
// failpointInjectionsRetries times, all within triggerTimeout.
func Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, failpoint Failpoint) {
	ctx, cancel := context.WithTimeout(ctx, triggerTimeout)
	defer cancel()
	var err error
	successes := 0
	failures := 0
	for successes < failpointInjectionsCount && failures < failpointInjectionsRetries {
		time.Sleep(waitBetweenFailpointTriggers)

		lg.Info("Verifying cluster health before failpoint", zap.String("failpoint", failpoint.Name()))
		if err = verifyClusterHealth(ctx, t, clus); err != nil {
			t.Errorf("failed to verify cluster health before failpoint injection, err: %v", err)
			return
		}

		lg.Info("Triggering failpoint", zap.String("failpoint", failpoint.Name()))
		err = failpoint.Inject(ctx, t, lg, clus)
		if err != nil {
			select {
			case <-ctx.Done():
				t.Errorf("Triggering failpoints timed out, err: %v", ctx.Err())
				return
			default:
			}
			lg.Info("Failed to trigger failpoint", zap.String("failpoint", failpoint.Name()), zap.Error(err))
			failures++
			continue
		}

		lg.Info("Verifying cluster health after failpoint", zap.String("failpoint", failpoint.Name()))
		if err = verifyClusterHealth(ctx, t, clus); err != nil {
			t.Errorf("failed to verify cluster health after failpoint injection, err: %v", err)
			return
		}

		successes++
	}
	if successes < failpointInjectionsCount || failures >= failpointInjectionsRetries {
		t.Errorf("failed to trigger failpoints enough times, err: %v", err)
	}
}

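// Usage sketch (illustrative, not part of this package): a robustness test
// with an already-running e2e cluster might drive these helpers like so:
//
//	failpoint := PickRandom(t, clus)
//	if failpoint == nil {
//		return // PickRandom already reported the failure via t.Errorf
//	}
//	Inject(ctx, t, lg, clus, failpoint)
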
// verifyClusterHealth dials every member and requires the gRPC health
// service to report SERVING.
func verifyClusterHealth(ctx context.Context, _ *testing.T, clus *e2e.EtcdProcessCluster) error {
	for i := 0; i < len(clus.Procs); i++ {
		clusterClient, err := clientv3.New(clientv3.Config{
			Endpoints:            clus.Procs[i].EndpointsGRPC(),
			Logger:               zap.NewNop(),
			DialKeepAliveTime:    10 * time.Second,
			DialKeepAliveTimeout: 100 * time.Millisecond,
		})
		if err != nil {
			return fmt.Errorf("error creating client for member %s: %v", clus.Procs[i].Config().Name, err)
		}
		// Clients are closed when the function returns, after all members are checked.
		defer clusterClient.Close()

		cli := healthpb.NewHealthClient(clusterClient.ActiveConnection())
		resp, err := cli.Check(ctx, &healthpb.HealthCheckRequest{})
		if err != nil {
			return fmt.Errorf("error checking member %s health: %v", clus.Procs[i].Config().Name, err)
		}
		if resp.Status != healthpb.HealthCheckResponse_SERVING {
			return fmt.Errorf("member %s health status expected %s, got %s",
				clus.Procs[i].Config().Name,
				healthpb.HealthCheckResponse_SERVING,
				resp.Status)
		}
	}
	return nil
}

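// An alternative health probe (a sketch under the assumption that the
// clientv3 maintenance API is acceptable here; this file deliberately uses
// the gRPC health service instead) would be the Status call:
//
//	resp, err := clusterClient.Status(ctx, clus.Procs[i].EndpointsGRPC()[0])
//	if err != nil {
//		return fmt.Errorf("error fetching member %s status: %v", clus.Procs[i].Config().Name, err)
//	}
//	if len(resp.Errors) > 0 {
//		return fmt.Errorf("member %s reports errors: %v", clus.Procs[i].Config().Name, resp.Errors)
//	}
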
// Failpoint injects a single failure into the cluster and reports whether it
// can run against a given member.
type Failpoint interface {
	Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error
	Name() string
	AvailabilityChecker
}

// AvailabilityChecker reports whether a failpoint is supported for the given
// cluster configuration and member process.
type AvailabilityChecker interface {
	Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess) bool
}

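// A new failpoint only needs to satisfy the Failpoint interface above. A
// minimal sketch (the sleepFailpoint type and its behavior are hypothetical,
// shown for illustration only):
//
//	type sleepFailpoint struct{}
//
//	func (f sleepFailpoint) Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) error {
//		// Hypothetical failure mode: stall for a second instead of touching any member.
//		time.Sleep(time.Second)
//		return nil
//	}
//
//	func (f sleepFailpoint) Name() string { return "sleepFailpoint" }
//
//	func (f sleepFailpoint) Available(e2e.EtcdProcessClusterConfig, e2e.EtcdProcess) bool {
//		return true
//	}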