v3rpc: run health notifier to listen on online defrag state change

Signed-off-by: Chao Chen <chaochn@amazon.com>
2024-09-27 06:25:44 +00:00 · 2023-10-26 13:11:45 -07:00 · 2023-10-26 13:11:45 -07:00 · 8a6c1335e2
commit 8a6c1335e2
parent 5fad87c2ab
4 changed files with 48 additions and 37 deletions
--- a/server/etcdserver/api/v3rpc/grpc.go
+++ b/server/etcdserver/api/v3rpc/grpc.go
@@ -77,8 +77,9 @@ func Server(s *etcdserver.EtcdServer, tls *tls.Config, interceptor grpc.UnarySer
 	pb.RegisterAuthServer(grpcServer, NewAuthServer(s))

 	hsrv := health.NewServer()
-	pb.RegisterMaintenanceServer(grpcServer, NewMaintenanceServer(s, NewHealthNotifier(hsrv, s.Logger())))
+	healthNotifier := newHealthNotifier(hsrv, s)
 	healthpb.RegisterHealthServer(grpcServer, hsrv)
+	pb.RegisterMaintenanceServer(grpcServer, NewMaintenanceServer(s, healthNotifier))

 	// set zero values for metrics registered for this grpc server
 	grpc_prometheus.Register(grpcServer)
--- a/server/etcdserver/api/v3rpc/health.go
+++ b/server/etcdserver/api/v3rpc/health.go
@@ -18,37 +18,47 @@ import (
 	"go.uber.org/zap"
 	"google.golang.org/grpc/health"
 	healthpb "google.golang.org/grpc/health/grpc_health_v1"
+
+	"go.etcd.io/etcd/server/v3/etcdserver"
 )

 const (
 	allGRPCServices = ""
 )

-type HealthNotifier interface {
-	StartServe()
-	StopServe(reason string)
+type notifier interface {
+	defragStarted()
+	defragFinished()
 }

-func NewHealthNotifier(hs *health.Server, lg *zap.Logger) HealthNotifier {
+func newHealthNotifier(hs *health.Server, s *etcdserver.EtcdServer) notifier {
 	if hs == nil {
 		panic("unexpected nil gRPC health server")
 	}
-	if lg == nil {
-		lg = zap.NewNop()
-	}
-	hc := &healthChecker{hs: hs, lg: lg}
+	hc := &healthNotifier{hs: hs, lg: s.Logger(), stopGRPCServiceOnDefrag: s.Cfg.ExperimentalStopGRPCServiceOnDefrag}
 	// set grpc health server as serving status blindly since
 	// the grpc server will serve iff s.ReadyNotify() is closed.
-	hc.StartServe()
+	hc.startServe()
 	return hc
 }

-type healthChecker struct {
+type healthNotifier struct {
 	hs *health.Server
 	lg *zap.Logger
+
+	stopGRPCServiceOnDefrag bool
 }

-func (hc *healthChecker) StartServe() {
+func (hc *healthNotifier) defragStarted() {
+	if !hc.stopGRPCServiceOnDefrag {
+		return
+	}
+	hc.stopServe("defrag is active")
+}
+
+func (hc *healthNotifier) defragFinished() { hc.startServe() }
+
+func (hc *healthNotifier) startServe() {
 	hc.lg.Info(
 		"grpc service status changed",
 		zap.String("service", allGRPCServices),
@@ -57,7 +67,7 @@ func (hc *healthChecker) StartServe() {
 	hc.hs.SetServingStatus(allGRPCServices, healthpb.HealthCheckResponse_SERVING)
 }

-func (hc *healthChecker) StopServe(reason string) {
+func (hc *healthNotifier) stopServe(reason string) {
 	hc.lg.Warn(
 		"grpc service status changed",
 		zap.String("service", allGRPCServices),
--- a/server/etcdserver/api/v3rpc/maintenance.go
+++ b/server/etcdserver/api/v3rpc/maintenance.go
@@ -74,13 +74,12 @@ type maintenanceServer struct {
 	cs     ClusterStatusGetter
 	d      Downgrader
 	vs     serverversion.Server
-	hn     HealthNotifier

-	stopServingOnDefrag bool
+	healthNotifier notifier
 }

-func NewMaintenanceServer(s *etcdserver.EtcdServer, hn HealthNotifier) pb.MaintenanceServer {
-	srv := &maintenanceServer{lg: s.Cfg.Logger, rg: s, hasher: s.KV().HashStorage(), bg: s, a: s, lt: s, hdr: newHeader(s), cs: s, d: s, vs: etcdserver.NewServerVersionAdapter(s), hn: hn, stopServingOnDefrag: s.Cfg.ExperimentalStopGRPCServiceOnDefrag}
+func NewMaintenanceServer(s *etcdserver.EtcdServer, healthNotifier notifier) pb.MaintenanceServer {
+	srv := &maintenanceServer{lg: s.Cfg.Logger, rg: s, hasher: s.KV().HashStorage(), bg: s, a: s, lt: s, hdr: newHeader(s), cs: s, d: s, vs: etcdserver.NewServerVersionAdapter(s), healthNotifier: healthNotifier}
 	if srv.lg == nil {
 		srv.lg = zap.NewNop()
 	}
@@ -89,10 +88,8 @@ func NewMaintenanceServer(s *etcdserver.EtcdServer, hn HealthNotifier) pb.Mainte

 func (ms *maintenanceServer) Defragment(ctx context.Context, sr *pb.DefragmentRequest) (*pb.DefragmentResponse, error) {
 	ms.lg.Info("starting defragment")
-	if ms.stopServingOnDefrag {
-		ms.hn.StopServe("defrag is active")
-		defer ms.hn.StartServe()
-	}
+	ms.healthNotifier.defragStarted()
+	defer ms.healthNotifier.defragFinished()
 	err := ms.bg.Backend().Defrag()
 	if err != nil {
 		ms.lg.Warn("failed to defragment", zap.Error(err))
--- a/tests/e2e/failover_test.go
+++ b/tests/e2e/failover_test.go
@@ -49,11 +49,11 @@ func TestFailoverOnDefrag(t *testing.T) {
 		gRPCDialOptions []grpc.DialOption

 		// common assertion
-		expectedMinTotalRequestsCount int
+		expectedMinQPS float64
 		// happy case assertion
-		expectedMaxFailedRequestsCount int
+		expectedMaxFailureRate float64
 		// negative case assertion
-		expectedMinFailedRequestsCount int
+		expectedMinFailureRate float64
 	}{
 		{
 			name: "defrag failover happy case",
@@ -66,8 +66,8 @@ func TestFailoverOnDefrag(t *testing.T) {
 				grpc.WithDisableServiceConfig(),
 				grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
 			},
-			expectedMinTotalRequestsCount:  300,
-			expectedMaxFailedRequestsCount: 5,
+			expectedMinQPS:         20,
+			expectedMaxFailureRate: 0.01,
 		},
 		{
 			name: "defrag blocks one-third of requests with stopGRPCServiceOnDefrag set to false",
@@ -80,8 +80,8 @@ func TestFailoverOnDefrag(t *testing.T) {
 				grpc.WithDisableServiceConfig(),
 				grpc.WithDefaultServiceConfig(`{"loadBalancingPolicy": "round_robin", "healthCheckConfig": {"serviceName": ""}}`),
 			},
-			expectedMinTotalRequestsCount:  300,
-			expectedMinFailedRequestsCount: 90,
+			expectedMinQPS:         20,
+			expectedMinFailureRate: 0.25,
 		},
 		{
 			name: "defrag blocks one-third of requests with stopGRPCServiceOnDefrag set to true and client health check disabled",
@@ -90,8 +90,8 @@ func TestFailoverOnDefrag(t *testing.T) {
 				e2e.WithExperimentalStopGRPCServiceOnDefrag(true),
 				e2e.WithGoFailEnabled(true),
 			},
-			expectedMinTotalRequestsCount:  300,
-			expectedMinFailedRequestsCount: 90,
+			expectedMinQPS:         20,
+			expectedMinFailureRate: 0.25,
 		},
 	}

@@ -105,6 +105,7 @@ func TestFailoverOnDefrag(t *testing.T) {
 			endpoints := clus.EndpointsGRPC()

 			requestVolume, successfulRequestCount := 0, 0
+			start := time.Now()
 			g := new(errgroup.Group)
 			g.Go(func() (lastErr error) {
 				clusterClient, cerr := clientv3.New(clientv3.Config{
@@ -143,15 +144,17 @@ func TestFailoverOnDefrag(t *testing.T) {
 			if err != nil {
 				t.Logf("etcd client failed to fail over, error (%v)", err)
 			}
-			t.Logf("request failure rate is %.2f%%, traffic volume successfulRequestCount %d requests, total %d requests", (1-float64(successfulRequestCount)/float64(requestVolume))*100, successfulRequestCount, requestVolume)

-			require.GreaterOrEqual(t, requestVolume, tc.expectedMinTotalRequestsCount)
-			failedRequestCount := requestVolume - successfulRequestCount
-			if tc.expectedMaxFailedRequestsCount != 0 {
-				require.LessOrEqual(t, failedRequestCount, tc.expectedMaxFailedRequestsCount)
+			qps := float64(requestVolume) / float64(time.Since(start)) * float64(time.Second)
+			failureRate := 1 - float64(successfulRequestCount)/float64(requestVolume)
+			t.Logf("request failure rate is %.2f%%, qps is %.2f requests/second", failureRate*100, qps)
+
+			require.GreaterOrEqual(t, qps, tc.expectedMinQPS)
+			if tc.expectedMaxFailureRate != 0.0 {
+				require.LessOrEqual(t, failureRate, tc.expectedMaxFailureRate)
 			}
-			if tc.expectedMinFailedRequestsCount != 0 {
-				require.GreaterOrEqual(t, failedRequestCount, tc.expectedMinFailedRequestsCount)
+			if tc.expectedMinFailureRate != 0.0 {
+				require.GreaterOrEqual(t, failureRate, tc.expectedMinFailureRate)
 			}
 		})
 	}