clientv3: only health-check when timeout elapses since last failure

Otherwise, a network-partitioned member with an active health-check server would not be gray-listed, leaving the health balancer stuck with the isolated endpoint. Also clarifies some log messages. Signed-off-by: Gyu-Ho Lee <gyuhox@gmail.com>
2024-09-27 06:25:44 +00:00 · 2017-10-07 06:16:40 -07:00 · 2017-10-07 06:16:40 -07:00 · 1704443c6d
commit 1704443c6d
parent e47be1f325
1 changed file with 22 additions and 5 deletions
--- a/clientv3/health_balancer.go
+++ b/clientv3/health_balancer.go
@ -36,7 +36,8 @@ type healthBalancer struct {
 	balancer

 	// healthCheck checks an endpoint's health.
-	healthCheck healthCheckFunc
+	healthCheck        healthCheckFunc
+	healthCheckTimeout time.Duration

 	// mu protects addrs, eps, unhealthy map, and stopc.
 	mu sync.RWMutex
@ -71,6 +72,7 @@ func newHealthBalancer(b balancer, timeout time.Duration, hc healthCheckFunc) *h
 	if timeout < minHealthRetryDuration {
 		timeout = minHealthRetryDuration
 	}
+	hb.healthCheckTimeout = timeout

 	hb.wg.Add(1)
 	go func() {
@ -95,6 +97,9 @@ func (hb *healthBalancer) Up(addr grpc.Address) func(error) {
 		hb.unhealthy[addr.Addr] = time.Now()
 		hb.mu.Unlock()
 		f(err)
+		if logger.V(4) {
+			logger.Infof("clientv3/health-balancer: %s becomes unhealthy (%v)", addr.Addr, err)
+		}
 	}
 }

@ -140,7 +145,7 @@ func (hb *healthBalancer) updateUnhealthy(timeout time.Duration) {
 				if time.Since(v) > timeout {
 					delete(hb.unhealthy, k)
 					if logger.V(4) {
-						logger.Infof("clientv3/balancer: removes %s from unhealthy after %v", k, timeout)
+						logger.Infof("clientv3/health-balancer: removes %s from unhealthy after %v", k, timeout)
 					}
 				}
 			}
@ -175,17 +180,29 @@ func (hb *healthBalancer) liveAddrs() []grpc.Address {
 func (hb *healthBalancer) mayPin(addr grpc.Address) bool {
 	hb.mu.RLock()
 	skip := len(hb.addrs) == 1 || len(hb.unhealthy) == 0
-	_, bad := hb.unhealthy[addr.Addr]
+	failedTime, bad := hb.unhealthy[addr.Addr]
+	dur := hb.healthCheckTimeout
 	hb.mu.RUnlock()
 	if skip || !bad {
 		return true
 	}
+	// prevent an isolated member's endpoint from being retried indefinitely, as follows:
+	//   1. keepalive ping detects GoAway with http2.ErrCodeEnhanceYourCalm
+	//   2. balancer 'Up' unpins with grpc: failed with network I/O error
+	//   3. grpc-healthcheck is still SERVING, thus the balancer retries to pin
+	// instead, return before grpc-healthcheck if the endpoint failed within the healthcheck timeout
+	if elapsed := time.Since(failedTime); elapsed < dur {
+		if logger.V(4) {
+			logger.Infof("clientv3/health-balancer: %s is up but not pinned (failed %v ago, require minimum %v after failure)", addr.Addr, elapsed, dur)
+		}
+		return false
+	}
 	if ok, _ := hb.healthCheck(addr.Addr); ok {
 		hb.mu.Lock()
 		delete(hb.unhealthy, addr.Addr)
 		hb.mu.Unlock()
 		if logger.V(4) {
-			logger.Infof("clientv3/balancer: %s is healthy", addr.Addr)
+			logger.Infof("clientv3/health-balancer: %s is healthy (health check success)", addr.Addr)
 		}
 		return true
 	}
@ -193,7 +210,7 @@ func (hb *healthBalancer) mayPin(addr grpc.Address) bool {
 	hb.unhealthy[addr.Addr] = time.Now()
 	hb.mu.Unlock()
 	if logger.V(4) {
-		logger.Infof("clientv3/balancer: %s becomes unhealthy", addr.Addr)
+		logger.Infof("clientv3/health-balancer: %s becomes unhealthy (health check failed)", addr.Addr)
 	}
 	return false
 }