mirror of https://github.com/etcd-io/etcd.git
Merge pull request #8669 from gyuho/balancer
clientv3/balancer: handle network partition in health check
This commit is contained in:
commit 8329963d69
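In outline, the change reports request errors from the retry path to the balancer, records when each endpoint last failed, and refuses to re-pin an endpoint until a health-check timeout has passed since that failure. The following is a minimal, self-contained Go sketch of that gating idea only; the names (failureGate, markFailed, mayPin) and the standalone structure are illustrative and are not the clientv3 API. In the actual patch, the recording side is healthBalancer.endpointError and the gate is the new time check in healthBalancer.mayPin, both shown in the diff below.

package main

import (
	"fmt"
	"sync"
	"time"
)

// failureGate records the last failure time per endpoint and refuses to
// let an endpoint be pinned again until `timeout` has passed since its
// most recent failure.
type failureGate struct {
	mu      sync.Mutex
	failed  map[string]time.Time
	timeout time.Duration
}

func newFailureGate(timeout time.Duration) *failureGate {
	return &failureGate{failed: make(map[string]time.Time), timeout: timeout}
}

// markFailed is what the retry path would call when a request on the
// currently pinned endpoint returns an error.
func (g *failureGate) markFailed(ep string) {
	g.mu.Lock()
	g.failed[ep] = time.Now()
	g.mu.Unlock()
}

// mayPin reports whether an endpoint may be pinned: endpoints that failed
// within the timeout window are skipped so the balancer tries others first.
func (g *failureGate) mayPin(ep string) bool {
	g.mu.Lock()
	defer g.mu.Unlock()
	t, bad := g.failed[ep]
	return !bad || time.Since(t) >= g.timeout
}

func main() {
	g := newFailureGate(3 * time.Second)
	g.markFailed("127.0.0.1:2379")
	fmt.Println(g.mayPin("127.0.0.1:2379"))  // false: failed too recently
	fmt.Println(g.mayPin("127.0.0.1:22379")) // true: no recorded failure
}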
.words (4 lines changed)
@@ -1,5 +1,8 @@
+ErrCodeEnhanceYourCalm
+GoAway
 RPC
 RPCs
 backoff
+blackholed
 cancelable
 cancelation
@@ -10,6 +13,7 @@ etcd
 gRPC
 goroutine
 goroutines
+healthcheck
 iff
 inflight
 keepalive
@@ -44,6 +44,8 @@ type balancer interface {
 	endpoints() []string
 	// pinned returns the current pinned endpoint.
 	pinned() string
+	// endpointError handles error from server-side.
+	endpointError(addr string, err error)
 
 	// up is Up but includes whether the balancer will use the connection.
 	up(addr grpc.Address) (func(error), bool)
@@ -150,6 +152,8 @@ func (b *simpleBalancer) pinned() string {
 	return b.pinAddr
 }
 
+func (b *simpleBalancer) endpointError(addr string, err error) { return }
+
 func getHost2ep(eps []string) map[string]string {
 	hm := make(map[string]string, len(eps))
 	for i := range eps {
@@ -36,7 +36,8 @@ type healthBalancer struct {
 	balancer
 
 	// healthCheck checks an endpoint's health.
-	healthCheck healthCheckFunc
+	healthCheck        healthCheckFunc
+	healthCheckTimeout time.Duration
 
 	// mu protects addrs, eps, unhealthy map, and stopc.
 	mu sync.RWMutex
@@ -71,6 +72,7 @@ func newHealthBalancer(b balancer, timeout time.Duration, hc healthCheckFunc) *h
 	if timeout < minHealthRetryDuration {
 		timeout = minHealthRetryDuration
 	}
+	hb.healthCheckTimeout = timeout
 
 	hb.wg.Add(1)
 	go func() {
@@ -95,6 +97,9 @@ func (hb *healthBalancer) Up(addr grpc.Address) func(error) {
 		hb.unhealthy[addr.Addr] = time.Now()
 		hb.mu.Unlock()
 		f(err)
+		if logger.V(4) {
+			logger.Infof("clientv3/health-balancer: %s becomes unhealthy (%v)", addr.Addr, err)
+		}
 	}
 }
 
@@ -140,7 +145,7 @@ func (hb *healthBalancer) updateUnhealthy(timeout time.Duration) {
 			if time.Since(v) > timeout {
 				delete(hb.unhealthy, k)
 				if logger.V(4) {
-					logger.Infof("clientv3/balancer: removes %s from unhealthy after %v", k, timeout)
+					logger.Infof("clientv3/health-balancer: removes %s from unhealthy after %v", k, timeout)
 				}
 			}
 		}
@@ -172,20 +177,41 @@ func (hb *healthBalancer) liveAddrs() []grpc.Address {
 	return addrs
 }
 
+func (hb *healthBalancer) endpointError(addr string, err error) {
+	hb.mu.Lock()
+	hb.unhealthy[addr] = time.Now()
+	hb.mu.Unlock()
+	if logger.V(4) {
+		logger.Infof("clientv3/health-balancer: marking %s as unhealthy (%v)", addr, err)
+	}
+}
+
 func (hb *healthBalancer) mayPin(addr grpc.Address) bool {
 	hb.mu.RLock()
-	skip := len(hb.addrs) == 1 || len(hb.unhealthy) == 0
-	_, bad := hb.unhealthy[addr.Addr]
+	skip := len(hb.addrs) == 1 || len(hb.unhealthy) == 0 || len(hb.addrs) == len(hb.unhealthy)
+	failedTime, bad := hb.unhealthy[addr.Addr]
+	dur := hb.healthCheckTimeout
 	hb.mu.RUnlock()
 	if skip || !bad {
 		return true
 	}
+	// prevent isolated member's endpoint from being infinitely retried, as follows:
+	// 1. keepalive pings detects GoAway with http2.ErrCodeEnhanceYourCalm
+	// 2. balancer 'Up' unpins with grpc: failed with network I/O error
+	// 3. grpc-healthcheck still SERVING, thus retry to pin
+	// instead, return before grpc-healthcheck if failed within healthcheck timeout
+	if elapsed := time.Since(failedTime); elapsed < dur {
+		if logger.V(4) {
+			logger.Infof("clientv3/health-balancer: %s is up but not pinned (failed %v ago, require minimum %v after failure)", addr.Addr, elapsed, dur)
+		}
+		return false
+	}
 	if ok, _ := hb.healthCheck(addr.Addr); ok {
 		hb.mu.Lock()
 		delete(hb.unhealthy, addr.Addr)
 		hb.mu.Unlock()
 		if logger.V(4) {
-			logger.Infof("clientv3/balancer: %s is healthy", addr.Addr)
+			logger.Infof("clientv3/health-balancer: %s is healthy (health check success)", addr.Addr)
 		}
 		return true
 	}
@@ -193,7 +219,7 @@ func (hb *healthBalancer) mayPin(addr grpc.Address) bool {
 	hb.unhealthy[addr.Addr] = time.Now()
 	hb.mu.Unlock()
 	if logger.V(4) {
-		logger.Infof("clientv3/balancer: %s becomes unhealthy", addr.Addr)
+		logger.Infof("clientv3/health-balancer: %s becomes unhealthy (health check failed)", addr.Addr)
 	}
 	return false
 }
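The comment in mayPin above explains why the gRPC health check alone is not enough: an isolated member can still answer SERVING, so the balancer also refuses endpoints that failed within healthCheckTimeout. For reference, a server health probe of the kind hb.healthCheck performs looks roughly like the sketch below. It uses the standard grpc_health_v1 service against an already-dialed connection with a placeholder endpoint; it is not the exact healthCheckFunc wired up by clientv3.

package main

import (
	"context"
	"fmt"
	"time"

	"google.golang.org/grpc"
	healthpb "google.golang.org/grpc/health/grpc_health_v1"
)

// grpcHealthCheck asks the endpoint's gRPC health service whether it reports
// SERVING. As the diff comment notes, a member cut off from its peers can
// still answer SERVING, which is why mayPin adds the failure-time gate.
func grpcHealthCheck(conn *grpc.ClientConn) (bool, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	resp, err := healthpb.NewHealthClient(conn).Check(ctx, &healthpb.HealthCheckRequest{})
	if err != nil {
		return false, err
	}
	return resp.Status == healthpb.HealthCheckResponse_SERVING, nil
}

func main() {
	// The address below is only a placeholder for a locally running server.
	conn, err := grpc.Dial("127.0.0.1:2379", grpc.WithInsecure())
	if err != nil {
		fmt.Println("dial:", err)
		return
	}
	defer conn.Close()
	fmt.Println(grpcHealthCheck(conn))
}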
clientv3/integration/network_partition_test.go (97 lines, new file)
@@ -0,0 +1,97 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !cluster_proxy

package integration

import (
	"context"
	"testing"
	"time"

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/integration"
	"github.com/coreos/etcd/pkg/testutil"
)

// TestNetworkPartitionBalancerPut tests when one member becomes isolated,
// first Put request fails, and following retry succeeds with client balancer
// switching to others.
func TestNetworkPartitionBalancerPut(t *testing.T) {
	testNetworkPartitionBalancer(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Put(ctx, "a", "b")
		return err
	})
}

// TestNetworkPartitionBalancerGet tests when one member becomes isolated,
// first Get request fails, and following retry succeeds with client balancer
// switching to others.
func TestNetworkPartitionBalancerGet(t *testing.T) {
	testNetworkPartitionBalancer(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a")
		return err
	})
}

func testNetworkPartitionBalancer(t *testing.T, op func(*clientv3.Client, context.Context) error) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:                 3,
		GRPCKeepAliveMinTime: time.Millisecond, // avoid too_many_pings
	})
	defer clus.Terminate(t)

	// expect pin ep[0]
	ccfg := clientv3.Config{
		Endpoints:            []string{clus.Members[0].GRPCAddr()},
		DialTimeout:          3 * time.Second,
		DialKeepAliveTime:    2 * time.Second,
		DialKeepAliveTimeout: 2 * time.Second,
	}
	cli, err := clientv3.New(ccfg)
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// add other endpoints for later endpoint switch
	cli.SetEndpoints(clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr())

	time.Sleep(3 * time.Second)
	clus.Members[0].InjectPartition(t, clus.Members[1:])
	defer clus.Members[0].RecoverPartition(t, clus.Members[1:])

	for i := 0; i < 2; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		err = op(cli, ctx)
		cancel()
		if err == nil {
			break
		}
		if err != context.DeadlineExceeded {
			t.Fatalf("#%d: expected %v, got %v", i, context.DeadlineExceeded, err)
		}
		// give enough time for endpoint switch
		// TODO: remove random sleep by syncing directly with balancer
		if i == 0 {
			time.Sleep(5 * time.Second)
		}
	}
	if err != nil {
		t.Fatalf("balancer did not switch in time (%v)", err)
	}
}
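The test above relies on client keepalives to notice the blackholed member: DialKeepAliveTime and DialKeepAliveTimeout in clientv3.Config configure gRPC keepalive pings, and unanswered pings surface as the network I/O error that the balancer's Up callback records (step 1 and 2 in the mayPin comment). A rough raw-gRPC equivalent of those two options, with a placeholder endpoint and not necessarily the exact options clientv3.New sets, is sketched below.

package main

import (
	"log"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/keepalive"
)

func main() {
	// Roughly what DialKeepAliveTime/DialKeepAliveTimeout translate to at the
	// gRPC layer: ping the server every 2s and declare the connection dead if
	// no ack arrives within 2s, so a blackholed endpoint fails quickly instead
	// of hanging.
	conn, err := grpc.Dial("127.0.0.1:2379",
		grpc.WithInsecure(),
		grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time:    2 * time.Second,
			Timeout: 2 * time.Second,
		}),
	)
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()
}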
@@ -66,6 +66,8 @@ func (c *Client) newRetryWrapper(isStop retryStopErrFunc) retryRpcFunc {
 			if logger.V(4) {
 				logger.Infof("clientv3/retry: error %v on pinned endpoint %s", err, pinned)
 			}
+			// mark this before endpoint switch is triggered
+			c.balancer.endpointError(pinned, err)
 			notify := c.balancer.ConnectNotify()
 			if s, ok := status.FromError(err); ok && s.Code() == codes.Unavailable {
 				c.balancer.next()
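The ordering in the hunk above is deliberate: the failure is reported via endpointError before the endpoint switch is triggered, so by the time the balancer picks the next address the failed endpoint is already marked unhealthy. A schematic retry loop built around that ordering is sketched below; the switchingBalancer interface and function names are made up for illustration and do not match the unexported clientv3 internals, which, for example, only call next() for Unavailable errors.

package sketch

import "context"

// switchingBalancer is a made-up, minimal interface for illustration; the
// real clientv3 balancer has a different (unexported) shape.
type switchingBalancer interface {
	pinned() string                       // currently pinned endpoint
	endpointError(addr string, err error) // record a failure on that endpoint
	next()                                // request an endpoint switch
	connectNotify() <-chan struct{}       // receives when a connection is up
}

// retryOnce runs op once, and on failure reports the error on the pinned
// endpoint BEFORE triggering the switch, so the health balancer already
// considers that endpoint unhealthy when it decides what to pin next. It then
// waits for a new connection and retries the operation a single time.
func retryOnce(ctx context.Context, b switchingBalancer, op func(context.Context) error) error {
	err := op(ctx)
	if err == nil {
		return nil
	}
	b.endpointError(b.pinned(), err) // mark before endpoint switch is triggered
	b.next()
	select {
	case <-b.connectNotify():
	case <-ctx.Done():
		return ctx.Err()
	}
	return op(ctx)
}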