Mirror of https://github.com/etcd-io/etcd.git (synced 2024-09-27 06:25:44 +00:00)
Merge pull request #5543 from heyitsanthony/clientv3-unblock-reconnect
clientv3: don't hold client lock while dialing
Commit: 88afb0b0a6
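
The change this PR makes is easiest to see in the retryConnection hunk below: instead of holding c.mu across the blocking dial, the client now releases the mutex before dialing and reacquires it only to publish the result. A minimal sketch of that shape, with illustrative names (client, dial) that are not part of the etcd code:

    package main

    import (
        "fmt"
        "sync"
        "time"
    )

    // dial stands in for a blocking connection attempt (e.g. a gRPC dial).
    func dial() (string, error) {
        time.Sleep(100 * time.Millisecond)
        return "conn-1", nil
    }

    type client struct {
        mu      sync.Mutex
        conn    string
        lastErr error
    }

    // reconnect releases the lock around the blocking dial so Close and
    // readers of conn are never stuck behind a slow or hung dial.
    func (c *client) reconnect() {
        c.mu.Lock()
        // inspect or clean up state while holding the lock
        c.mu.Unlock()

        nc, err := dial() // blocking call runs without the lock held

        c.mu.Lock()
        defer c.mu.Unlock()
        if err == nil {
            c.conn = nc
        }
        c.lastErr = err
    }

    func main() {
        c := &client{}
        c.reconnect()
        fmt.Println(c.conn, c.lastErr)
    }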
@@ -101,25 +101,39 @@ func NewFromConfigFile(path string) (*Client, error) {
 }

 // Close shuts down the client's etcd connections.
-func (c *Client) Close() error {
+func (c *Client) Close() (err error) {
     c.mu.Lock()
     defer c.mu.Unlock()
+
+    // acquire the cancel
     if c.cancel == nil {
-        return nil
+        // already canceled
+        if c.lastConnErr != c.ctx.Err() {
+            err = c.lastConnErr
+        }
+        return
     }
-    c.cancel()
+    cancel := c.cancel
     c.cancel = nil
-    connc := c.newconnc
     c.mu.Unlock()
-    c.connStartRetry(nil)
+
+    // close watcher and lease before terminating connection
+    // so they don't retry on a closed client
     c.Watcher.Close()
     c.Lease.Close()
+
+    // cancel reconnection loop
+    cancel()
+    c.mu.Lock()
+    connc := c.newconnc
+    c.mu.Unlock()
+    // connc on cancel() is left closed
     <-connc
     c.mu.Lock()
     if c.lastConnErr != c.ctx.Err() {
-        return c.lastConnErr
+        err = c.lastConnErr
     }
-    return nil
+    return
 }

 // Ctx is a context for "out of band" messages (e.g., for sending
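
The rewritten Close above takes care to do its blocking work without holding the client mutex: the cancel function is captured and cleared under the lock, the watcher and lease are shut down, and only then is the reconnect loop canceled and waited on. A small hedged sketch of that capture-then-unlock ordering, using made-up names (svc, donec) rather than the etcd types:

    package main

    import (
        "fmt"
        "sync"
    )

    type svc struct {
        mu     sync.Mutex
        cancel func()        // nil once Close has run
        donec  chan struct{} // closed by the background loop when it exits
    }

    func (s *svc) Close() error {
        s.mu.Lock()
        if s.cancel == nil {
            s.mu.Unlock()
            return nil // already closed
        }
        cancel := s.cancel // capture under the lock
        s.cancel = nil
        s.mu.Unlock()

        // blocking shutdown steps run without the lock so the background
        // loop (which may take the lock itself) can make progress and exit
        cancel()
        <-s.donec
        return nil
    }

    func main() {
        done := make(chan struct{})
        s := &svc{cancel: func() { close(done) }, donec: done}
        fmt.Println(s.Close(), s.Close()) // second Close is a no-op
    }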
@@ -278,34 +292,48 @@ func newClient(cfg *Config) (*Client, error) {
 func (c *Client) ActiveConnection() *grpc.ClientConn {
     c.mu.RLock()
     defer c.mu.RUnlock()
+    if c.conn == nil {
+        panic("trying to return nil active connection")
+    }
     return c.conn
 }

 // retryConnection establishes a new connection
-func (c *Client) retryConnection(err error) (newConn *grpc.ClientConn, dialErr error) {
+func (c *Client) retryConnection(err error) {
+    oldconn := c.conn
+
+    // return holding lock so old connection can be cleaned up in this defer
+    defer func() {
+        if oldconn != nil {
+            oldconn.Close()
+            if st, _ := oldconn.State(); st != grpc.Shutdown {
+                // wait so grpc doesn't leak sleeping goroutines
+                oldconn.WaitForStateChange(context.Background(), st)
+            }
+        }
+        c.mu.Unlock()
+    }()
+
     c.mu.Lock()
-    defer c.mu.Unlock()
     if err != nil {
         c.errors = append(c.errors, err)
     }
-    if c.conn != nil {
-        c.conn.Close()
-        if st, _ := c.conn.State(); st != grpc.Shutdown {
-            // wait so grpc doesn't leak sleeping goroutines
-            c.conn.WaitForStateChange(context.Background(), st)
-        }
-        c.conn = nil
-    }
     if c.cancel == nil {
         // client has called Close() so don't try to dial out
-        return nil, c.ctx.Err()
+        return
     }
-    c.conn, dialErr = c.cfg.retryDialer(c)
+    c.mu.Unlock()
+
+    nc, dialErr := c.cfg.retryDialer(c)

+    c.mu.Lock()
+    if nc != nil {
+        c.conn = nc
+    }
     if dialErr != nil {
         c.errors = append(c.errors, dialErr)
     }
-    return c.conn, dialErr
+    c.lastConnErr = dialErr
 }

 // connStartRetry schedules a reconnect if one is not already running
@@ -321,17 +349,20 @@ func (c *Client) connStartRetry(err error) {

 // connWait waits for a reconnect to be processed
 func (c *Client) connWait(ctx context.Context, err error) (*grpc.ClientConn, error) {
-    c.mu.Lock()
+    c.mu.RLock()
     ch := c.newconnc
-    c.mu.Unlock()
+    c.mu.RUnlock()
     c.connStartRetry(err)
     select {
     case <-ctx.Done():
         return nil, ctx.Err()
     case <-ch:
     }
-    c.mu.Lock()
-    defer c.mu.Unlock()
+    c.mu.RLock()
+    defer c.mu.RUnlock()
+    if c.cancel == nil {
+        return c.conn, rpctypes.ErrConnClosed
+    }
     return c.conn, c.lastConnErr
 }

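connWait above snapshots the newconnc channel under a read lock, blocks on it with no lock held, then re-checks the client state once woken, returning rpctypes.ErrConnClosed if Close ran in the meantime. A sketch of that snapshot-and-recheck pattern under assumed names (waiter, notify) that are not the etcd API:

    package main

    import (
        "context"
        "errors"
        "fmt"
        "sync"
        "time"
    )

    var errClosed = errors.New("client closed")

    type waiter struct {
        mu     sync.RWMutex
        notify chan struct{} // closed (and replaced) whenever a reconnect finishes
        closed bool
        conn   string
    }

    // wait copies the channel under the read lock, blocks outside the lock,
    // then re-checks state after waking up.
    func (w *waiter) wait(ctx context.Context) (string, error) {
        w.mu.RLock()
        ch := w.notify
        w.mu.RUnlock()

        select {
        case <-ctx.Done():
            return "", ctx.Err()
        case <-ch:
        }

        w.mu.RLock()
        defer w.mu.RUnlock()
        if w.closed {
            return "", errClosed
        }
        return w.conn, nil
    }

    func main() {
        w := &waiter{notify: make(chan struct{})}
        go func() {
            time.Sleep(50 * time.Millisecond)
            w.mu.Lock()
            w.conn = "conn-1"
            close(w.notify)                // wake current waiters
            w.notify = make(chan struct{}) // arm for the next round
            w.mu.Unlock()
        }()
        fmt.Println(w.wait(context.Background()))
    }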
@@ -340,11 +371,8 @@ func (c *Client) connMonitor() {
     var err error

     defer func() {
-        _, err = c.retryConnection(c.ctx.Err())
-        c.mu.Lock()
-        c.lastConnErr = err
+        c.retryConnection(c.ctx.Err())
         close(c.newconnc)
-        c.mu.Unlock()
     }()

     limiter := rate.NewLimiter(rate.Every(minConnRetryWait), 1)
@@ -354,10 +382,8 @@ func (c *Client) connMonitor() {
         case <-c.ctx.Done():
             return
         }
-        conn, connErr := c.retryConnection(err)
+        c.retryConnection(err)
         c.mu.Lock()
-        c.lastConnErr = connErr
-        c.conn = conn
         close(c.newconnc)
         c.newconnc = make(chan struct{})
         c.reconnc = make(chan error, 1)
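
connMonitor keeps its rate.Limiter so reconnect attempts stay throttled even though retryConnection now publishes c.conn and c.lastConnErr itself. For reference, a self-contained sketch of a limiter-paced retry loop using golang.org/x/time/rate (the 500ms interval and the loop shape are arbitrary choices here, not etcd's values):

    package main

    import (
        "context"
        "fmt"
        "time"

        "golang.org/x/time/rate"
    )

    func main() {
        // At most one attempt per 500ms, with a burst of 1.
        limiter := rate.NewLimiter(rate.Every(500*time.Millisecond), 1)
        ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
        defer cancel()

        for attempt := 0; ; attempt++ {
            if err := limiter.Wait(ctx); err != nil {
                fmt.Println("stopping:", err) // context deadline ends the loop
                return
            }
            fmt.Println("reconnect attempt", attempt)
        }
    }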
@@ -131,6 +131,13 @@ func TestKVPutWithRequireLeader(t *testing.T) {
     if err != rpctypes.ErrNoLeader {
         t.Fatal(err)
     }
+
+    // clients may give timeout errors since the members are stopped; take
+    // the clients so that terminating the cluster won't complain
+    clus.Client(1).Close()
+    clus.Client(2).Close()
+    clus.TakeClient(1)
+    clus.TakeClient(2)
 }

 func TestKVRange(t *testing.T) {
@@ -633,13 +640,22 @@ func TestKVPutStoppedServerAndClose(t *testing.T) {
     defer testutil.AfterTest(t)
     clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
     defer clus.Terminate(t)
+    cli := clus.Client(0)
     clus.Members[0].Stop(t)
     // this Put fails and triggers an asynchronous connection retry
-    _, err := clus.Client(0).Put(context.TODO(), "abc", "123")
+    _, err := cli.Put(context.TODO(), "abc", "123")
     if err == nil ||
         (!strings.Contains(err.Error(), "connection is closing") &&
             !strings.Contains(err.Error(), "transport is closing")) {
         t.Fatal(err)
     }
-    // cluster will terminate and close the client with the retry in-flight
+
+    // wait some so the client closes with the retry in-flight
+    time.Sleep(time.Second)
+
+    // get the timeout
+    clus.TakeClient(0)
+    if err := cli.Close(); err == nil || !strings.Contains(err.Error(), "timed out") {
+        t.Fatal(err)
+    }
 }
@@ -74,7 +74,7 @@ func TestTxnWriteFail(t *testing.T) {

     dialTimeout := 5 * time.Second
     select {
-    case <-time.After(2*dialTimeout + time.Second):
+    case <-time.After(dialTimeout + time.Second):
         t.Fatalf("timed out waiting for txn to fail")
     case <-donec:
         // don't restart cluster until txn errors out
@@ -88,9 +88,11 @@ func (r *remoteClient) acquire(ctx context.Context) error {
     r.client.mu.RLock()
     closed := r.client.cancel == nil
     c := r.client.conn
+    lastConnErr := r.client.lastConnErr
     match := r.conn == c
     r.mu.Unlock()
-    if c != nil && match {
+    if lastConnErr == nil && match {
+        // new connection already
         return nil
     }
     r.client.mu.RUnlock()
@@ -20,7 +20,7 @@ import (
 )

 func TestTxnPanics(t *testing.T) {
-    kv := NewKV(&Client{})
+    kv := &kv{}

     errc := make(chan string)
     df := func() {
@@ -521,6 +521,9 @@ func (w *watcher) openWatchClient() (ws pb.Watch_WatchClient, err error) {
             return nil, v3rpc.Error(err)
         }
         w.rc.release()
+        if nerr := w.rc.reconnectWait(w.ctx, err); nerr != nil {
+            return nil, v3rpc.Error(nerr)
+        }
     }
     return ws, nil
 }
@@ -795,15 +795,7 @@ func (c *ClusterV3) Terminate(t *testing.T) {
 }

 func (c *ClusterV3) RandClient() *clientv3.Client {
-    for i := 0; i < 100; i++ {
-        cli := c.clients[rand.Intn(len(c.clients))]
-        if cli.ActiveConnection() == nil {
-            time.Sleep(10 * time.Millisecond)
-            continue
-        }
-        return cli
-    }
-    panic("failed to get a active client")
+    return c.clients[rand.Intn(len(c.clients))]
 }

 func (c *ClusterV3) Client(i int) *clientv3.Client {