mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00

In a [scheduled test][1], the error shows ``` 2023-04-19T11:16:15.8166316Z traffic.go:96: rpc error: code = Unavailable desc = keepalive ping failed to receive ACK within timeout ``` According to [grpc-keepalive@v1.51.0][2], each frame from server will fresh the `lastRead` and it won't file `Ping` frame to server. But the client used by [`tombstone` request][3] might hit the race. Since we use 5ms as timeout, the client might not receive the result of `Ping` from server in time. The keepalive will mark it timeout and close the connection. I didn't reproduce it in my local. If we add the sleep before update `lastRead`, it can reproduce it sometimes. Still investigating this part. ```diff diff --git a/internal/transport/http2_client.go b/internal/transport/http2_client.go index d518b07e..bee9c00a 100644 --- a/internal/transport/http2_client.go +++ b/internal/transport/http2_client.go @@ -1560,6 +1560,7 @@ func (t *http2Client) reader(errCh chan<- error) { t.controlBuf.throttle() frame, err := t.framer.fr.ReadFrame() if t.keepaliveEnabled { + time.Sleep(2 * time.Millisecond) atomic.StoreInt64(&t.lastRead, time.Now().UnixNano()) } if err != nil { ``` `DialKeepAliveTime` is always >= [10s][4]. I think we should increase the timeout to avoid flaky caused by unstable env. And in a [scheduled test][5], the error shows ``` logger.go:130: 2023-04-22T10:45:52.646Z INFO Failed to trigger failpoint {"failpoint": "blackhole", "error": "context deadline exceeded"} ``` Before sending `Status` to member, the client doesn't [pick][6] the connection in time (100ms) and returns the error. The `waitTillSnapshot` is used to ensure that it is good enough to trigger snapshot transfer. And we have 1min timeout for injectFailpoints, so I think we can remove the 100ms timeout to reduce unnecessary stop. ``` injectFailpoints(1min timeout) failpoint.Inject triggerBlockhole.Trigger blackhole waitTillSnapshot ``` > NOTE: I didn't reproduce it either. :( Reference: [1]: <https://github.com/etcd-io/etcd/actions/runs/4741737098/jobs/8419176899> [2]: <eeb9afa1f6/internal/transport/http2_client.go (L1647)
> [3]: <7450cd886d/tests/robustness/traffic.go (L94)
> [4]: <eeb9afa1f6/dialoptions.go (L445)
> [5]: <https://github.com/etcd-io/etcd/actions/runs/4772033408/jobs/8484334015> [6]: <eeb9afa1f6/clientconn.go (L932)
> REF: #15763 Signed-off-by: Wei Fu <fuweid89@gmail.com>
151 lines
4.5 KiB
Go
151 lines
4.5 KiB
Go
// Copyright 2022 The etcd Authors
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package robustness
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"go.uber.org/zap"
|
|
|
|
"go.etcd.io/etcd/api/v3/mvccpb"
|
|
clientv3 "go.etcd.io/etcd/client/v3"
|
|
"go.etcd.io/etcd/tests/v3/robustness/identity"
|
|
"go.etcd.io/etcd/tests/v3/robustness/model"
|
|
)
|
|
|
|
type recordingClient struct {
|
|
client clientv3.Client
|
|
history *model.AppendableHistory
|
|
baseTime time.Time
|
|
}
|
|
|
|
func NewClient(endpoints []string, ids identity.Provider, baseTime time.Time) (*recordingClient, error) {
|
|
cc, err := clientv3.New(clientv3.Config{
|
|
Endpoints: endpoints,
|
|
Logger: zap.NewNop(),
|
|
DialKeepAliveTime: 10 * time.Second,
|
|
DialKeepAliveTimeout: 100 * time.Millisecond,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &recordingClient{
|
|
client: *cc,
|
|
history: model.NewAppendableHistory(ids),
|
|
baseTime: baseTime,
|
|
}, nil
|
|
}
|
|
|
|
func (c *recordingClient) Close() error {
|
|
return c.client.Close()
|
|
}
|
|
|
|
func (c *recordingClient) Get(ctx context.Context, key string) ([]*mvccpb.KeyValue, error) {
|
|
callTime := time.Since(c.baseTime)
|
|
resp, err := c.client.Get(ctx, key)
|
|
returnTime := time.Since(c.baseTime)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
c.history.AppendGet(key, callTime, returnTime, resp)
|
|
return resp.Kvs, nil
|
|
}
|
|
|
|
func (c *recordingClient) Put(ctx context.Context, key, value string) error {
|
|
callTime := time.Since(c.baseTime)
|
|
resp, err := c.client.Put(ctx, key, value)
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendPut(key, value, callTime, returnTime, resp, err)
|
|
return err
|
|
}
|
|
|
|
func (c *recordingClient) Delete(ctx context.Context, key string) error {
|
|
callTime := time.Since(c.baseTime)
|
|
resp, err := c.client.Delete(ctx, key)
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendDelete(key, callTime, returnTime, resp, err)
|
|
return nil
|
|
}
|
|
|
|
func (c *recordingClient) CompareAndSet(ctx context.Context, key, expectedValue, newValue string) error {
|
|
callTime := time.Since(c.baseTime)
|
|
txn := c.client.Txn(ctx)
|
|
var cmp clientv3.Cmp
|
|
if expectedValue == "" {
|
|
cmp = clientv3.Compare(clientv3.CreateRevision(key), "=", 0)
|
|
} else {
|
|
cmp = clientv3.Compare(clientv3.Value(key), "=", expectedValue)
|
|
}
|
|
resp, err := txn.If(
|
|
cmp,
|
|
).Then(
|
|
clientv3.OpPut(key, newValue),
|
|
).Commit()
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendCompareAndSet(key, expectedValue, newValue, callTime, returnTime, resp, err)
|
|
return err
|
|
}
|
|
|
|
func (c *recordingClient) Txn(ctx context.Context, cmp []clientv3.Cmp, ops []clientv3.Op) error {
|
|
callTime := time.Since(c.baseTime)
|
|
txn := c.client.Txn(ctx)
|
|
resp, err := txn.If(
|
|
cmp...,
|
|
).Then(
|
|
ops...,
|
|
).Commit()
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendTxn(cmp, ops, callTime, returnTime, resp, err)
|
|
return err
|
|
}
|
|
|
|
func (c *recordingClient) LeaseGrant(ctx context.Context, ttl int64) (int64, error) {
|
|
callTime := time.Since(c.baseTime)
|
|
resp, err := c.client.Lease.Grant(ctx, ttl)
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendLeaseGrant(callTime, returnTime, resp, err)
|
|
var leaseId int64
|
|
if resp != nil {
|
|
leaseId = int64(resp.ID)
|
|
}
|
|
return leaseId, err
|
|
}
|
|
|
|
func (c *recordingClient) LeaseRevoke(ctx context.Context, leaseId int64) error {
|
|
callTime := time.Since(c.baseTime)
|
|
resp, err := c.client.Lease.Revoke(ctx, clientv3.LeaseID(leaseId))
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendLeaseRevoke(leaseId, callTime, returnTime, resp, err)
|
|
return err
|
|
}
|
|
|
|
func (c *recordingClient) PutWithLease(ctx context.Context, key string, value string, leaseId int64) error {
|
|
callTime := time.Since(c.baseTime)
|
|
opts := clientv3.WithLease(clientv3.LeaseID(leaseId))
|
|
resp, err := c.client.Put(ctx, key, value, opts)
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendPutWithLease(key, value, int64(leaseId), callTime, returnTime, resp, err)
|
|
return err
|
|
}
|
|
|
|
func (c *recordingClient) Defragment(ctx context.Context) error {
|
|
callTime := time.Since(c.baseTime)
|
|
resp, err := c.client.Defragment(ctx, c.client.Endpoints()[0])
|
|
returnTime := time.Since(c.baseTime)
|
|
c.history.AppendDefragment(callTime, returnTime, resp, err)
|
|
return err
|
|
}
|