From 09d053e0355c253a21d9a375e69a00c488f0162c Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Fri, 28 Apr 2023 14:13:03 +0800 Subject: [PATCH] tests/robustness: tune timeout policy In a [scheduled test][1], the error shows ``` 2023-04-19T11:16:15.8166316Z traffic.go:96: rpc error: code = Unavailable desc = keepalive ping failed to receive ACK within timeout ``` According to [grpc-keepalive@v1.51.0][2], each frame from the server will refresh the `lastRead` and it won't fire a `Ping` frame to the server. But the client used by the [`tombstone` request][3] might hit this race. Since we use 5ms as the timeout, the client might not receive the result of the `Ping` from the server in time. The keepalive logic will mark it as timed out and close the connection. I didn't reproduce it in my local environment. If we add a sleep before updating `lastRead`, the issue can sometimes be reproduced. Still investigating this part. ```diff diff --git a/internal/transport/http2_client.go b/internal/transport/http2_client.go index d518b07e..bee9c00a 100644 --- a/internal/transport/http2_client.go +++ b/internal/transport/http2_client.go @@ -1560,6 +1560,7 @@ func (t *http2Client) reader(errCh chan<- error) { t.controlBuf.throttle() frame, err := t.framer.fr.ReadFrame() if t.keepaliveEnabled { + time.Sleep(2 * time.Millisecond) atomic.StoreInt64(&t.lastRead, time.Now().UnixNano()) } if err != nil { ``` `DialKeepAliveTime` is always >= [10s][4]. I think we should increase the timeout to avoid flakiness caused by an unstable environment. And in a [scheduled test][5], the error shows ``` logger.go:130: 2023-04-22T10:45:52.646Z INFO Failed to trigger failpoint {"failpoint": "blackhole", "error": "context deadline exceeded"} ``` Before sending `Status` to the member, the client doesn't [pick][6] the connection in time (100ms) and returns the error. The `waitTillSnapshot` function is used to ensure that the state is good enough to trigger a snapshot transfer. And we have a 1min timeout for injectFailpoints, so I think we can remove the 100ms timeout to reduce unnecessary failures. 
``` injectFailpoints(1min timeout) failpoint.Inject triggerBlackhole.Trigger blackhole waitTillSnapshot ``` > NOTE: I didn't reproduce it either. :( Reference: [1]: [2]: [3]: [4]: [5]: [6]: REF: #15763 Signed-off-by: Wei Fu --- tests/robustness/client.go | 4 ++-- tests/robustness/failpoints.go | 24 +++++++++++------------- tests/robustness/watch.go | 4 ++-- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tests/robustness/client.go b/tests/robustness/client.go index 295a43d80..625d561aa 100644 --- a/tests/robustness/client.go +++ b/tests/robustness/client.go @@ -36,8 +36,8 @@ func NewClient(endpoints []string, ids identity.Provider, baseTime time.Time) (* cc, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return nil, err diff --git a/tests/robustness/failpoints.go b/tests/robustness/failpoints.go index cb77ba2ee..907d57692 100644 --- a/tests/robustness/failpoints.go +++ b/tests/robustness/failpoints.go @@ -132,8 +132,8 @@ func verifyClusterHealth(ctx context.Context, t *testing.T, clus *e2e.EtcdProces clusterClient, err := clientv3.New(clientv3.Config{ Endpoints: clus.Procs[i].EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return fmt.Errorf("Error creating client for cluster %s: %v", clus.Procs[i].Config().Name, err) @@ -298,8 +298,8 @@ func (t triggerDefrag) Trigger(_ *testing.T, ctx context.Context, member e2e.Etc cc, err := clientv3.New(clientv3.Config{ Endpoints: member.EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + 
DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return fmt.Errorf("failed creating client: %w", err) @@ -322,8 +322,8 @@ func (t triggerCompact) Trigger(_ *testing.T, ctx context.Context, member e2e.Et cc, err := clientv3.New(clientv3.Config{ Endpoints: member.EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return fmt.Errorf("failed creating client: %w", err) @@ -444,8 +444,8 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl clusterClient, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return err @@ -455,8 +455,8 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl blackholedMemberClient, err := clientv3.New(clientv3.Config{ Endpoints: []string{blackholedMember.Config().ClientURL}, Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return err @@ -491,9 +491,7 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl // latestRevisionForEndpoint gets latest revision of the first endpoint in Client.Endpoints list func latestRevisionForEndpoint(ctx context.Context, c *clientv3.Client) (int64, error) { - cntx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) - defer cancel() - resp, err := c.Status(cntx, c.Endpoints()[0]) + resp, err := c.Status(ctx, c.Endpoints()[0]) if err != nil { return 0, err } diff --git a/tests/robustness/watch.go b/tests/robustness/watch.go index 
c44d639f8..836e0ce2d 100644 --- a/tests/robustness/watch.go +++ b/tests/robustness/watch.go @@ -40,8 +40,8 @@ func collectClusterWatchEvents(ctx context.Context, t *testing.T, clus *e2e.Etcd c, err := clientv3.New(clientv3.Config{ Endpoints: member.EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { t.Fatal(err)