From 09d053e0355c253a21d9a375e69a00c488f0162c Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Fri, 28 Apr 2023 14:13:03 +0800 Subject: [PATCH] tests/robustness: tune timeout policy In a [scheduled test][1], the error shows ``` 2023-04-19T11:16:15.8166316Z traffic.go:96: rpc error: code = Unavailable desc = keepalive ping failed to receive ACK within timeout ``` According to [grpc-keepalive@v1.51.0][2], each frame from the server will refresh the `lastRead` and it won't fire a `Ping` frame to the server. But the client used by the [`tombstone` request][3] might hit this race. Since we use 5ms as the timeout, the client might not receive the result of the `Ping` from the server in time. The keepalive logic will mark it as timed out and close the connection. I didn't reproduce it in my local environment. If we add a sleep before updating `lastRead`, the issue can sometimes be reproduced. Still investigating this part. ```diff diff --git a/internal/transport/http2_client.go b/internal/transport/http2_client.go index d518b07e..bee9c00a 100644 --- a/internal/transport/http2_client.go +++ b/internal/transport/http2_client.go @@ -1560,6 +1560,7 @@ func (t *http2Client) reader(errCh chan<- error) { t.controlBuf.throttle() frame, err := t.framer.fr.ReadFrame() if t.keepaliveEnabled { + time.Sleep(2 * time.Millisecond) atomic.StoreInt64(&t.lastRead, time.Now().UnixNano()) } if err != nil { ``` `DialKeepAliveTime` is always >= [10s][4]. I think we should increase the timeout to avoid flakiness caused by an unstable environment. And in a [scheduled test][5], the error shows ``` logger.go:130: 2023-04-22T10:45:52.646Z INFO Failed to trigger failpoint {"failpoint": "blackhole", "error": "context deadline exceeded"} ``` Before sending `Status` to the member, the client doesn't [pick][6] the connection in time (100ms) and returns the error. The `waitTillSnapshot` function is used to ensure that the state is good enough to trigger a snapshot transfer. And we have a 1min timeout for injectFailpoints, so I think we can remove the 100ms timeout to reduce unnecessary failures. 
``` injectFailpoints(1min timeout) failpoint.Inject triggerBlackhole.Trigger blackhole waitTillSnapshot ``` > NOTE: I didn't reproduce it either. :( Reference: [1]: [2]: [3]: [4]: [5]: [6]: REF: #15763 Signed-off-by: Wei Fu --- tests/robustness/client.go | 4 ++-- tests/robustness/failpoints.go | 24 +++++++++++------------- tests/robustness/watch.go | 4 ++-- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tests/robustness/client.go b/tests/robustness/client.go index 295a43d80..625d561aa 100644 --- a/tests/robustness/client.go +++ b/tests/robustness/client.go @@ -36,8 +36,8 @@ func NewClient(endpoints []string, ids identity.Provider, baseTime time.Time) (* cc, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return nil, err diff --git a/tests/robustness/failpoints.go b/tests/robustness/failpoints.go index cb77ba2ee..907d57692 100644 --- a/tests/robustness/failpoints.go +++ b/tests/robustness/failpoints.go @@ -132,8 +132,8 @@ func verifyClusterHealth(ctx context.Context, t *testing.T, clus *e2e.EtcdProces clusterClient, err := clientv3.New(clientv3.Config{ Endpoints: clus.Procs[i].EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return fmt.Errorf("Error creating client for cluster %s: %v", clus.Procs[i].Config().Name, err) @@ -298,8 +298,8 @@ func (t triggerDefrag) Trigger(_ *testing.T, ctx context.Context, member e2e.Etc cc, err := clientv3.New(clientv3.Config{ Endpoints: member.EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + 
DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return fmt.Errorf("failed creating client: %w", err) @@ -322,8 +322,8 @@ func (t triggerCompact) Trigger(_ *testing.T, ctx context.Context, member e2e.Et cc, err := clientv3.New(clientv3.Config{ Endpoints: member.EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return fmt.Errorf("failed creating client: %w", err) @@ -444,8 +444,8 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl clusterClient, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return err @@ -455,8 +455,8 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl blackholedMemberClient, err := clientv3.New(clientv3.Config{ Endpoints: []string{blackholedMember.Config().ClientURL}, Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { return err @@ -491,9 +491,7 @@ func waitTillSnapshot(ctx context.Context, t *testing.T, clus *e2e.EtcdProcessCl // latestRevisionForEndpoint gets latest revision of the first endpoint in Client.Endpoints list func latestRevisionForEndpoint(ctx context.Context, c *clientv3.Client) (int64, error) { - cntx, cancel := context.WithTimeout(ctx, 100*time.Millisecond) - defer cancel() - resp, err := c.Status(cntx, c.Endpoints()[0]) + resp, err := c.Status(ctx, c.Endpoints()[0]) if err != nil { return 0, err } diff --git a/tests/robustness/watch.go b/tests/robustness/watch.go index 
c44d639f8..836e0ce2d 100644 --- a/tests/robustness/watch.go +++ b/tests/robustness/watch.go @@ -40,8 +40,8 @@ func collectClusterWatchEvents(ctx context.Context, t *testing.T, clus *e2e.Etcd c, err := clientv3.New(clientv3.Config{ Endpoints: member.EndpointsGRPC(), Logger: zap.NewNop(), - DialKeepAliveTime: 1 * time.Millisecond, - DialKeepAliveTimeout: 5 * time.Millisecond, + DialKeepAliveTime: 10 * time.Second, + DialKeepAliveTimeout: 100 * time.Millisecond, }) if err != nil { t.Fatal(err)