From 17b982382e110b201b15015e435c19e33cb7b4c1 Mon Sep 17 00:00:00 2001 From: Piotr Tabor Date: Fri, 16 Apr 2021 18:55:56 +0200 Subject: [PATCH] Fix TestSnapshotV3RestoreMultiMemberAdd flakes (leaks) - most important: unix's socket transport should not keep idle connections. For top-level Transport we close them using: https://github.com/etcd-io/etcd/blob/f3c518025e284d3d070a63d5969c69176f247b33/server/etcdserver/api/rafthttp/transport.go#L226 but currently we don't have access to close them witing the nest (unix) transport. Short idle deadline is good enough. - Use dialContext (instead of dial) to make sure context is passed down the stack - Make sure Context is cancelled as soon as the operation is done in pipeline - nit: use dedicated method to yeld goroutines. Tested with: ``` d=$(date +"%Y%m%d_%H%M") (cd tests && go test --timeout=60m ./integration/snapshot -run TestSnapshotV3RestoreMultiMemberAdd -v --count=180 2>&1 | tee log_${d}.log) ``` There were transports & cmux leaked: ``` leak.go:118: Test appears to have leaked a Transport: internal/poll.runtime_pollWait(0x7f6c5c3784c8, 0x72, 0xffffffffffffffff) /usr/lib/google-golang/src/runtime/netpoll.go:222 +0x55 internal/poll.(*pollDesc).wait(0xc003296298, 0x72, 0x0, 0x18, 0xffffffffffffffff) /usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:87 +0x45 internal/poll.(*pollDesc).waitRead(...) /usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:92 internal/poll.(*FD).Read(0xc003296280, 0xc0031f60a8, 0x18, 0x18, 0x0, 0x0, 0x0) /usr/lib/google-golang/src/internal/poll/fd_unix.go:166 +0x1d5 net.(*netFD).Read(0xc003296280, 0xc0031f60a8, 0x18, 0x18, 0x18, 0xc0009056e2, 0x203000) /usr/lib/google-golang/src/net/fd_posix.go:55 +0x4f net.(*conn).Read(0xc000010258, 0xc0031f60a8, 0x18, 0x18, 0x0, 0x0, 0x0) /usr/lib/google-golang/src/net/net.go:183 +0x91 github.com/soheilhy/cmux.(*bufferedReader).Read(0xc0003d24e0, 0xc0031f60a8, 0x18, 0x18, 0xc0003d24d0, 0xc0009056e2, 0xc000278400) /home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/buffer.go:53 +0x12d github.com/soheilhy/cmux.hasHTTP2Preface(0x1367e20, 0xc0003d24e0, 0x7f6c5c699f40) /home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/matchers.go:195 +0x8a github.com/soheilhy/cmux.matchersToMatchWriters.func1(0x7f6c5c699f40, 0xc000010258, 0x1367e20, 0xc0003d24e0, 0xc000010258) /home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:128 +0x39 github.com/soheilhy/cmux.(*cMux).serve(0xc003228690, 0x138c410, 0xc000010258, 0xc00327f740, 0xc0059ba860) /home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:192 +0x1e7 created by github.com/soheilhy/cmux.(*cMux).Serve /home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:179 +0x191 internal/poll.runtime_pollWait(0x7f6c5c60f3f0, 0x72, 0xffffffffffffffff) /usr/lib/google-golang/src/runtime/netpoll.go:222 +0x55 internal/poll.(*pollDesc).wait(0xc000d53018, 0x72, 0x1000, 0x1000, 0xffffffffffffffff) /usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:87 +0x45 internal/poll.(*pollDesc).waitRead(...) /usr/lib/google-golang/src/internal/poll/fd_poll_runtime.go:92 internal/poll.(*FD).Read(0xc000d53000, 0xc000cfd000, 0x1000, 0x1000, 0x0, 0x0, 0x0) /usr/lib/google-golang/src/internal/poll/fd_unix.go:166 +0x1d5 net.(*netFD).Read(0xc000d53000, 0xc000cfd000, 0x1000, 0x1000, 0x3, 0x3, 0x1000000000001) /usr/lib/google-golang/src/net/fd_posix.go:55 +0x4f net.(*conn).Read(0xc00031a570, 0xc000cfd000, 0x1000, 0x1000, 0x0, 0x0, 0x0) /usr/lib/google-golang/src/net/net.go:183 +0x91 net/http.(*persistConn).Read(0xc00093b320, 0xc000cfd000, 0x1000, 0x1000, 0x577750, 0x60, 0x0) /usr/lib/google-golang/src/net/http/transport.go:1933 +0x77 bufio.(*Reader).fill(0xc005702fc0) /usr/lib/google-golang/src/bufio/bufio.go:101 +0x108 bufio.(*Reader).Peek(0xc005702fc0, 0x1, 0xc00077c660, 0xc003b082a0, 0xc000d08de0, 0x5ae586, 0x11dd6c0) /usr/lib/google-golang/src/bufio/bufio.go:139 +0x4f net/http.(*persistConn).readLoop(0xc00093b320) /usr/lib/google-golang/src/net/http/transport.go:2094 +0x1a8 created by net/http.(*Transport).dialConn /usr/lib/google-golang/src/net/http/transport.go:1754 +0xdaa net/http.(*persistConn).writeLoop(0xc00093b320) /usr/lib/google-golang/src/net/http/transport.go:2393 +0xf7 created by net/http.(*Transport).dialConn /usr/lib/google-golang/src/net/http/transport.go:1755 +0xdcf sync.runtime_Semacquire(0xc0059ba868) /usr/lib/google-golang/src/runtime/sema.go:56 +0x45 sync.(*WaitGroup).Wait(0xc0059ba860) /usr/lib/google-golang/src/sync/waitgroup.go:130 +0x65 github.com/soheilhy/cmux.(*cMux).Serve.func1(0xc003228690, 0xc0059ba860) /home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:158 +0x56 github.com/soheilhy/cmux.(*cMux).Serve(0xc003228690, 0x13698c0, 0xc00377a0f0) /home/ptab/private/golang/pkg/mod/github.com/soheilhy/cmux@v0.1.5/cmux.go:173 +0x115 go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers.func1(0xc0007cc360, 0x122b75f) /home/ptab/corp/etcd/server/embed/etcd.go:518 +0x2b9 go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers.func3(0xc00036d080, 0xc0059330a0) /home/ptab/corp/etcd/server/embed/etcd.go:549 +0x182 created by go.etcd.io/etcd/server/v3/embed.(*Etcd).servePeers /home/ptab/corp/etcd/server/embed/etcd.go:543 +0x73a --- FAIL: TestSnapshotV3RestoreMultiMemberAdd (17.74s) ``` --- client/pkg/transport/transport.go | 18 ++++++++++++------ pkg/proxy/server.go | 4 +++- pkg/proxy/server_test.go | 3 ++- server/etcdserver/api/rafthttp/pipeline.go | 4 +++- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/client/pkg/transport/transport.go b/client/pkg/transport/transport.go index 949610d46..648512772 100644 --- a/client/pkg/transport/transport.go +++ b/client/pkg/transport/transport.go @@ -15,6 +15,7 @@ package transport import ( + "context" "net" "net/http" "strings" @@ -31,11 +32,11 @@ func NewTransport(info TLSInfo, dialtimeoutd time.Duration) (*http.Transport, er t := &http.Transport{ Proxy: http.ProxyFromEnvironment, - Dial: (&net.Dialer{ + DialContext: (&net.Dialer{ Timeout: dialtimeoutd, // value taken from http.DefaultTransport KeepAlive: 30 * time.Second, - }).Dial, + }).DialContext, // value taken from http.DefaultTransport TLSHandshakeTimeout: 10 * time.Second, TLSClientConfig: cfg, @@ -45,15 +46,20 @@ func NewTransport(info TLSInfo, dialtimeoutd time.Duration) (*http.Transport, er Timeout: dialtimeoutd, KeepAlive: 30 * time.Second, } - dial := func(net, addr string) (net.Conn, error) { - return dialer.Dial("unix", addr) - } + dialContext := func(ctx context.Context, net, addr string) (net.Conn, error) { + return dialer.DialContext(ctx, "unix", addr) + } tu := &http.Transport{ Proxy: http.ProxyFromEnvironment, - Dial: dial, + DialContext: dialContext, TLSHandshakeTimeout: 10 * time.Second, TLSClientConfig: cfg, + // Cost of reopening connection on sockets is low, and they are mostly used in testing. + // Long living unix-transport connections were leading to 'leak' test flakes. + // Alternativly the returned Transport (t) should override CloseIdleConnections to + // forward it to 'tu' as well. + IdleConnTimeout: time.Microsecond, } ut := &unixTransport{tu} diff --git a/pkg/proxy/server.go b/pkg/proxy/server.go index ebbdd2379..480a9492b 100644 --- a/pkg/proxy/server.go +++ b/pkg/proxy/server.go @@ -15,6 +15,7 @@ package proxy import ( + "context" "fmt" "io" mrand "math/rand" @@ -295,6 +296,7 @@ func (s *server) To() string { func (s *server) listenAndServe() { defer s.closeWg.Done() + ctx := context.Background() s.lg.Info("proxy is listening on", zap.String("from", s.From())) close(s.readyc) @@ -380,7 +382,7 @@ func (s *server) listenAndServe() { } continue } - out, err = tp.Dial(s.to.Scheme, s.to.Host) + out, err = tp.DialContext(ctx, s.to.Scheme, s.to.Host) } else { out, err = net.Dial(s.to.Scheme, s.to.Host) } diff --git a/pkg/proxy/server_test.go b/pkg/proxy/server_test.go index 77ee64320..c634055e6 100644 --- a/pkg/proxy/server_test.go +++ b/pkg/proxy/server_test.go @@ -16,6 +16,7 @@ package proxy import ( "bytes" + "context" "crypto/tls" "fmt" "io/ioutil" @@ -615,7 +616,7 @@ func send(t *testing.T, data []byte, scheme, addr string, tlsInfo transport.TLSI if terr != nil { t.Fatal(terr) } - out, err = tp.Dial(scheme, addr) + out, err = tp.DialContext(context.Background(), scheme, addr) } else { out, err = net.Dial(scheme, addr) } diff --git a/server/etcdserver/api/rafthttp/pipeline.go b/server/etcdserver/api/rafthttp/pipeline.go index 19facc852..de3b45911 100644 --- a/server/etcdserver/api/rafthttp/pipeline.go +++ b/server/etcdserver/api/rafthttp/pipeline.go @@ -19,6 +19,7 @@ import ( "context" "errors" "io/ioutil" + "runtime" "sync" "time" @@ -139,6 +140,7 @@ func (p *pipeline) post(data []byte) (err error) { go func() { select { case <-done: + cancel() case <-p.stopc: waitSchedule() cancel() @@ -173,4 +175,4 @@ func (p *pipeline) post(data []byte) (err error) { } // waitSchedule waits other goroutines to be scheduled for a while -func waitSchedule() { time.Sleep(time.Millisecond) } +func waitSchedule() { runtime.Gosched() }