From 4a9ad47bcc51370bc3bbf88991e67153b182ff62 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Sat, 11 Mar 2023 14:04:15 +0800 Subject: [PATCH 1/2] tests/integration: deflake #15409 The cluster version will be initialized after the member becomes leader. The update is handled asynchronously. It couldn't be updated if the member has been closed and the go-runtime picks the `s.stopping` channel first. ```go // https://github.com/etcd-io/etcd/blob/e2a5df534cf6b0a12859e61bfc2758d56a1e1971/server/etcdserver/server.go#L2170 func (s *EtcdServer) monitorClusterVersions() { ... for { select { case <-s.firstCommitInTerm.Receive(): case <-time.After(monitorVersionInterval): case <-s.stopping: return } ... } } ``` Or after the `s.stopping` has been closed, the [UpdateClusterVersion][1] won't fire GoAttach successfully. For #15409, we can see the warn log `server has stopped; skipping GoAttach` from GoAttach: ```plain https://github.com/etcd-io/etcd/actions/runs/4340931587/jobs/7580103902 logger.go:130: 2023-03-06T07:36:44.253Z WARN default stopping grpc server due to error {"error": "accept tcp 127.0.0.1:2379: use of closed network connection"} logger.go:130: 2023-03-06T07:36:44.253Z WARN default stopped grpc server due to error {"error": "accept tcp 127.0.0.1:2379: use of closed network connection"} logger.go:130: 2023-03-06T07:36:44.253Z ERROR default setting up serving from embedded etcd failed. {"error": "accept tcp 127.0.0.1:2379: use of closed network connection"} logger.go:130: 2023-03-06T07:36:44.253Z ERROR default setting up serving from embedded etcd failed. {"error": "http: Server closed"} logger.go:130: 2023-03-06T07:36:44.253Z INFO default skipped leadership transfer for single voting member cluster {"local-member-id": "8e9e05c52164694d", "current-leader-member-id": "8e9e05c52164694d"} logger.go:130: 2023-03-06T07:36:44.253Z WARN default server has stopped; skipping GoAttach ... 
``` If the cluster version isn't updated, the minimum storage version will be v3.5 because the [AuthStatus][2] was introduced in [v3.5][3]. The comparison will fail. To fix this issue, we should wait for the cluster version to become ready after the server is ready to serve requests. [1]: [2]: [3]: Signed-off-by: Wei Fu --- tests/integration/utl_wal_version_test.go | 41 +++++++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/tests/integration/utl_wal_version_test.go b/tests/integration/utl_wal_version_test.go index c0b75f9a5..179d6839c 100644 --- a/tests/integration/utl_wal_version_test.go +++ b/tests/integration/utl_wal_version_test.go @@ -28,6 +28,7 @@ import ( "go.etcd.io/etcd/server/v3/embed" "go.etcd.io/etcd/server/v3/storage/wal" "go.etcd.io/etcd/server/v3/storage/wal/walpb" + framecfg "go.etcd.io/etcd/tests/v3/framework/config" "go.etcd.io/etcd/tests/v3/framework/integration" ) @@ -45,28 +46,62 @@ func TestEtcdVersionFromWAL(t *testing.T) { t.Fatalf("failed to start embed.Etcd for test") } + // When the member becomes leader, it will update the cluster version + // with the cluster's minimum version. As it's updated asynchronously, + // it may not be updated in time before close. Wait for it to become + // ready. + if err := waitForClusterVersionReady(srv); err != nil { + srv.Close() + t.Fatalf("failed to wait for cluster version to become ready: %v", err) + } + ccfg := clientv3.Config{Endpoints: []string{cfg.ACUrls[0].String()}} cli, err := integration.NewClient(t, ccfg) if err != nil { srv.Close() t.Fatal(err) } - // Get auth status to increase etcd version of proto stored in wal + + // Once the cluster version has been updated, any entity's storage + // version should be aligned with the cluster version. 
ctx, cancel := context.WithTimeout(context.Background(), testutil.RequestTimeout) - cli.AuthStatus(ctx) + _, err = cli.AuthStatus(ctx) cancel() + if err != nil { + srv.Close() + t.Fatalf("failed to get auth status: %v", err) + } cli.Close() srv.Close() w, err := wal.Open(zap.NewNop(), cfg.Dir+"/member/wal", walpb.Snapshot{}) if err != nil { - panic(err) + t.Fatal(err) } defer w.Close() + walVersion, err := wal.ReadWALVersion(w) if err != nil { t.Fatal(err) } assert.Equal(t, &semver.Version{Major: 3, Minor: 6}, walVersion.MinimalEtcdVersion()) } + +func waitForClusterVersionReady(srv *embed.Etcd) error { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + + if srv.Server.ClusterVersion() != nil { + return nil + } + time.Sleep(framecfg.TickDuration) + } +} From 22bdc913025de198ef2759e9f534ed8dd555de66 Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Sat, 11 Mar 2023 14:31:15 +0800 Subject: [PATCH 2/2] server/etcdserver: add log for terminating monitors Adding logs for terminating monitors makes debugging easier. Signed-off-by: Wei Fu --- server/etcdserver/server.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/etcdserver/server.go b/server/etcdserver/server.go index 1d9ea9784..d3edf7cf8 100644 --- a/server/etcdserver/server.go +++ b/server/etcdserver/server.go @@ -2168,12 +2168,14 @@ func (s *EtcdServer) StorageVersion() *semver.Version { // monitorClusterVersions every monitorVersionInterval checks if it's the leader and updates cluster version if needed. 
func (s *EtcdServer) monitorClusterVersions() { - monitor := serverversion.NewMonitor(s.Logger(), NewServerVersionAdapter(s)) + lg := s.Logger() + monitor := serverversion.NewMonitor(lg, NewServerVersionAdapter(s)) for { select { case <-s.firstCommitInTerm.Receive(): case <-time.After(monitorVersionInterval): case <-s.stopping: + lg.Info("server has stopped; stopping cluster version's monitor") return } @@ -2189,12 +2191,14 @@ func (s *EtcdServer) monitorClusterVersions() { // monitorStorageVersion every monitorVersionInterval updates storage version if needed. func (s *EtcdServer) monitorStorageVersion() { - monitor := serverversion.NewMonitor(s.Logger(), NewServerVersionAdapter(s)) + lg := s.Logger() + monitor := serverversion.NewMonitor(lg, NewServerVersionAdapter(s)) for { select { case <-time.After(monitorVersionInterval): case <-s.clusterVersionChanged.Receive(): case <-s.stopping: + lg.Info("server has stopped; stopping storage version's monitor") return } monitor.UpdateStorageVersionIfNeeded() @@ -2218,6 +2222,7 @@ func (s *EtcdServer) monitorKVHash() { for { select { case <-s.stopping: + lg.Info("server has stopped; stopping kv hash's monitor") return case <-checkTicker.C: } @@ -2239,6 +2244,8 @@ func (s *EtcdServer) monitorCompactHash() { select { case <-time.After(t): case <-s.stopping: + lg := s.Logger() + lg.Info("server has stopped; stopping compact hash's monitor") return } if !s.isLeader() {