etcdmain: Honour ExperimentalWaitClusterReadyTimeout in startEtcd

When we couldn't reach quorum, we waited forever and never sent the systemd notify message. As a result, systemd would eventually time out and restart the etcd process, which would likely put the unhealthy cluster in an even worse state. Improves #13785. Signed-off-by: Nicolai Moore <niconorsk@gmail.com>
2024-09-27 06:25:44 +00:00 · 2022-08-26 10:05:33 +10:00
parent 96a2669839
commit e15bdd9df1
5 changed files with 66 additions and 15 deletions
--- a/server/etcdmain/etcd.go
+++ b/server/etcdmain/etcd.go
@@ -19,6 +19,7 @@ import (
 	"os"
 	"runtime"
 	"strings"
+	"time"

 	"go.etcd.io/etcd/client/pkg/v3/fileutil"
 	"go.etcd.io/etcd/client/pkg/v3/logutil"
@@ -207,6 +208,8 @@ func startEtcd(cfg *embed.Config) (<-chan struct{}, <-chan error, error) {
 	select {
 	case <-e.Server.ReadyNotify(): // wait for e.Server to join the cluster
 	case <-e.Server.StopNotify(): // publish aborted from 'ErrStopped'
+	case <-time.After(cfg.ExperimentalWaitClusterReadyTimeout):
+		e.GetLogger().Warn("startEtcd: timed out waiting for the ready notification")
 	}
 	return e.Server.StopNotify(), e.Err(), nil
 }