From 92f180c574044d6f4b73a187bc9d55efb4b7fb54 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 18 Mar 2020 10:56:23 -0700 Subject: [PATCH] *: log server-side /health checks To make it easier to root-cause when /health check fails. For example, we are using load balancer to health check each etcd instance, and when one etcd node gets terminated, it's hard to tell whether etcd "server" was really failing or client (or load balancer") failed to reach the etcd cluster which is also failure in load balancer health check. Signed-off-by: Gyuho Lee --- embed/etcd.go | 4 ++-- etcdmain/grpc_proxy.go | 4 ++-- etcdserver/api/etcdhttp/base.go | 8 +++++--- etcdserver/api/etcdhttp/metrics.go | 24 +++++++++++++++++------- etcdserver/api/v2http/client.go | 2 +- proxy/grpcproxy/health.go | 8 ++++++-- 6 files changed, 33 insertions(+), 17 deletions(-) diff --git a/embed/etcd.go b/embed/etcd.go index 4c02c1184..bf84b2567 100644 --- a/embed/etcd.go +++ b/embed/etcd.go @@ -629,7 +629,7 @@ func (e *Etcd) serveClients() (err error) { } } else { mux := http.NewServeMux() - etcdhttp.HandleBasic(mux, e.Server) + etcdhttp.HandleBasic(e.cfg.logger, mux, e.Server) h = mux } @@ -664,7 +664,7 @@ func (e *Etcd) serveMetrics() (err error) { if len(e.cfg.ListenMetricsUrls) > 0 { metricsMux := http.NewServeMux() - etcdhttp.HandleMetricsHealth(metricsMux, e.Server) + etcdhttp.HandleMetricsHealth(e.cfg.logger, metricsMux, e.Server) for _, murl := range e.cfg.ListenMetricsUrls { tlsInfo := &e.cfg.ClientTLSInfo diff --git a/etcdmain/grpc_proxy.go b/etcdmain/grpc_proxy.go index c4dedec4b..db6b0e686 100644 --- a/etcdmain/grpc_proxy.go +++ b/etcdmain/grpc_proxy.go @@ -204,7 +204,7 @@ func startGRPCProxy(cmd *cobra.Command, args []string) { go func() { mux := http.NewServeMux() grpcproxy.HandleMetrics(mux, httpClient, client.Endpoints()) - grpcproxy.HandleHealth(mux, client) + grpcproxy.HandleHealth(lg, mux, client) lg.Info("gRPC proxy server metrics URL serving") herr := http.Serve(mhttpl, mux) if herr != nil { @@ -381,7 +381,7 @@ func mustHTTPListener(lg *zap.Logger, m cmux.CMux, tlsinfo *transport.TLSInfo, c httpmux := http.NewServeMux() httpmux.HandleFunc("/", http.NotFound) grpcproxy.HandleMetrics(httpmux, httpClient, c.Endpoints()) - grpcproxy.HandleHealth(httpmux, c) + grpcproxy.HandleHealth(lg, httpmux, c) if grpcProxyEnablePprof { for p, h := range debugutil.PProfHandlers() { httpmux.Handle(p, h) diff --git a/etcdserver/api/etcdhttp/base.go b/etcdserver/api/etcdhttp/base.go index e520f81b5..d0c22e97e 100644 --- a/etcdserver/api/etcdhttp/base.go +++ b/etcdserver/api/etcdhttp/base.go @@ -25,7 +25,6 @@ import ( "go.etcd.io/etcd/etcdserver/api/v2error" "go.etcd.io/etcd/etcdserver/api/v2http/httptypes" "go.etcd.io/etcd/version" - "go.uber.org/zap" ) @@ -37,10 +36,13 @@ const ( // HandleBasic adds handlers to a mux for serving JSON etcd client requests // that do not access the v2 store. -func HandleBasic(mux *http.ServeMux, server etcdserver.ServerPeer) { +func HandleBasic(lg *zap.Logger, mux *http.ServeMux, server etcdserver.ServerPeer) { + if lg == nil { + lg = zap.NewNop() + } mux.HandleFunc(varsPath, serveVars) - HandleMetricsHealth(mux, server) + HandleMetricsHealth(lg, mux, server) mux.HandleFunc(versionPath, versionHandler(server.Cluster(), serveVersion)) } diff --git a/etcdserver/api/etcdhttp/metrics.go b/etcdserver/api/etcdhttp/metrics.go index 0fa2f75c2..8fd813dcb 100644 --- a/etcdserver/api/etcdhttp/metrics.go +++ b/etcdserver/api/etcdhttp/metrics.go @@ -20,12 +20,12 @@ import ( "net/http" "time" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "go.etcd.io/etcd/etcdserver" "go.etcd.io/etcd/etcdserver/etcdserverpb" "go.etcd.io/etcd/raft" - - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promhttp" + "go.uber.org/zap" ) const ( @@ -34,9 +34,9 @@ const ( ) // HandleMetricsHealth registers metrics and health handlers. -func HandleMetricsHealth(mux *http.ServeMux, srv etcdserver.ServerV2) { +func HandleMetricsHealth(lg *zap.Logger, mux *http.ServeMux, srv etcdserver.ServerV2) { mux.Handle(PathMetrics, promhttp.Handler()) - mux.Handle(PathHealth, NewHealthHandler(func() Health { return checkHealth(srv) })) + mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkHealth(lg, srv) })) } // HandlePrometheus registers prometheus handler on '/metrics'. @@ -45,21 +45,24 @@ func HandlePrometheus(mux *http.ServeMux) { } // NewHealthHandler handles '/health' requests. -func NewHealthHandler(hfunc func() Health) http.HandlerFunc { +func NewHealthHandler(lg *zap.Logger, hfunc func() Health) http.HandlerFunc { return func(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { w.Header().Set("Allow", http.MethodGet) http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed) + lg.Warn("/health error", zap.Int("status-code", http.StatusMethodNotAllowed)) return } h := hfunc() d, _ := json.Marshal(h) if h.Health != "true" { http.Error(w, string(d), http.StatusServiceUnavailable) + lg.Warn("/health error", zap.String("output", string(d)), zap.Int("status-code", http.StatusServiceUnavailable)) return } w.WriteHeader(http.StatusOK) w.Write(d) + lg.Info("/health OK", zap.Int("status-code", http.StatusOK)) } } @@ -91,7 +94,7 @@ type Health struct { // TODO: server NOSPACE, etcdserver.ErrNoLeader in health API -func checkHealth(srv etcdserver.ServerV2) (h Health) { +func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) { h.Health = "true" defer func() { @@ -105,11 +108,15 @@ func checkHealth(srv etcdserver.ServerV2) (h Health) { as := srv.Alarms() if len(as) > 0 { h.Health = "false" + for _, v := range as { + lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String())) + } return } if uint64(srv.Leader()) == raft.None { h.Health = "false" + lg.Warn("serving /health false; no leader") return } @@ -118,6 +125,9 @@ func checkHealth(srv etcdserver.ServerV2) (h Health) { cancel() if err != nil { h.Health = "false" + lg.Warn("serving /health false; QGET fails", zap.Error(err)) } + + lg.Info("serving /health true") return } diff --git a/etcdserver/api/v2http/client.go b/etcdserver/api/v2http/client.go index 005ff0b8a..a256894d6 100644 --- a/etcdserver/api/v2http/client.go +++ b/etcdserver/api/v2http/client.go @@ -57,7 +57,7 @@ func NewClientHandler(lg *zap.Logger, server etcdserver.ServerPeer, timeout time lg = zap.NewNop() } mux := http.NewServeMux() - etcdhttp.HandleBasic(mux, server) + etcdhttp.HandleBasic(lg, mux, server) handleV2(lg, mux, server, timeout) return requestLogger(lg, mux) } diff --git a/proxy/grpcproxy/health.go b/proxy/grpcproxy/health.go index 4756cff2e..9418502c0 100644 --- a/proxy/grpcproxy/health.go +++ b/proxy/grpcproxy/health.go @@ -22,11 +22,15 @@ import ( "go.etcd.io/etcd/clientv3" "go.etcd.io/etcd/etcdserver/api/etcdhttp" "go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes" + "go.uber.org/zap" ) // HandleHealth registers health handler on '/health'. -func HandleHealth(mux *http.ServeMux, c *clientv3.Client) { - mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(func() etcdhttp.Health { return checkHealth(c) })) +func HandleHealth(lg *zap.Logger, mux *http.ServeMux, c *clientv3.Client) { + if lg == nil { + lg = zap.NewNop() + } + mux.Handle(etcdhttp.PathHealth, etcdhttp.NewHealthHandler(lg, func() etcdhttp.Health { return checkHealth(c) })) } func checkHealth(c *clientv3.Client) etcdhttp.Health {