Merge pull request #12195 from tangcong/optimize-healthcheck

*: check health by using v3 range request and its corresponding timeout
This commit is contained in:
Gyuho Lee 2020-08-11 21:32:44 -07:00 committed by GitHub
commit fe36be2251
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 49 additions and 21 deletions

View File

@ -134,6 +134,7 @@ Note that any `etcd_debugging_*` metrics are experimental and subject to change.
- Improve [mvcc.watchResponse channel Memory Usage](https://github.com/etcd-io/etcd/pull/11987). - Improve [mvcc.watchResponse channel Memory Usage](https://github.com/etcd-io/etcd/pull/11987).
- Log [expensive request info in UnaryInterceptor](https://github.com/etcd-io/etcd/pull/12086). - Log [expensive request info in UnaryInterceptor](https://github.com/etcd-io/etcd/pull/12086).
- [Fix invalid Go type in etcdserverpb](https://github.com/etcd-io/etcd/pull/12000). - [Fix invalid Go type in etcdserverpb](https://github.com/etcd-io/etcd/pull/12000).
- [Improve healthcheck by using v3 range request and its corresponding timeout](https://github.com/etcd-io/etcd/pull/12195).
### Package `embed` ### Package `embed`

View File

@ -632,6 +632,7 @@ func (e *Etcd) serveClients() (err error) {
} else { } else {
mux := http.NewServeMux() mux := http.NewServeMux()
etcdhttp.HandleBasic(e.cfg.logger, mux, e.Server) etcdhttp.HandleBasic(e.cfg.logger, mux, e.Server)
etcdhttp.HandleMetricsHealthForV3(e.cfg.logger, mux, e.Server)
h = mux h = mux
} }
@ -666,7 +667,7 @@ func (e *Etcd) serveMetrics() (err error) {
if len(e.cfg.ListenMetricsUrls) > 0 { if len(e.cfg.ListenMetricsUrls) > 0 {
metricsMux := http.NewServeMux() metricsMux := http.NewServeMux()
etcdhttp.HandleMetricsHealth(e.cfg.logger, metricsMux, e.Server) etcdhttp.HandleMetricsHealthForV3(e.cfg.logger, metricsMux, e.Server)
for _, murl := range e.cfg.ListenMetricsUrls { for _, murl := range e.cfg.ListenMetricsUrls {
tlsInfo := &e.cfg.ClientTLSInfo tlsInfo := &e.cfg.ClientTLSInfo

View File

@ -37,12 +37,7 @@ const (
// HandleBasic adds handlers to a mux for serving JSON etcd client requests // HandleBasic adds handlers to a mux for serving JSON etcd client requests
// that do not access the v2 store. // that do not access the v2 store.
func HandleBasic(lg *zap.Logger, mux *http.ServeMux, server etcdserver.ServerPeer) { func HandleBasic(lg *zap.Logger, mux *http.ServeMux, server etcdserver.ServerPeer) {
if lg == nil {
lg = zap.NewNop()
}
mux.HandleFunc(varsPath, serveVars) mux.HandleFunc(varsPath, serveVars)
HandleMetricsHealth(lg, mux, server)
mux.HandleFunc(versionPath, versionHandler(server.Cluster(), serveVersion)) mux.HandleFunc(versionPath, versionHandler(server.Cluster(), serveVersion))
} }

View File

@ -17,6 +17,7 @@ package etcdhttp
import ( import (
"context" "context"
"encoding/json" "encoding/json"
"fmt"
"net/http" "net/http"
"time" "time"
@ -38,7 +39,14 @@ const (
// HandleMetricsHealth registers metrics and health handlers. // HandleMetricsHealth registers metrics and health handlers.
func HandleMetricsHealth(lg *zap.Logger, mux *http.ServeMux, srv etcdserver.ServerV2) { func HandleMetricsHealth(lg *zap.Logger, mux *http.ServeMux, srv etcdserver.ServerV2) {
mux.Handle(PathMetrics, promhttp.Handler()) mux.Handle(PathMetrics, promhttp.Handler())
mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkHealth(lg, srv) })) mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV2Health(lg, srv) }))
}
// HandleMetricsHealthForV3 registers metrics and health handlers. It checks health by using a v3 range request
// and its corresponding timeout.
func HandleMetricsHealthForV3(lg *zap.Logger, mux *http.ServeMux, srv *etcdserver.EtcdServer) {
mux.Handle(PathMetrics, promhttp.Handler())
mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV3Health(lg, srv) }))
} }
// HandlePrometheus registers prometheus handler on '/metrics'. // HandlePrometheus registers prometheus handler on '/metrics'.
@ -56,6 +64,13 @@ func NewHealthHandler(lg *zap.Logger, hfunc func() Health) http.HandlerFunc {
return return
} }
h := hfunc() h := hfunc()
defer func() {
if h.Health == "true" {
healthSuccess.Inc()
} else {
healthFailed.Inc()
}
}()
d, _ := json.Marshal(h) d, _ := json.Marshal(h)
if h.Health != "true" { if h.Health != "true" {
http.Error(w, string(d), http.StatusServiceUnavailable) http.Error(w, string(d), http.StatusServiceUnavailable)
@ -97,17 +112,9 @@ type Health struct {
// TODO: server NOSPACE, etcdserver.ErrNoLeader in health API // TODO: server NOSPACE, etcdserver.ErrNoLeader in health API
func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) { func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health {
h := Health{}
h.Health = "true" h.Health = "true"
defer func() {
if h.Health == "true" {
healthSuccess.Inc()
} else {
healthFailed.Inc()
}
}()
as := srv.Alarms() as := srv.Alarms()
if len(as) > 0 { if len(as) > 0 {
h.Health = "false" h.Health = "false"
@ -122,25 +129,48 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
} }
lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String())) lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String()))
} }
return return h
} }
if uint64(srv.Leader()) == raft.None { if uint64(srv.Leader()) == raft.None {
h.Health = "false" h.Health = "false"
h.Reason = "RAFT NO LEADER" h.Reason = "RAFT NO LEADER"
lg.Warn("serving /health false; no leader") lg.Warn("serving /health false; no leader")
return return h
}
return h
} }
func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
if h = checkHealth(lg, srv); h.Health != "true" {
return
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second) ctx, cancel := context.WithTimeout(context.Background(), time.Second)
_, err := srv.Do(ctx, etcdserverpb.Request{Method: "QGET"}) _, err := srv.Do(ctx, etcdserverpb.Request{Method: "QGET"})
cancel() cancel()
if err != nil { if err != nil {
h.Health = "false" h.Health = "false"
h.Reason = "QGET ERROR" h.Reason = fmt.Sprintf("QGET ERROR:%s", err)
lg.Warn("serving /health false; QGET fails", zap.Error(err)) lg.Warn("serving /health false; QGET fails", zap.Error(err))
return
}
lg.Info("serving /health true")
return
}
func checkV3Health(lg *zap.Logger, srv *etcdserver.EtcdServer) (h Health) {
if h = checkHealth(lg, srv); h.Health != "true" {
return
}
ctx, cancel := context.WithTimeout(context.Background(), srv.Cfg.ReqTimeout())
_, err := srv.Range(ctx, &etcdserverpb.RangeRequest{KeysOnly: true, Limit: 1})
cancel()
if err != nil {
h.Health = "false"
h.Reason = fmt.Sprintf("RANGE ERROR:%s", err)
lg.Warn("serving /health false; Range fails", zap.Error(err))
return
} }
lg.Info("serving /health true") lg.Info("serving /health true")
return return
} }

View File

@ -58,6 +58,7 @@ func NewClientHandler(lg *zap.Logger, server etcdserver.ServerPeer, timeout time
} }
mux := http.NewServeMux() mux := http.NewServeMux()
etcdhttp.HandleBasic(lg, mux, server) etcdhttp.HandleBasic(lg, mux, server)
etcdhttp.HandleMetricsHealth(lg, mux, server)
handleV2(lg, mux, server, timeout) handleV2(lg, mux, server, timeout)
return requestLogger(lg, mux) return requestLogger(lg, mux)
} }