etcdhttp/metrics.go: exclude alarms from health check conditionally with ?exclude=NOSPACE

This commit is contained in:
Chao Chen
2021-04-20 13:17:09 -07:00
parent f3c518025e
commit 140ea4fa29
4 changed files with 182 additions and 15 deletions

View File

@@ -39,14 +39,14 @@ const (
// HandleMetricsHealth registers metrics and health handlers.
func HandleMetricsHealth(lg *zap.Logger, mux *http.ServeMux, srv etcdserver.ServerV2) {
mux.Handle(PathMetrics, promhttp.Handler())
mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV2Health(lg, srv) }))
mux.Handle(PathHealth, NewHealthHandler(lg, func(excludedAlarms AlarmSet) Health { return checkV2Health(lg, srv, excludedAlarms) }))
}
// HandleMetricsHealthForV3 registers metrics and health handlers. it checks health by using v3 range request
// and its corresponding timeout.
func HandleMetricsHealthForV3(lg *zap.Logger, mux *http.ServeMux, srv *etcdserver.EtcdServer) {
mux.Handle(PathMetrics, promhttp.Handler())
mux.Handle(PathHealth, NewHealthHandler(lg, func() Health { return checkV3Health(lg, srv) }))
mux.Handle(PathHealth, NewHealthHandler(lg, func(excludedAlarms AlarmSet) Health { return checkV3Health(lg, srv, excludedAlarms) }))
}
// HandlePrometheus registers prometheus handler on '/metrics'.
@@ -55,7 +55,7 @@ func HandlePrometheus(mux *http.ServeMux) {
}
// NewHealthHandler handles '/health' requests.
func NewHealthHandler(lg *zap.Logger, hfunc func() Health) http.HandlerFunc {
func NewHealthHandler(lg *zap.Logger, hfunc func(excludedAlarms AlarmSet) Health) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
w.Header().Set("Allow", http.MethodGet)
@@ -63,7 +63,8 @@ func NewHealthHandler(lg *zap.Logger, hfunc func() Health) http.HandlerFunc {
lg.Warn("/health error", zap.Int("status-code", http.StatusMethodNotAllowed))
return
}
h := hfunc()
excludedAlarms := getExcludedAlarms(r)
h := hfunc(excludedAlarms)
defer func() {
if h.Health == "true" {
healthSuccess.Inc()
@@ -110,15 +111,38 @@ type Health struct {
Reason string `json:"reason"`
}
// TODO: server NOSPACE, etcdserver.ErrNoLeader in health API
type AlarmSet map[string]struct{}
func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health {
func getExcludedAlarms(r *http.Request) (alarms AlarmSet) {
alarms = make(map[string]struct{}, 2)
alms, found := r.URL.Query()["exclude"]
if found {
for _, alm := range alms {
if len(alms) == 0 {
continue
}
alarms[alm] = struct{}{}
}
}
return alarms
}
// TODO: etcdserver.ErrNoLeader in health API
func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2, excludedAlarms AlarmSet) Health {
h := Health{}
h.Health = "true"
as := srv.Alarms()
if len(as) > 0 {
h.Health = "false"
for _, v := range as {
alarmName := v.Alarm.String()
if _, found := excludedAlarms[alarmName]; found {
lg.Debug("/health excluded alarm", zap.String("alarm", alarmName))
delete(excludedAlarms, alarmName)
continue
}
h.Health = "false"
switch v.Alarm {
case etcdserverpb.AlarmType_NOSPACE:
h.Reason = "ALARM NOSPACE"
@@ -128,8 +152,12 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health {
h.Reason = "ALARM UNKNOWN"
}
lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String()))
return h
}
return h
}
if len(excludedAlarms) > 0 {
lg.Warn("fail exclude alarms from health check", zap.String("exclude alarms", fmt.Sprintf("%+v", excludedAlarms)))
}
if uint64(srv.Leader()) == raft.None {
@@ -141,8 +169,8 @@ func checkHealth(lg *zap.Logger, srv etcdserver.ServerV2) Health {
return h
}
func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
if h = checkHealth(lg, srv); h.Health != "true" {
func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2, excludedAlarms AlarmSet) (h Health) {
if h = checkHealth(lg, srv, excludedAlarms); h.Health != "true" {
return
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
@@ -158,8 +186,8 @@ func checkV2Health(lg *zap.Logger, srv etcdserver.ServerV2) (h Health) {
return
}
func checkV3Health(lg *zap.Logger, srv *etcdserver.EtcdServer) (h Health) {
if h = checkHealth(lg, srv); h.Health != "true" {
func checkV3Health(lg *zap.Logger, srv *etcdserver.EtcdServer, excludedAlarms AlarmSet) (h Health) {
if h = checkHealth(lg, srv, excludedAlarms); h.Health != "true" {
return
}
ctx, cancel := context.WithTimeout(context.Background(), srv.Cfg.ReqTimeout())