mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
196 lines
5.7 KiB
Go
196 lines
5.7 KiB
Go
// Copyright 2017 The etcd Authors
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package etcdhttp
|
|
|
|
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"go.etcd.io/etcd/api/v3/etcdserverpb"
	pb "go.etcd.io/etcd/api/v3/etcdserverpb"
	"go.etcd.io/etcd/client/pkg/v3/types"
	"go.etcd.io/etcd/raft/v3"
	"go.etcd.io/etcd/server/v3/auth"
	"go.etcd.io/etcd/server/v3/config"
	"go.uber.org/zap"
)
|
|
|
|
// URL paths of the health endpoints.
const (
	// PathHealth is the path HandleHealth registers the health handler under.
	PathHealth = "/health"
	// PathProxyHealth is the health path for the proxy.
	// NOTE(review): not referenced in this part of the file — confirm its users.
	PathProxyHealth = "/proxy/health"
)
|
|
|
|
type ServerHealth interface {
|
|
Alarms() []*pb.AlarmMember
|
|
Leader() types.ID
|
|
Range(context.Context, *pb.RangeRequest) (*pb.RangeResponse, error)
|
|
Config() config.ServerConfig
|
|
}
|
|
|
|
// HandleHealth registers metrics and health handlers. it checks health by using v3 range request
|
|
// and its corresponding timeout.
|
|
func HandleHealth(lg *zap.Logger, mux *http.ServeMux, srv ServerHealth) {
|
|
mux.Handle(PathHealth, NewHealthHandler(lg, func(excludedAlarms AlarmSet, serializable bool) Health {
|
|
if h := checkAlarms(lg, srv, excludedAlarms); h.Health != "true" {
|
|
return h
|
|
}
|
|
if h := checkLeader(lg, srv, serializable); h.Health != "true" {
|
|
return h
|
|
}
|
|
return checkAPI(lg, srv, serializable)
|
|
}))
|
|
}
|
|
|
|
// NewHealthHandler handles '/health' requests.
|
|
func NewHealthHandler(lg *zap.Logger, hfunc func(excludedAlarms AlarmSet, Serializable bool) Health) http.HandlerFunc {
|
|
return func(w http.ResponseWriter, r *http.Request) {
|
|
if r.Method != http.MethodGet {
|
|
w.Header().Set("Allow", http.MethodGet)
|
|
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
|
lg.Warn("/health error", zap.Int("status-code", http.StatusMethodNotAllowed))
|
|
return
|
|
}
|
|
excludedAlarms := getExcludedAlarms(r)
|
|
// Passing the query parameter "serializable=true" ensures that the
|
|
// health of the local etcd is checked vs the health of the cluster.
|
|
// This is useful for probes attempting to validate the liveness of
|
|
// the etcd process vs readiness of the cluster to serve requests.
|
|
serializableFlag := getSerializableFlag(r)
|
|
h := hfunc(excludedAlarms, serializableFlag)
|
|
defer func() {
|
|
if h.Health == "true" {
|
|
healthSuccess.Inc()
|
|
} else {
|
|
healthFailed.Inc()
|
|
}
|
|
}()
|
|
d, _ := json.Marshal(h)
|
|
if h.Health != "true" {
|
|
http.Error(w, string(d), http.StatusServiceUnavailable)
|
|
lg.Warn("/health error", zap.String("output", string(d)), zap.Int("status-code", http.StatusServiceUnavailable))
|
|
return
|
|
}
|
|
w.WriteHeader(http.StatusOK)
|
|
w.Write(d)
|
|
lg.Debug("/health OK", zap.Int("status-code", http.StatusOK))
|
|
}
|
|
}
|
|
|
|
var (
|
|
healthSuccess = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Namespace: "etcd",
|
|
Subsystem: "server",
|
|
Name: "health_success",
|
|
Help: "The total number of successful health checks",
|
|
})
|
|
healthFailed = prometheus.NewCounter(prometheus.CounterOpts{
|
|
Namespace: "etcd",
|
|
Subsystem: "server",
|
|
Name: "health_failures",
|
|
Help: "The total number of failed health checks",
|
|
})
|
|
)
|
|
|
|
func init() {
|
|
prometheus.MustRegister(healthSuccess)
|
|
prometheus.MustRegister(healthFailed)
|
|
}
|
|
|
|
// Health is the JSON document returned by the health endpoint.
// TODO: remove manual parsing in etcdctl cluster-health
type Health struct {
	// Health is "true" when the server is healthy and "false" otherwise.
	Health string `json:"health"`
	// Reason says why the server is unhealthy; it is left empty when healthy.
	Reason string `json:"reason"`
}
|
|
|
|
// AlarmSet is a set of alarm names, keyed by the alarm type's string form.
type AlarmSet map[string]struct{}
|
|
|
|
func getExcludedAlarms(r *http.Request) (alarms AlarmSet) {
|
|
alarms = make(map[string]struct{}, 2)
|
|
alms, found := r.URL.Query()["exclude"]
|
|
if found {
|
|
for _, alm := range alms {
|
|
if len(alm) == 0 {
|
|
continue
|
|
}
|
|
alarms[alm] = struct{}{}
|
|
}
|
|
}
|
|
return alarms
|
|
}
|
|
|
|
// getSerializableFlag reports whether the first "serializable" query
// parameter of r is exactly "true". Any other value, or no value at all,
// yields false.
func getSerializableFlag(r *http.Request) bool {
	query := r.URL.Query()
	return query.Get("serializable") == "true"
}
|
|
|
|
// TODO: etcdserver.ErrNoLeader in health API
|
|
|
|
func checkAlarms(lg *zap.Logger, srv ServerHealth, excludedAlarms AlarmSet) Health {
|
|
h := Health{Health: "true"}
|
|
as := srv.Alarms()
|
|
if len(as) > 0 {
|
|
for _, v := range as {
|
|
alarmName := v.Alarm.String()
|
|
if _, found := excludedAlarms[alarmName]; found {
|
|
lg.Debug("/health excluded alarm", zap.String("alarm", v.String()))
|
|
continue
|
|
}
|
|
|
|
h.Health = "false"
|
|
switch v.Alarm {
|
|
case etcdserverpb.AlarmType_NOSPACE:
|
|
h.Reason = "ALARM NOSPACE"
|
|
case etcdserverpb.AlarmType_CORRUPT:
|
|
h.Reason = "ALARM CORRUPT"
|
|
default:
|
|
h.Reason = "ALARM UNKNOWN"
|
|
}
|
|
lg.Warn("serving /health false due to an alarm", zap.String("alarm", v.String()))
|
|
return h
|
|
}
|
|
}
|
|
|
|
return h
|
|
}
|
|
|
|
func checkLeader(lg *zap.Logger, srv ServerHealth, serializable bool) Health {
|
|
h := Health{Health: "true"}
|
|
if !serializable && (uint64(srv.Leader()) == raft.None) {
|
|
h.Health = "false"
|
|
h.Reason = "RAFT NO LEADER"
|
|
lg.Warn("serving /health false; no leader")
|
|
}
|
|
return h
|
|
}
|
|
|
|
func checkAPI(lg *zap.Logger, srv ServerHealth, serializable bool) Health {
|
|
h := Health{Health: "true"}
|
|
cfg := srv.Config()
|
|
ctx, cancel := context.WithTimeout(context.Background(), cfg.ReqTimeout())
|
|
_, err := srv.Range(ctx, &etcdserverpb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable})
|
|
cancel()
|
|
if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied {
|
|
h.Health = "false"
|
|
h.Reason = fmt.Sprintf("RANGE ERROR:%s", err)
|
|
lg.Warn("serving /health false; Range fails", zap.Error(err))
|
|
return h
|
|
}
|
|
lg.Debug("serving /health true")
|
|
return h
|
|
}
|