mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
server: Split metrics and health code
Signed-off-by: Siyuan Zhang <sizhang@google.com>
This commit is contained in:
parent
cc44646a2e
commit
4a8381a461
@ -773,7 +773,8 @@ func (e *Etcd) serveClients() (err error) {
|
|||||||
} else {
|
} else {
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
etcdhttp.HandleBasic(mux, e.Server)
|
etcdhttp.HandleBasic(mux, e.Server)
|
||||||
etcdhttp.HandleMetricsHealth(mux, e.Server)
|
etcdhttp.HandleMetrics(mux)
|
||||||
|
etcdhttp.HandleHealth(mux, e.Server)
|
||||||
h = mux
|
h = mux
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -862,7 +863,8 @@ func (e *Etcd) serveMetrics() (err error) {
|
|||||||
|
|
||||||
if len(e.cfg.ListenMetricsUrls) > 0 {
|
if len(e.cfg.ListenMetricsUrls) > 0 {
|
||||||
metricsMux := http.NewServeMux()
|
metricsMux := http.NewServeMux()
|
||||||
etcdhttp.HandleMetricsHealth(metricsMux, e.Server)
|
etcdhttp.HandleMetrics(metricsMux)
|
||||||
|
etcdhttp.HandleHealth(metricsMux, e.Server)
|
||||||
|
|
||||||
for _, murl := range e.cfg.ListenMetricsUrls {
|
for _, murl := range e.cfg.ListenMetricsUrls {
|
||||||
tlsInfo := &e.cfg.ClientTLSInfo
|
tlsInfo := &e.cfg.ClientTLSInfo
|
||||||
|
@ -542,7 +542,7 @@ func startProxy(cfg *config) error {
|
|||||||
plog.Infof("v2 proxy started listening on client requests on %q", host)
|
plog.Infof("v2 proxy started listening on client requests on %q", host)
|
||||||
}
|
}
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
etcdhttp.HandlePrometheus(mux) // v2 proxy just uses the same port
|
etcdhttp.HandleMetrics(mux) // v2 proxy just uses the same port
|
||||||
mux.Handle("/", ph)
|
mux.Handle("/", ph)
|
||||||
plog.Fatal(http.Serve(l, mux))
|
plog.Fatal(http.Serve(l, mux))
|
||||||
}()
|
}()
|
||||||
|
223
etcdserver/api/etcdhttp/health.go
Normal file
223
etcdserver/api/etcdhttp/health.go
Normal file
@ -0,0 +1,223 @@
|
|||||||
|
// Copyright 2017 The etcd Authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package etcdhttp
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"go.etcd.io/etcd/auth"
|
||||||
|
"go.etcd.io/etcd/etcdserver"
|
||||||
|
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
|
||||||
|
"go.etcd.io/etcd/pkg/types"
|
||||||
|
"go.etcd.io/etcd/raft"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
PathHealth = "/health"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ServerHealth is the server surface required by the v3 health check: the
// alarm and leader accessors of serverHealthV2V3 plus Range and Config.
type ServerHealth interface {
|
||||||
|
serverHealthV2V3
|
||||||
|
// Range is called by checkAPI as a one-key, keys-only probe read.
Range(context.Context, *pb.RangeRequest) (*pb.RangeResponse, error)
|
||||||
|
// Config is read by checkAPI, which uses its ReqTimeout as the probe deadline.
Config() etcdserver.ServerConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
// serverHealthV2V3 is the subset of server state shared by the v2 and v3
// health checks (see checkAlarms and checkLeader).
type serverHealthV2V3 interface {
|
||||||
|
// Alarms is read by checkAlarms; any alarm not excluded by the caller fails the check.
Alarms() []*pb.AlarmMember
|
||||||
|
// Leader is compared against raft.None by checkLeader.
Leader() types.ID
|
||||||
|
}
|
||||||
|
|
||||||
|
// HandleHealthForV2 registers metrics and health handlers for v2.
|
||||||
|
func HandleHealthForV2(mux *http.ServeMux, srv etcdserver.ServerV2) {
|
||||||
|
mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health {
|
||||||
|
if h := checkAlarms(srv, excludedAlarms); h.Health != "true" {
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
if h := checkLeader(srv, serializable); h.Health != "true" {
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
return checkV2API(srv)
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
// HandleHealth registers metrics and health handlers. it checks health by using v3 range request
|
||||||
|
// and its corresponding timeout.
|
||||||
|
func HandleHealth(mux *http.ServeMux, srv ServerHealth) {
|
||||||
|
mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health {
|
||||||
|
if h := checkAlarms(srv, excludedAlarms); h.Health != "true" {
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
if h := checkLeader(srv, serializable); h.Health != "true" {
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
return checkAPI(srv, serializable)
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewHealthHandler handles '/health' requests.
|
||||||
|
func NewHealthHandler(hfunc func(excludedAlarms AlarmSet, serializable bool) Health) http.HandlerFunc {
|
||||||
|
return func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if r.Method != http.MethodGet {
|
||||||
|
w.Header().Set("Allow", http.MethodGet)
|
||||||
|
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
||||||
|
plog.Warningf("/health error (status code %d)", http.StatusMethodNotAllowed)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
excludedAlarms := getExcludedAlarms(r)
|
||||||
|
// Passing the query parameter "serializable=true" ensures that the
|
||||||
|
// health of the local etcd is checked vs the health of the cluster.
|
||||||
|
// This is useful for probes attempting to validate the liveness of
|
||||||
|
// the etcd process vs readiness of the cluster to serve requests.
|
||||||
|
serializableFlag := getSerializableFlag(r)
|
||||||
|
h := hfunc(excludedAlarms, serializableFlag)
|
||||||
|
defer func() {
|
||||||
|
if h.Health == "true" {
|
||||||
|
healthSuccess.Inc()
|
||||||
|
} else {
|
||||||
|
healthFailed.Inc()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
d, _ := json.Marshal(h)
|
||||||
|
if h.Health != "true" {
|
||||||
|
http.Error(w, string(d), http.StatusServiceUnavailable)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
w.Write(d)
|
||||||
|
plog.Debugf("/health OK (status code %d)", http.StatusOK)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var (
|
||||||
|
healthSuccess = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Namespace: "etcd",
|
||||||
|
Subsystem: "server",
|
||||||
|
Name: "health_success",
|
||||||
|
Help: "The total number of successful health checks",
|
||||||
|
})
|
||||||
|
healthFailed = prometheus.NewCounter(prometheus.CounterOpts{
|
||||||
|
Namespace: "etcd",
|
||||||
|
Subsystem: "server",
|
||||||
|
Name: "health_failures",
|
||||||
|
Help: "The total number of failed health checks",
|
||||||
|
})
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
prometheus.MustRegister(healthSuccess)
|
||||||
|
prometheus.MustRegister(healthFailed)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Health defines etcd server health status.
|
||||||
|
// TODO: remove manual parsing in etcdctl cluster-health
|
||||||
|
type Health struct {
|
||||||
|
// Health is the string "true" when every check passed and "false" otherwise.
Health string `json:"health"`
|
||||||
|
// Reason describes the failing check; the `json:"-"` tag keeps it out of the response body.
Reason string `json:"-"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// AlarmSet is a set of alarm names, keyed by the alarm's string form.
type AlarmSet map[string]struct{}

// getExcludedAlarms collects the alarm names the caller asked to ignore via
// repeated "exclude" query parameters (e.g. "?exclude=NOSPACE&exclude=CORRUPT").
//
// Empty values ("?exclude=") are skipped. The returned set is never nil,
// even when the request carries no "exclude" parameter.
func getExcludedAlarms(r *http.Request) AlarmSet {
	alarms := make(AlarmSet, 2)
	// Ranging over a missing key yields no iterations, so no presence check is needed.
	for _, alm := range r.URL.Query()["exclude"] {
		// Test the individual value, not the slice being iterated.
		if alm == "" {
			continue
		}
		alarms[alm] = struct{}{}
	}
	return alarms
}
|
||||||
|
|
||||||
|
// getSerializableFlag reports whether the request's first "serializable"
// query parameter is exactly the string "true".
func getSerializableFlag(r *http.Request) bool {
	query := r.URL.Query()
	flag := query.Get("serializable")
	return flag == "true"
}
|
||||||
|
|
||||||
|
// TODO: etcdserver.ErrNoLeader in health API
|
||||||
|
|
||||||
|
func checkAlarms(srv serverHealthV2V3, excludedAlarms AlarmSet) Health {
|
||||||
|
h := Health{Health: "true"}
|
||||||
|
as := srv.Alarms()
|
||||||
|
if len(as) > 0 {
|
||||||
|
for _, v := range as {
|
||||||
|
alarmName := v.Alarm.String()
|
||||||
|
if _, found := excludedAlarms[alarmName]; found {
|
||||||
|
plog.Debugf("/health excluded alarm %s", v.String())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
h.Health = "false"
|
||||||
|
switch v.Alarm {
|
||||||
|
case pb.AlarmType_NOSPACE:
|
||||||
|
h.Reason = "ALARM NOSPACE"
|
||||||
|
case pb.AlarmType_CORRUPT:
|
||||||
|
h.Reason = "ALARM CORRUPT"
|
||||||
|
default:
|
||||||
|
h.Reason = "ALARM UNKNOWN"
|
||||||
|
}
|
||||||
|
plog.Warningf("/health error due to %s", v.String())
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkLeader(srv serverHealthV2V3, serializable bool) Health {
|
||||||
|
h := Health{Health: "true"}
|
||||||
|
if !serializable && (uint64(srv.Leader()) == raft.None) {
|
||||||
|
h.Health = "false"
|
||||||
|
h.Reason = "RAFT NO LEADER"
|
||||||
|
plog.Warningf("/health error; no leader (status code %d)", http.StatusServiceUnavailable)
|
||||||
|
}
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkV2API(srv etcdserver.ServerV2) Health {
|
||||||
|
h := Health{Health: "true"}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
||||||
|
_, err := srv.Do(ctx, pb.Request{Method: "QGET"})
|
||||||
|
cancel()
|
||||||
|
if err != nil {
|
||||||
|
h.Health = "false"
|
||||||
|
h.Reason = fmt.Sprintf("QGET ERROR:%s", err)
|
||||||
|
plog.Warningf("/health error; QGET failed %v (status code %d)", err, http.StatusServiceUnavailable)
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkAPI(srv ServerHealth, serializable bool) Health {
|
||||||
|
h := Health{Health: "true"}
|
||||||
|
cfg := srv.Config()
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), cfg.ReqTimeout())
|
||||||
|
_, err := srv.Range(ctx, &pb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable})
|
||||||
|
cancel()
|
||||||
|
if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied {
|
||||||
|
h.Health = "false"
|
||||||
|
h.Reason = fmt.Sprintf("RANGE ERROR:%s", err)
|
||||||
|
plog.Warningf("serving /health false; Range failed %v (status code %d)", err, http.StatusServiceUnavailable)
|
||||||
|
return h
|
||||||
|
}
|
||||||
|
return h
|
||||||
|
}
|
@ -19,29 +19,20 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"go.etcd.io/etcd/auth"
|
"go.etcd.io/etcd/auth"
|
||||||
"go.etcd.io/etcd/etcdserver"
|
"go.etcd.io/etcd/etcdserver"
|
||||||
stats "go.etcd.io/etcd/etcdserver/api/v2stats"
|
|
||||||
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
|
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
|
||||||
"go.etcd.io/etcd/pkg/testutil"
|
"go.etcd.io/etcd/pkg/testutil"
|
||||||
"go.etcd.io/etcd/pkg/types"
|
"go.etcd.io/etcd/pkg/types"
|
||||||
"go.etcd.io/etcd/raft"
|
"go.etcd.io/etcd/raft"
|
||||||
)
|
)
|
||||||
|
|
||||||
type fakeStats struct{}
|
|
||||||
|
|
||||||
func (s *fakeStats) SelfStats() []byte { return nil }
|
|
||||||
func (s *fakeStats) LeaderStats() []byte { return nil }
|
|
||||||
func (s *fakeStats) StoreStats() []byte { return nil }
|
|
||||||
|
|
||||||
type fakeHealthServer struct {
|
type fakeHealthServer struct {
|
||||||
fakeServer
|
fakeServer
|
||||||
stats.Stats
|
|
||||||
health string
|
health string
|
||||||
apiError error
|
apiError error
|
||||||
}
|
}
|
||||||
@ -130,18 +121,21 @@ func TestHealthHandler(t *testing.T) {
|
|||||||
expectHealth: "true",
|
expectHealth: "true",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
name: "Healthy even if authentication failed",
|
||||||
healthCheckURL: "/health",
|
healthCheckURL: "/health",
|
||||||
apiError: auth.ErrUserEmpty,
|
apiError: auth.ErrUserEmpty,
|
||||||
expectStatusCode: http.StatusOK,
|
expectStatusCode: http.StatusOK,
|
||||||
expectHealth: "true",
|
expectHealth: "true",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
name: "Healthy even if authorization failed",
|
||||||
healthCheckURL: "/health",
|
healthCheckURL: "/health",
|
||||||
apiError: auth.ErrPermissionDenied,
|
apiError: auth.ErrPermissionDenied,
|
||||||
expectStatusCode: http.StatusOK,
|
expectStatusCode: http.StatusOK,
|
||||||
expectHealth: "true",
|
expectHealth: "true",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
name: "Unhealthy if api is not available",
|
||||||
healthCheckURL: "/health",
|
healthCheckURL: "/health",
|
||||||
apiError: fmt.Errorf("Unexpected error"),
|
apiError: fmt.Errorf("Unexpected error"),
|
||||||
expectStatusCode: http.StatusServiceUnavailable,
|
expectStatusCode: http.StatusServiceUnavailable,
|
||||||
@ -149,12 +143,11 @@ func TestHealthHandler(t *testing.T) {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, tt := range tests {
|
for _, tt := range tests {
|
||||||
t.Run(tt.name, func(t *testing.T) {
|
t.Run(tt.name, func(t *testing.T) {
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
HandleMetricsHealth(mux, &fakeHealthServer{
|
HandleHealth(mux, &fakeHealthServer{
|
||||||
fakeServer: fakeServer{alarms: tt.alarms},
|
fakeServer: fakeServer{alarms: tt.alarms},
|
||||||
Stats: &fakeStats{},
|
|
||||||
health: tt.expectHealth,
|
health: tt.expectHealth,
|
||||||
apiError: tt.apiError,
|
apiError: tt.apiError,
|
||||||
})
|
})
|
||||||
@ -163,14 +156,14 @@ func TestHealthHandler(t *testing.T) {
|
|||||||
|
|
||||||
res, err := ts.Client().Do(&http.Request{Method: http.MethodGet, URL: testutil.MustNewURL(t, ts.URL+tt.healthCheckURL)})
|
res, err := ts.Client().Do(&http.Request{Method: http.MethodGet, URL: testutil.MustNewURL(t, ts.URL+tt.healthCheckURL)})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Errorf("fail serve http request %s %v in test case #%d", tt.healthCheckURL, err, i+1)
|
t.Errorf("fail serve http request %s %v", tt.healthCheckURL, err)
|
||||||
}
|
}
|
||||||
if res == nil {
|
if res == nil {
|
||||||
t.Errorf("got nil http response with http request %s in test case #%d", tt.healthCheckURL, i+1)
|
t.Errorf("got nil http response with http request %s", tt.healthCheckURL)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if res.StatusCode != tt.expectStatusCode {
|
if res.StatusCode != tt.expectStatusCode {
|
||||||
t.Errorf("want statusCode %d but got %d in test case #%d", tt.expectStatusCode, res.StatusCode, i+1)
|
t.Errorf("want statusCode %d but got %d", tt.expectStatusCode, res.StatusCode)
|
||||||
}
|
}
|
||||||
health, err := parseHealthOutput(res.Body)
|
health, err := parseHealthOutput(res.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -185,7 +178,7 @@ func TestHealthHandler(t *testing.T) {
|
|||||||
|
|
||||||
func parseHealthOutput(body io.Reader) (Health, error) {
|
func parseHealthOutput(body io.Reader) (Health, error) {
|
||||||
obj := Health{}
|
obj := Health{}
|
||||||
d, derr := ioutil.ReadAll(body)
|
d, derr := io.ReadAll(body)
|
||||||
if derr != nil {
|
if derr != nil {
|
||||||
return obj, derr
|
return obj, derr
|
||||||
}
|
}
|
@ -15,218 +15,16 @@
|
|||||||
package etcdhttp
|
package etcdhttp
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
|
||||||
"encoding/json"
|
|
||||||
"fmt"
|
|
||||||
"net/http"
|
"net/http"
|
||||||
"time"
|
|
||||||
|
|
||||||
"go.etcd.io/etcd/auth"
|
|
||||||
"go.etcd.io/etcd/etcdserver"
|
|
||||||
pb "go.etcd.io/etcd/etcdserver/etcdserverpb"
|
|
||||||
"go.etcd.io/etcd/pkg/types"
|
|
||||||
"go.etcd.io/etcd/raft"
|
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
PathMetrics = "/metrics"
|
PathMetrics = "/metrics"
|
||||||
PathHealth = "/health"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type ServerHealth interface {
|
// HandleMetrics registers prometheus handler on '/metrics'.
|
||||||
serverHealthV2V3
|
func HandleMetrics(mux *http.ServeMux) {
|
||||||
Range(context.Context, *pb.RangeRequest) (*pb.RangeResponse, error)
|
|
||||||
Config() etcdserver.ServerConfig
|
|
||||||
}
|
|
||||||
|
|
||||||
type serverHealthV2V3 interface {
|
|
||||||
Alarms() []*pb.AlarmMember
|
|
||||||
Leader() types.ID
|
|
||||||
}
|
|
||||||
|
|
||||||
// HandleMetricsHealthForV2 registers metrics and health handlers for v2.
|
|
||||||
func HandleMetricsHealthForV2(mux *http.ServeMux, srv etcdserver.ServerV2) {
|
|
||||||
mux.Handle(PathMetrics, promhttp.Handler())
|
|
||||||
mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health {
|
|
||||||
if h := checkAlarms(srv, excludedAlarms); h.Health != "true" {
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
if h := checkLeader(srv, serializable); h.Health != "true" {
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
return checkV2API(srv)
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
// HandleMetricsHealth registers metrics and health handlers. it checks health by using v3 range request
|
|
||||||
// and its corresponding timeout.
|
|
||||||
func HandleMetricsHealth(mux *http.ServeMux, srv ServerHealth) {
|
|
||||||
mux.Handle(PathMetrics, promhttp.Handler())
|
|
||||||
mux.Handle(PathHealth, NewHealthHandler(func(excludedAlarms AlarmSet, serializable bool) Health {
|
|
||||||
if h := checkAlarms(srv, excludedAlarms); h.Health != "true" {
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
if h := checkLeader(srv, serializable); h.Health != "true" {
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
return checkAPI(srv, serializable)
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
// HandlePrometheus registers prometheus handler on '/metrics'.
|
|
||||||
func HandlePrometheus(mux *http.ServeMux) {
|
|
||||||
mux.Handle(PathMetrics, promhttp.Handler())
|
mux.Handle(PathMetrics, promhttp.Handler())
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewHealthHandler handles '/health' requests.
|
|
||||||
func NewHealthHandler(hfunc func(excludedAlarms AlarmSet, serializable bool) Health) http.HandlerFunc {
|
|
||||||
return func(w http.ResponseWriter, r *http.Request) {
|
|
||||||
if r.Method != http.MethodGet {
|
|
||||||
w.Header().Set("Allow", http.MethodGet)
|
|
||||||
http.Error(w, "Method Not Allowed", http.StatusMethodNotAllowed)
|
|
||||||
plog.Warningf("/health error (status code %d)", http.StatusMethodNotAllowed)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
excludedAlarms := getExcludedAlarms(r)
|
|
||||||
// Passing the query parameter "serializable=true" ensures that the
|
|
||||||
// health of the local etcd is checked vs the health of the cluster.
|
|
||||||
// This is useful for probes attempting to validate the liveness of
|
|
||||||
// the etcd process vs readiness of the cluster to serve requests.
|
|
||||||
serializableFlag := getSerializableFlag(r)
|
|
||||||
h := hfunc(excludedAlarms, serializableFlag)
|
|
||||||
defer func() {
|
|
||||||
if h.Health == "true" {
|
|
||||||
healthSuccess.Inc()
|
|
||||||
} else {
|
|
||||||
healthFailed.Inc()
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
d, _ := json.Marshal(h)
|
|
||||||
if h.Health != "true" {
|
|
||||||
http.Error(w, string(d), http.StatusServiceUnavailable)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
w.WriteHeader(http.StatusOK)
|
|
||||||
w.Write(d)
|
|
||||||
plog.Debugf("/health OK (status code %d)", http.StatusOK)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var (
|
|
||||||
healthSuccess = prometheus.NewCounter(prometheus.CounterOpts{
|
|
||||||
Namespace: "etcd",
|
|
||||||
Subsystem: "server",
|
|
||||||
Name: "health_success",
|
|
||||||
Help: "The total number of successful health checks",
|
|
||||||
})
|
|
||||||
healthFailed = prometheus.NewCounter(prometheus.CounterOpts{
|
|
||||||
Namespace: "etcd",
|
|
||||||
Subsystem: "server",
|
|
||||||
Name: "health_failures",
|
|
||||||
Help: "The total number of failed health checks",
|
|
||||||
})
|
|
||||||
)
|
|
||||||
|
|
||||||
func init() {
|
|
||||||
prometheus.MustRegister(healthSuccess)
|
|
||||||
prometheus.MustRegister(healthFailed)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Health defines etcd server health status.
|
|
||||||
// TODO: remove manual parsing in etcdctl cluster-health
|
|
||||||
type Health struct {
|
|
||||||
Health string `json:"health"`
|
|
||||||
Reason string `json:"-"`
|
|
||||||
}
|
|
||||||
|
|
||||||
type AlarmSet map[string]struct{}
|
|
||||||
|
|
||||||
func getExcludedAlarms(r *http.Request) (alarms AlarmSet) {
|
|
||||||
alarms = make(map[string]struct{}, 2)
|
|
||||||
alms, found := r.URL.Query()["exclude"]
|
|
||||||
if found {
|
|
||||||
for _, alm := range alms {
|
|
||||||
if len(alms) == 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
alarms[alm] = struct{}{}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return alarms
|
|
||||||
}
|
|
||||||
|
|
||||||
func getSerializableFlag(r *http.Request) bool {
|
|
||||||
return r.URL.Query().Get("serializable") == "true"
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: etcdserver.ErrNoLeader in health API
|
|
||||||
|
|
||||||
func checkAlarms(srv serverHealthV2V3, excludedAlarms AlarmSet) Health {
|
|
||||||
h := Health{Health: "true"}
|
|
||||||
as := srv.Alarms()
|
|
||||||
if len(as) > 0 {
|
|
||||||
for _, v := range as {
|
|
||||||
alarmName := v.Alarm.String()
|
|
||||||
if _, found := excludedAlarms[alarmName]; found {
|
|
||||||
plog.Debugf("/health excluded alarm %s", v.String())
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
h.Health = "false"
|
|
||||||
switch v.Alarm {
|
|
||||||
case pb.AlarmType_NOSPACE:
|
|
||||||
h.Reason = "ALARM NOSPACE"
|
|
||||||
case pb.AlarmType_CORRUPT:
|
|
||||||
h.Reason = "ALARM CORRUPT"
|
|
||||||
default:
|
|
||||||
h.Reason = "ALARM UNKNOWN"
|
|
||||||
}
|
|
||||||
plog.Warningf("/health error due to %s", v.String())
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
|
|
||||||
func checkLeader(srv serverHealthV2V3, serializable bool) Health {
|
|
||||||
h := Health{Health: "true"}
|
|
||||||
if !serializable && (uint64(srv.Leader()) == raft.None) {
|
|
||||||
h.Health = "false"
|
|
||||||
h.Reason = "RAFT NO LEADER"
|
|
||||||
plog.Warningf("/health error; no leader (status code %d)", http.StatusServiceUnavailable)
|
|
||||||
}
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
|
|
||||||
func checkV2API(srv etcdserver.ServerV2) Health {
|
|
||||||
h := Health{Health: "true"}
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
|
|
||||||
_, err := srv.Do(ctx, pb.Request{Method: "QGET"})
|
|
||||||
cancel()
|
|
||||||
if err != nil {
|
|
||||||
h.Health = "false"
|
|
||||||
h.Reason = fmt.Sprintf("QGET ERROR:%s", err)
|
|
||||||
plog.Warningf("/health error; QGET failed %v (status code %d)", err, http.StatusServiceUnavailable)
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
|
|
||||||
func checkAPI(srv ServerHealth, serializable bool) Health {
|
|
||||||
h := Health{Health: "true"}
|
|
||||||
cfg := srv.Config()
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), cfg.ReqTimeout())
|
|
||||||
_, err := srv.Range(ctx, &pb.RangeRequest{KeysOnly: true, Limit: 1, Serializable: serializable})
|
|
||||||
cancel()
|
|
||||||
if err != nil && err != auth.ErrUserEmpty && err != auth.ErrPermissionDenied {
|
|
||||||
h.Health = "false"
|
|
||||||
h.Reason = fmt.Sprintf("RANGE ERROR:%s", err)
|
|
||||||
plog.Warningf("serving /health false; Range failed %v (status code %d)", err, http.StatusServiceUnavailable)
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
return h
|
|
||||||
}
|
|
||||||
|
@ -55,7 +55,8 @@ const (
|
|||||||
func NewClientHandler(lg *zap.Logger, server etcdserver.ServerPeer, timeout time.Duration) http.Handler {
|
func NewClientHandler(lg *zap.Logger, server etcdserver.ServerPeer, timeout time.Duration) http.Handler {
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
etcdhttp.HandleBasic(mux, server)
|
etcdhttp.HandleBasic(mux, server)
|
||||||
etcdhttp.HandleMetricsHealthForV2(mux, server)
|
etcdhttp.HandleMetrics(mux)
|
||||||
|
etcdhttp.HandleHealthForV2(mux, server)
|
||||||
handleV2(lg, mux, server, timeout)
|
handleV2(lg, mux, server, timeout)
|
||||||
return requestLogger(lg, mux)
|
return requestLogger(lg, mux)
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user