mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
etcdserver: add metric counters for livez/readyz health checks.
Signed-off-by: Siyuan Zhang <sizhang@google.com>
This commit is contained in:
parent
b343231b12
commit
3897103b77
@ -12,6 +12,9 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// This file defines the http endpoints for etcd health checks.
|
||||
// The endpoints include /livez, /readyz and /health.
|
||||
|
||||
package etcdhttp
|
||||
|
||||
import (
|
||||
@ -34,8 +37,13 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
PathHealth = "/health"
|
||||
PathProxyHealth = "/proxy/health"
|
||||
PathHealth = "/health"
|
||||
PathProxyHealth = "/proxy/health"
|
||||
HealthStatusSuccess string = "success"
|
||||
HealthStatusError string = "error"
|
||||
checkTypeLivez = "livez"
|
||||
checkTypeReadyz = "readyz"
|
||||
checkTypeHealth = "health"
|
||||
)
|
||||
|
||||
type ServerHealth interface {
|
||||
@ -111,11 +119,29 @@ var (
|
||||
Name: "health_failures",
|
||||
Help: "The total number of failed health checks",
|
||||
})
|
||||
healthCheckGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "etcd",
|
||||
Subsystem: "server",
|
||||
Name: "healthcheck",
|
||||
Help: "The result of each kind of healthcheck.",
|
||||
},
|
||||
[]string{"type", "name"},
|
||||
)
|
||||
healthCheckCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "etcd",
|
||||
Subsystem: "server",
|
||||
Name: "healthchecks_total",
|
||||
Help: "The total number of each kind of healthcheck.",
|
||||
},
|
||||
[]string{"type", "name", "status"},
|
||||
)
|
||||
)
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(healthSuccess)
|
||||
prometheus.MustRegister(healthFailed)
|
||||
prometheus.MustRegister(healthCheckGauge)
|
||||
prometheus.MustRegister(healthCheckCounter)
|
||||
}
|
||||
|
||||
// Health defines etcd server health status.
|
||||
@ -125,6 +151,12 @@ type Health struct {
|
||||
Reason string `json:"reason"`
|
||||
}
|
||||
|
||||
// HealthStatus is used in new /readyz or /livez health checks instead of the Health struct.
|
||||
type HealthStatus struct {
|
||||
Reason string `json:"reason"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
func getQuerySet(r *http.Request, query string) StringSet {
|
||||
querySet := make(map[string]struct{})
|
||||
qs, found := r.URL.Query()[query]
|
||||
@ -201,18 +233,18 @@ func checkAPI(ctx context.Context, lg *zap.Logger, srv ServerHealth, serializabl
|
||||
type HealthCheck func(ctx context.Context) error
|
||||
|
||||
type CheckRegistry struct {
|
||||
path string
|
||||
checks map[string]HealthCheck
|
||||
checkType string
|
||||
checks map[string]HealthCheck
|
||||
}
|
||||
|
||||
func installLivezEndpoints(lg *zap.Logger, mux *http.ServeMux, server ServerHealth) {
|
||||
reg := CheckRegistry{path: "/livez", checks: make(map[string]HealthCheck)}
|
||||
reg := CheckRegistry{checkType: checkTypeLivez, checks: make(map[string]HealthCheck)}
|
||||
reg.Register("serializable_read", serializableReadCheck(server))
|
||||
reg.InstallHttpEndpoints(lg, mux)
|
||||
}
|
||||
|
||||
func installReadyzEndpoints(lg *zap.Logger, mux *http.ServeMux, server ServerHealth) {
|
||||
reg := CheckRegistry{path: "/readyz", checks: make(map[string]HealthCheck)}
|
||||
reg := CheckRegistry{checkType: checkTypeReadyz, checks: make(map[string]HealthCheck)}
|
||||
reg.Register("data_corruption", activeAlarmCheck(server, pb.AlarmType_CORRUPT))
|
||||
reg.Register("serializable_read", serializableReadCheck(server))
|
||||
reg.InstallHttpEndpoints(lg, mux)
|
||||
@ -222,6 +254,10 @@ func (reg *CheckRegistry) Register(name string, check HealthCheck) {
|
||||
reg.checks[name] = check
|
||||
}
|
||||
|
||||
func (reg *CheckRegistry) RootPath() string {
|
||||
return "/" + reg.checkType
|
||||
}
|
||||
|
||||
func (reg *CheckRegistry) InstallHttpEndpoints(lg *zap.Logger, mux *http.ServeMux) {
|
||||
checkNames := make([]string, 0, len(reg.checks))
|
||||
for k := range reg.checks {
|
||||
@ -229,19 +265,19 @@ func (reg *CheckRegistry) InstallHttpEndpoints(lg *zap.Logger, mux *http.ServeMu
|
||||
}
|
||||
|
||||
// installs the http handler for the root path.
|
||||
reg.installRootHttpEndpoint(lg, mux, reg.path, checkNames...)
|
||||
reg.installRootHttpEndpoint(lg, mux, checkNames...)
|
||||
for _, checkName := range checkNames {
|
||||
// installs the http handler for the individual check sub path.
|
||||
subpath := path.Join(reg.path, checkName)
|
||||
subpath := path.Join(reg.RootPath(), checkName)
|
||||
check := checkName
|
||||
mux.Handle(subpath, newHealthHandler(subpath, lg, func(r *http.Request) Health {
|
||||
mux.Handle(subpath, newHealthHandler(subpath, lg, func(r *http.Request) HealthStatus {
|
||||
return reg.runHealthChecks(r.Context(), check)
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...string) Health {
|
||||
h := Health{Health: "true"}
|
||||
func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...string) HealthStatus {
|
||||
h := HealthStatus{Status: HealthStatusSuccess}
|
||||
var individualCheckOutput bytes.Buffer
|
||||
for _, checkName := range checkNames {
|
||||
check, found := reg.checks[checkName]
|
||||
@ -250,9 +286,11 @@ func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...str
|
||||
}
|
||||
if err := check(ctx); err != nil {
|
||||
fmt.Fprintf(&individualCheckOutput, "[-]%s failed: %v\n", checkName, err)
|
||||
h.Health = "false"
|
||||
h.Status = HealthStatusError
|
||||
recordMetrics(reg.checkType, checkName, HealthStatusError)
|
||||
} else {
|
||||
fmt.Fprintf(&individualCheckOutput, "[+]%s ok\n", checkName)
|
||||
recordMetrics(reg.checkType, checkName, HealthStatusSuccess)
|
||||
}
|
||||
}
|
||||
h.Reason = individualCheckOutput.String()
|
||||
@ -260,19 +298,20 @@ func (reg *CheckRegistry) runHealthChecks(ctx context.Context, checkNames ...str
|
||||
}
|
||||
|
||||
// installRootHttpEndpoint installs the http handler for the root path.
|
||||
func (reg *CheckRegistry) installRootHttpEndpoint(lg *zap.Logger, mux *http.ServeMux, path string, checks ...string) {
|
||||
hfunc := func(r *http.Request) Health {
|
||||
func (reg *CheckRegistry) installRootHttpEndpoint(lg *zap.Logger, mux *http.ServeMux, checks ...string) {
|
||||
hfunc := func(r *http.Request) HealthStatus {
|
||||
// extracts the health check names to be excludeList from the query param
|
||||
excluded := getQuerySet(r, "exclude")
|
||||
|
||||
filteredCheckNames := filterCheckList(lg, listToStringSet(checks), excluded)
|
||||
return reg.runHealthChecks(r.Context(), filteredCheckNames...)
|
||||
h := reg.runHealthChecks(r.Context(), filteredCheckNames...)
|
||||
return h
|
||||
}
|
||||
mux.Handle(path, newHealthHandler(path, lg, hfunc))
|
||||
mux.Handle(reg.RootPath(), newHealthHandler(reg.RootPath(), lg, hfunc))
|
||||
}
|
||||
|
||||
// newHealthHandler generates a http HandlerFunc for a health check function hfunc.
|
||||
func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) Health) http.HandlerFunc {
|
||||
func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) HealthStatus) http.HandlerFunc {
|
||||
return func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
w.Header().Set("Allow", http.MethodGet)
|
||||
@ -282,7 +321,7 @@ func newHealthHandler(path string, lg *zap.Logger, hfunc func(*http.Request) Hea
|
||||
}
|
||||
h := hfunc(r)
|
||||
// Always returns detailed reason for failed checks.
|
||||
if h.Health != "true" {
|
||||
if h.Status == HealthStatusError {
|
||||
http.Error(w, h.Reason, http.StatusServiceUnavailable)
|
||||
lg.Error("Health check error", zap.String("path", path), zap.String("reason", h.Reason), zap.Int("status-code", http.StatusServiceUnavailable))
|
||||
return
|
||||
@ -342,6 +381,22 @@ func listToStringSet(list []string) StringSet {
|
||||
return set
|
||||
}
|
||||
|
||||
func recordMetrics(checkType, name string, status string) {
|
||||
val := 0.0
|
||||
if status == HealthStatusSuccess {
|
||||
val = 1.0
|
||||
}
|
||||
healthCheckGauge.With(prometheus.Labels{
|
||||
"type": checkType,
|
||||
"name": name,
|
||||
}).Set(val)
|
||||
healthCheckCounter.With(prometheus.Labels{
|
||||
"type": checkType,
|
||||
"name": name,
|
||||
"status": status,
|
||||
}).Inc()
|
||||
}
|
||||
|
||||
// activeAlarmCheck checks if a specific alarm type is active in the server.
|
||||
func activeAlarmCheck(srv ServerHealth, at pb.AlarmType) func(context.Context) error {
|
||||
return func(ctx context.Context) error {
|
||||
|
@ -23,6 +23,7 @@ import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"go.uber.org/zap/zaptest"
|
||||
|
||||
"go.etcd.io/raft/v3"
|
||||
@ -193,6 +194,7 @@ func TestHttpSubPath(t *testing.T) {
|
||||
ts := httptest.NewServer(mux)
|
||||
defer ts.Close()
|
||||
checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult)
|
||||
checkMetrics(t, tt.healthCheckURL, "", tt.expectStatusCode)
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -291,6 +293,7 @@ func TestSerializableReadCheck(t *testing.T) {
|
||||
ts := httptest.NewServer(mux)
|
||||
defer ts.Close()
|
||||
checkHttpResponse(t, ts, tt.healthCheckURL, tt.expectStatusCode, tt.inResult, tt.notInResult)
|
||||
checkMetrics(t, tt.healthCheckURL, "serializable_read", tt.expectStatusCode)
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -323,3 +326,54 @@ func checkHttpResponse(t *testing.T, ts *httptest.Server, url string, expectStat
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func checkMetrics(t *testing.T, url, checkName string, expectStatusCode int) {
|
||||
defer healthCheckGauge.Reset()
|
||||
defer healthCheckCounter.Reset()
|
||||
|
||||
typeName := strings.TrimPrefix(strings.Split(url, "?")[0], "/")
|
||||
if len(checkName) == 0 {
|
||||
checkName = strings.Split(typeName, "/")[1]
|
||||
typeName = strings.Split(typeName, "/")[0]
|
||||
}
|
||||
|
||||
expectedSuccessCount := 1
|
||||
expectedErrorCount := 0
|
||||
if expectStatusCode != http.StatusOK {
|
||||
expectedSuccessCount = 0
|
||||
expectedErrorCount = 1
|
||||
}
|
||||
|
||||
gather, _ := prometheus.DefaultGatherer.Gather()
|
||||
for _, mf := range gather {
|
||||
name := *mf.Name
|
||||
val := 0
|
||||
switch name {
|
||||
case "etcd_server_healthcheck":
|
||||
val = int(mf.GetMetric()[0].GetGauge().GetValue())
|
||||
case "etcd_server_healthcheck_total":
|
||||
val = int(mf.GetMetric()[0].GetCounter().GetValue())
|
||||
default:
|
||||
continue
|
||||
}
|
||||
labelMap := make(map[string]string)
|
||||
for _, label := range mf.GetMetric()[0].Label {
|
||||
labelMap[label.GetName()] = label.GetValue()
|
||||
}
|
||||
if typeName != labelMap["type"] {
|
||||
continue
|
||||
}
|
||||
if labelMap["name"] != checkName {
|
||||
continue
|
||||
}
|
||||
if statusLabel, found := labelMap["status"]; found && statusLabel == HealthStatusError {
|
||||
if val != expectedErrorCount {
|
||||
t.Fatalf("%s got errorCount %d, wanted %d\n", name, val, expectedErrorCount)
|
||||
}
|
||||
} else {
|
||||
if val != expectedSuccessCount {
|
||||
t.Fatalf("%s got expectedSuccessCount %d, wanted %d\n", name, val, expectedSuccessCount)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user