From 76d073a2b5d3ad1f079a6b48077516022f23fd29 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Thu, 5 May 2016 13:20:28 -0700 Subject: [PATCH] *: add leader changes to metrics --- Documentation/metrics.md | 16 ++++++++++++++-- etcdserver/metrics.go | 7 +++++++ etcdserver/raft.go | 1 + 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/Documentation/metrics.md b/Documentation/metrics.md index af18b7c4d..8c0c19f00 100644 --- a/Documentation/metrics.md +++ b/Documentation/metrics.md @@ -12,7 +12,19 @@ The naming of metrics follows the suggested [Prometheus best practices][promethe The metrics under the `etcd` prefix are for monitoring and alerting. They are stable high level metrics. If there is any change of these metrics, it will be included in release notes. -Metrics that are etcd2 related are documented [here][v2-http-metrics]. +Metrics that are etcd2 related are documented in the [v2 metrics guide][v2-http-metrics]. + +### server + +These metrics describe the status of the etcd server. In order to detect outages or problems for troubleshooting, the server metrics of every production etcd cluster should be closely monitored. + +All these metrics are prefixed with `etcd_server_`. + +| Name | Description | Type | +|---------------------------|-----------------------------------|---------| +| leader_changes_seen_total | The number of leader changes seen | Counter | + +`leader_changes_seen_total` counts the number of leader changes the member has seen since its start. Rapid leadership changes impact the performance of etcd significantly. They also signal that the leader is unstable, perhaps due to network connectivity issues or excessive load hitting the etcd cluster. 
### gRPC requests @@ -39,7 +51,7 @@ Example Prometheus queries that may be useful from these metrics (across all etc * `histogram_quantile(0.9, sum(rate(etcd_grpc_unary_requests_duration_seconds{job="etcd",grpc_method="PUT"}[5m]) ) by (le))` - Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`. + Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`. ## etcd_debugging namespace metrics diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go index 6e48ed63d..f807d127e 100644 --- a/etcdserver/metrics.go +++ b/etcdserver/metrics.go @@ -44,12 +44,19 @@ var ( Name: "proposals_failed_total", Help: "The total number of failed proposals.", }) + leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "leader_changes_seen_total", + Help: "The number of leader changes seen", + }) ) func init() { prometheus.MustRegister(proposeDurations) prometheus.MustRegister(proposePending) prometheus.MustRegister(proposeFailed) + prometheus.MustRegister(leaderChanges) } func monitorFileDescriptor(done <-chan struct{}) { diff --git a/etcdserver/raft.go b/etcdserver/raft.go index 11b80dc5a..c41470e11 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -157,6 +157,7 @@ func (r *raftNode) start(s *EtcdServer) { r.mu.Lock() r.lt = time.Now() r.mu.Unlock() + leaderChanges.Inc() } atomic.StoreUint64(&r.lead, rd.SoftState.Lead) if rd.RaftState == raft.StateLeader {