mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
*: add leader changes to metrics
This commit is contained in:
parent
879cfe7666
commit
76d073a2b5
@ -12,7 +12,19 @@ The naming of metrics follows the suggested [Prometheus best practices][promethe
|
||||
|
||||
The metrics under the `etcd` prefix are for monitoring and alerting. They are stable high level metrics. If there is any change of these metrics, it will be included in release notes.
|
||||
|
||||
Metrics that are etcd2 related are documented [here][v2-http-metrics].
|
||||
Metrics that are etcd2 related are documented in the [v2 metrics guide][v2-http-metrics].
|
||||
|
||||
### server
|
||||
|
||||
These metrics describe the status of the etcd server. In order to detect outages or problems for troubleshooting, the server metrics of every production etcd cluster should be closely monitored.
|
||||
|
||||
All these metrics are prefixed with `etcd_server_`.
|
||||
|
||||
| Name | Description | Type |
|
||||
|---------------------------|-----------------------------------|---------|
|
||||
| leader_changes_seen_total | The number of leader changes seen | Counter |
|
||||
|
||||
`leader_changes_seen_total` counts the number of leader changes the member has seen since its start. Rapid leadership changes impact the performance of etcd significantly. They also signal that the leader is unstable, perhaps due to network connectivity issues or excessive load hitting the etcd cluster.
|
||||
|
||||
### gRPC requests
|
||||
|
||||
@ -39,7 +51,7 @@ Example Prometheus queries that may be useful from these metrics (across all etc
|
||||
|
||||
* `histogram_quantile(0.9, sum(rate(etcd_grpc_unary_requests_duration_seconds{job="etcd",grpc_method="PUT"}[5m]) ) by (le))`
|
||||
|
||||
Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`.
|
||||
Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`.
|
||||
|
||||
## etcd_debugging namespace metrics
|
||||
|
||||
|
@ -44,12 +44,19 @@ var (
|
||||
Name: "proposals_failed_total",
|
||||
Help: "The total number of failed proposals.",
|
||||
})
|
||||
leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: "etcd",
|
||||
Subsystem: "server",
|
||||
Name: "leader_changes_seen_total",
|
||||
Help: "The number of leader changes seen",
|
||||
})
|
||||
)
|
||||
|
||||
func init() {
|
||||
prometheus.MustRegister(proposeDurations)
|
||||
prometheus.MustRegister(proposePending)
|
||||
prometheus.MustRegister(proposeFailed)
|
||||
prometheus.MustRegister(leaderChanges)
|
||||
}
|
||||
|
||||
func monitorFileDescriptor(done <-chan struct{}) {
|
||||
|
@ -157,6 +157,7 @@ func (r *raftNode) start(s *EtcdServer) {
|
||||
r.mu.Lock()
|
||||
r.lt = time.Now()
|
||||
r.mu.Unlock()
|
||||
leaderChanges.Inc()
|
||||
}
|
||||
atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
|
||||
if rd.RaftState == raft.StateLeader {
|
||||
|
Loading…
x
Reference in New Issue
Block a user