etcdserver: add "etcd_server_heartbeat_failures_total"

{"level":"warn","ts":1527101858.4149103,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.025771662}
{"level":"warn","ts":1527101858.4149644,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.034015766}

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
This commit is contained in:
Gyuho Lee 2018-05-23 11:53:40 -07:00 committed by Wenjia Zhang
parent 78a13e67a0
commit b8547734ae
2 changed files with 8 additions and 0 deletions

View File

@ -41,6 +41,12 @@ var (
Name: "leader_changes_seen_total",
Help: "The number of leader changes seen.",
})
heartbeatFailures = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "heartbeat_failures_total",
Help: "The total number of heartbeat send failures (likely overloaded from slow disk).",
})
proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "server",
@ -90,6 +96,7 @@ func init() {
prometheus.MustRegister(hasLeader)
prometheus.MustRegister(isLeader)
prometheus.MustRegister(leaderChanges)
prometheus.MustRegister(heartbeatFailures)
prometheus.MustRegister(proposalsCommitted)
prometheus.MustRegister(proposalsApplied)
prometheus.MustRegister(proposalsPending)

View File

@ -293,6 +293,7 @@ func (r *raftNode) sendMessages(ms []raftpb.Message) {
// TODO: limit request rate.
plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v)", r.heartbeat, exceed)
plog.Warningf("server is likely overloaded")
heartbeatFailures.Inc()
}
}
}