diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go index 88ff158b7..7d57b8115 100644 --- a/etcdserver/metrics.go +++ b/etcdserver/metrics.go @@ -41,6 +41,12 @@ var ( Name: "leader_changes_seen_total", Help: "The number of leader changes seen.", }) + heartbeatFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "heartbeat_failures_total", + Help: "The total number of heartbeat send failures (likely overloaded from slow disk).", + }) proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "etcd", Subsystem: "server", @@ -90,6 +96,7 @@ func init() { prometheus.MustRegister(hasLeader) prometheus.MustRegister(isLeader) prometheus.MustRegister(leaderChanges) + prometheus.MustRegister(heartbeatFailures) prometheus.MustRegister(proposalsCommitted) prometheus.MustRegister(proposalsApplied) prometheus.MustRegister(proposalsPending) diff --git a/etcdserver/raft.go b/etcdserver/raft.go index 45733512f..e69dfc504 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -293,6 +293,7 @@ func (r *raftNode) sendMessages(ms []raftpb.Message) { // TODO: limit request rate. plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v)", r.heartbeat, exceed) plog.Warningf("server is likely overloaded") + heartbeatFailures.Inc() } } }