From b8547734ae50c43adde62e6dff7f4d2c47604942 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 11:53:40 -0700 Subject: [PATCH] etcdserver: add "etcd_server_heartbeat_failures_total" {"level":"warn","ts":1527101858.4149103,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.025771662} {"level":"warn","ts":1527101858.4149644,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.034015766} Signed-off-by: Gyuho Lee --- etcdserver/metrics.go | 7 +++++++ etcdserver/raft.go | 1 + 2 files changed, 8 insertions(+) diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go index 88ff158b7..7d57b8115 100644 --- a/etcdserver/metrics.go +++ b/etcdserver/metrics.go @@ -41,6 +41,12 @@ var ( Name: "leader_changes_seen_total", Help: "The number of leader changes seen.", }) + heartbeatFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "heartbeat_failures_total", + Help: "The total number of heartbeat send failures (likely overloaded from slow disk).", + }) proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "etcd", Subsystem: "server", @@ -90,6 +96,7 @@ func init() { prometheus.MustRegister(hasLeader) prometheus.MustRegister(isLeader) prometheus.MustRegister(leaderChanges) + prometheus.MustRegister(heartbeatFailures) prometheus.MustRegister(proposalsCommitted) prometheus.MustRegister(proposalsApplied) prometheus.MustRegister(proposalsPending) diff --git a/etcdserver/raft.go b/etcdserver/raft.go index 45733512f..e69dfc504 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -293,6 +293,7 @@ func (r *raftNode) sendMessages(ms []raftpb.Message) { // TODO: limit request rate. plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v)", r.heartbeat, exceed) plog.Warningf("server is likely overloaded") + heartbeatFailures.Inc() } } }