From 2f0e3fd2df1057770395f8e8e9e18996e4bc70c4 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 11:53:40 -0700 Subject: [PATCH] etcdserver: add "etcd_server_heartbeat_failures_total" {"level":"warn","ts":1527101858.4149103,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.025771662} {"level":"warn","ts":1527101858.4149644,"caller":"etcdserver/raft.go:370","msg":"failed to send out heartbeat; took too long, server is overloaded likely from slow disk","heartbeat-interval":0.1,"expected-duration":0.2,"exceeded-duration":0.034015766} Signed-off-by: Gyuho Lee --- etcdserver/metrics.go | 7 +++++++ etcdserver/raft.go | 1 + 2 files changed, 8 insertions(+) diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go index 42a27166a..6c228b721 100644 --- a/etcdserver/metrics.go +++ b/etcdserver/metrics.go @@ -41,6 +41,12 @@ var ( Name: "leader_changes_seen_total", Help: "The number of leader changes seen.", }) + heartbeatFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "server", + Name: "heartbeat_failures_total", + Help: "The total number of heartbeat send failures (likely overloaded from slow disk).", + }) proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "etcd", Subsystem: "server", @@ -96,6 +102,7 @@ func init() { prometheus.MustRegister(hasLeader) prometheus.MustRegister(isLeader) prometheus.MustRegister(leaderChanges) + prometheus.MustRegister(heartbeatFailures) prometheus.MustRegister(proposalsCommitted) prometheus.MustRegister(proposalsApplied) prometheus.MustRegister(proposalsPending) diff --git a/etcdserver/raft.go b/etcdserver/raft.go index 6c67275de..c3d93f83a 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -346,6 +346,7 @@ func (r *raftNode) processMessages(ms []raftpb.Message) []raftpb.Message { // TODO: limit request rate. plog.Warningf("failed to send out heartbeat on time (exceeded the %v timeout for %v)", r.heartbeat, exceed) plog.Warningf("server is likely overloaded") + heartbeatFailures.Inc() } } }