From 46bddacacbbe7aa21fb57be950488a83c3f7e159 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Thu, 8 Aug 2019 12:39:27 -0700 Subject: [PATCH] etcdserver/api: add "etcd_network_snapshot_send_inflights_total", "etcd_network_snapshot_receive_inflights_total" These gauges are useful for deciding when to terminate an unhealthy follower: if the follower is still receiving a snapshot from the leader, the operator may prefer to wait. Signed-off-by: Gyuho Lee --- etcdserver/api/rafthttp/http.go | 5 +++++ etcdserver/api/rafthttp/metrics.go | 20 ++++++++++++++++++++ etcdserver/api/rafthttp/snapshot_sender.go | 6 +++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/etcdserver/api/rafthttp/http.go b/etcdserver/api/rafthttp/http.go index 18e9c53f2..d0e0c81e2 100644 --- a/etcdserver/api/rafthttp/http.go +++ b/etcdserver/api/rafthttp/http.go @@ -258,6 +258,11 @@ func (h *snapshotHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { return } + snapshotReceiveInflights.WithLabelValues(from).Inc() + defer func() { + snapshotReceiveInflights.WithLabelValues(from).Dec() + }() + if h.lg != nil { h.lg.Info( "receiving database snapshot", diff --git a/etcdserver/api/rafthttp/metrics.go b/etcdserver/api/rafthttp/metrics.go index ce51248d8..02fff84be 100644 --- a/etcdserver/api/rafthttp/metrics.go +++ b/etcdserver/api/rafthttp/metrics.go @@ -80,6 +80,15 @@ var ( []string{"To"}, ) + snapshotSendInflights = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "etcd", + Subsystem: "network", + Name: "snapshot_send_inflights_total", + Help: "Total number of inflight snapshot sends", + }, + []string{"To"}, + ) + snapshotSendFailures = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "etcd", Subsystem: "network", @@ -111,6 +120,15 @@ var ( []string{"From"}, ) + snapshotReceiveInflights = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "etcd", + Subsystem: "network", + Name: "snapshot_receive_inflights_total", + Help: "Total number of inflight snapshot receives", + }, + []string{"From"}, + ) + 
snapshotReceiveFailures = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "etcd", Subsystem: "network", @@ -156,9 +174,11 @@ func init() { prometheus.MustRegister(recvFailures) prometheus.MustRegister(snapshotSend) + prometheus.MustRegister(snapshotSendInflights) prometheus.MustRegister(snapshotSendFailures) prometheus.MustRegister(snapshotSendSeconds) prometheus.MustRegister(snapshotReceive) + prometheus.MustRegister(snapshotReceiveInflights) prometheus.MustRegister(snapshotReceiveFailures) prometheus.MustRegister(snapshotReceiveSeconds) diff --git a/etcdserver/api/rafthttp/snapshot_sender.go b/etcdserver/api/rafthttp/snapshot_sender.go index 85abaeaa4..62efb0cdc 100644 --- a/etcdserver/api/rafthttp/snapshot_sender.go +++ b/etcdserver/api/rafthttp/snapshot_sender.go @@ -90,6 +90,11 @@ func (s *snapshotSender) send(merged snap.Message) { plog.Infof("start to send database snapshot [index: %d, to %s]...", m.Snapshot.Metadata.Index, types.ID(m.To)) } + snapshotSendInflights.WithLabelValues(to).Inc() + defer func() { + snapshotSendInflights.WithLabelValues(to).Dec() + }() + err := s.post(req) defer merged.CloseWithError(err) if err != nil { @@ -139,7 +144,6 @@ func (s *snapshotSender) send(merged snap.Message) { } sentBytes.WithLabelValues(to).Add(float64(merged.TotalSize)) - snapshotSend.WithLabelValues(to).Inc() snapshotSendSeconds.WithLabelValues(to).Observe(time.Since(start).Seconds()) }