Merge pull request #11009 from gyuho/snapshot

*: add inflight snapshot metrics
This commit is contained in:
Gyuho Lee 2019-08-08 13:56:14 -07:00 committed by GitHub
commit 046c705f97
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 58 additions and 2 deletions

View File

@ -258,6 +258,11 @@ func (h *snapshotHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return return
} }
snapshotReceiveInflights.WithLabelValues(from).Inc()
defer func() {
snapshotReceiveInflights.WithLabelValues(from).Dec()
}()
if h.lg != nil { if h.lg != nil {
h.lg.Info( h.lg.Info(
"receiving database snapshot", "receiving database snapshot",

View File

@ -80,6 +80,15 @@ var (
[]string{"To"}, []string{"To"},
) )
snapshotSendInflights = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "network",
Name: "snapshot_send_inflights_total",
Help: "Total number of inflight snapshot sends",
},
[]string{"To"},
)
snapshotSendFailures = prometheus.NewCounterVec(prometheus.CounterOpts{ snapshotSendFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "etcd", Namespace: "etcd",
Subsystem: "network", Subsystem: "network",
@ -111,6 +120,15 @@ var (
[]string{"From"}, []string{"From"},
) )
snapshotReceiveInflights = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "network",
Name: "snapshot_receive_inflights_total",
Help: "Total number of inflight snapshot receives",
},
[]string{"From"},
)
snapshotReceiveFailures = prometheus.NewCounterVec(prometheus.CounterOpts{ snapshotReceiveFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "etcd", Namespace: "etcd",
Subsystem: "network", Subsystem: "network",
@ -156,9 +174,11 @@ func init() {
prometheus.MustRegister(recvFailures) prometheus.MustRegister(recvFailures)
prometheus.MustRegister(snapshotSend) prometheus.MustRegister(snapshotSend)
prometheus.MustRegister(snapshotSendInflights)
prometheus.MustRegister(snapshotSendFailures) prometheus.MustRegister(snapshotSendFailures)
prometheus.MustRegister(snapshotSendSeconds) prometheus.MustRegister(snapshotSendSeconds)
prometheus.MustRegister(snapshotReceive) prometheus.MustRegister(snapshotReceive)
prometheus.MustRegister(snapshotReceiveInflights)
prometheus.MustRegister(snapshotReceiveFailures) prometheus.MustRegister(snapshotReceiveFailures)
prometheus.MustRegister(snapshotReceiveSeconds) prometheus.MustRegister(snapshotReceiveSeconds)

View File

@ -90,6 +90,11 @@ func (s *snapshotSender) send(merged snap.Message) {
plog.Infof("start to send database snapshot [index: %d, to %s]...", m.Snapshot.Metadata.Index, types.ID(m.To)) plog.Infof("start to send database snapshot [index: %d, to %s]...", m.Snapshot.Metadata.Index, types.ID(m.To))
} }
snapshotSendInflights.WithLabelValues(to).Inc()
defer func() {
snapshotSendInflights.WithLabelValues(to).Dec()
}()
err := s.post(req) err := s.post(req)
defer merged.CloseWithError(err) defer merged.CloseWithError(err)
if err != nil { if err != nil {
@ -139,7 +144,6 @@ func (s *snapshotSender) send(merged snap.Message) {
} }
sentBytes.WithLabelValues(to).Add(float64(merged.TotalSize)) sentBytes.WithLabelValues(to).Add(float64(merged.TotalSize))
snapshotSend.WithLabelValues(to).Inc() snapshotSend.WithLabelValues(to).Inc()
snapshotSendSeconds.WithLabelValues(to).Observe(time.Since(start).Seconds()) snapshotSendSeconds.WithLabelValues(to).Observe(time.Since(start).Seconds())
} }

View File

@ -76,6 +76,12 @@ var (
Name: "slow_apply_total", Name: "slow_apply_total",
Help: "The total number of slow apply requests (likely overloaded from slow disk).", Help: "The total number of slow apply requests (likely overloaded from slow disk).",
}) })
applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "server",
Name: "snapshot_apply_in_progress_total",
Help: "1 if the server is applying the incoming snapshot. 0 if none.",
})
proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{ proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "etcd", Namespace: "etcd",
Subsystem: "server", Subsystem: "server",
@ -153,6 +159,7 @@ func init() {
prometheus.MustRegister(leaderChanges) prometheus.MustRegister(leaderChanges)
prometheus.MustRegister(heartbeatSendFailures) prometheus.MustRegister(heartbeatSendFailures)
prometheus.MustRegister(slowApplies) prometheus.MustRegister(slowApplies)
prometheus.MustRegister(applySnapshotInProgress)
prometheus.MustRegister(proposalsCommitted) prometheus.MustRegister(proposalsCommitted)
prometheus.MustRegister(proposalsApplied) prometheus.MustRegister(proposalsApplied)
prometheus.MustRegister(proposalsPending) prometheus.MustRegister(proposalsPending)

View File

@ -1113,6 +1113,7 @@ func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
if raft.IsEmptySnap(apply.snapshot) { if raft.IsEmptySnap(apply.snapshot) {
return return
} }
applySnapshotInProgress.Inc()
lg := s.getLogger() lg := s.getLogger()
if lg != nil { if lg != nil {
@ -1138,6 +1139,7 @@ func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
} else { } else {
plog.Infof("finished applying incoming snapshot at index %d", ep.snapi) plog.Infof("finished applying incoming snapshot at index %d", ep.snapi)
} }
applySnapshotInProgress.Dec()
}() }()
if apply.snapshot.Metadata.Index <= ep.appliedi { if apply.snapshot.Metadata.Index <= ep.appliedi {

View File

@ -71,7 +71,25 @@ func TestV3WatchRestoreSnapshotUnsync(t *testing.T) {
// trigger snapshot send from leader to this slow follower // trigger snapshot send from leader to this slow follower
// which then calls watchable store Restore // which then calls watchable store Restore
clus.Members[0].RecoverPartition(t, clus.Members[1:]...) clus.Members[0].RecoverPartition(t, clus.Members[1:]...)
clus.WaitLeader(t) lead := clus.WaitLeader(t)
sends, err := clus.Members[lead].Metric("etcd_network_snapshot_send_inflights_total")
if err != nil {
t.Fatal(err)
}
if sends != "0" && sends != "1" {
// 0 if already sent, 1 if sending
t.Fatalf("inflight snapshot sends expected 0 or 1, got %q", sends)
}
receives, err := clus.Members[(lead+1)%3].Metric("etcd_network_snapshot_receive_inflights_total")
if err != nil {
t.Fatal(err)
}
if receives != "0" && receives != "1" {
// 0 if already received, 1 if receiving
t.Fatalf("inflight snapshot receives expected 0 or 1, got %q", receives)
}
time.Sleep(2 * time.Second) time.Sleep(2 * time.Second)
// slow follower now applies leader snapshot // slow follower now applies leader snapshot