From 964f6050ee7a65a15c703fd61ec74efca60e042f Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Sat, 17 Oct 2015 13:03:46 -0700
Subject: [PATCH] raft: use HistogramVec for message_sent_latency

---
 Documentation/metrics.md | 12 ++++++------
 rafthttp/metrics.go      | 13 +++++++++----
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/Documentation/metrics.md b/Documentation/metrics.md
index accc2e5fe..4469b91bc 100644
--- a/Documentation/metrics.md
+++ b/Documentation/metrics.md
@@ -24,7 +24,7 @@ etcd now exposes the following metrics:
 
 High file descriptors (`file_descriptors_used_total`) usage (near the file descriptors limitation of the process) indicates a potential out of file descriptors issue. That might cause etcd fails to create new WAL files and panics.
 
-[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you an histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
 
 Pending proposal (`pending_proposal_total`) gives you an idea about how many proposal are in the queue and waiting for commit. An increasing pending number indicates a high client load or an unstable cluster.
 
@@ -82,13 +82,13 @@ Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indi
 
 ### rafthttp
 
-| Name                              | Description                                | Type    | Labels                         |
-|-----------------------------------|--------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds | The latency distributions of messages sent | Summary | sendingType, msgType, remoteID |
-| message_sent_failed_total         | The total number of failed messages sent   | Summary | sendingType, msgType, remoteID |
+| Name                              | Description                                | Type         | Labels                         |
+|-----------------------------------|--------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds      | The latency distributions of messages sent | HistogramVec | sendingType, msgType, remoteID |
+| message_sent_failed_total         | The total number of failed messages sent   | Summary      | sendingType, msgType, remoteID |
 
-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.
 
 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
 
diff --git a/rafthttp/metrics.go b/rafthttp/metrics.go
index e1c9508dd..48eba98ee 100644
--- a/rafthttp/metrics.go
+++ b/rafthttp/metrics.go
@@ -23,12 +23,17 @@ import (
 )
 
 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// the snapshot sending metric. Snapshots can be large and
+	// take a long time to send, so they need a different
+	// time range than other types of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }
 
 func reportSentFailure(sendingType string, m raftpb.Message) {
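
For reference, below is a minimal standalone sketch (not part of the patch) of what the new collector records. It assumes the upstream `github.com/prometheus/client_golang/prometheus` import path rather than the vendored `Godeps` path used in the repository, and the label values and the 3ms sample duration are made up for illustration. `prometheus.ExponentialBuckets(0.0005, 2, 13)` yields 13 upper bounds starting at 0.5ms and doubling each step (0.0005s, 0.001s, ..., 2.048s), and observations are made in seconds to match the `_seconds` metric name.

```go
// Sketch only: mirrors the metric defined in the patch, with made-up
// label values and duration. Import path is the upstream client_golang.
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// 13 upper bounds, starting at 0.5ms and doubling: 0.0005 ... 2.048 seconds.
	buckets := prometheus.ExponentialBuckets(0.0005, 2, 13)
	fmt.Println(buckets)

	msgSentDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "etcd",
			Subsystem: "rafthttp",
			Name:      "message_sent_latency_seconds",
			Help:      "message sent latency distributions.",
			Buckets:   buckets,
		},
		[]string{"sendingType", "remoteID", "msgType"},
	)
	prometheus.MustRegister(msgSentDuration)

	// Observations are in seconds: float64(duration) / float64(time.Second)
	// converts a time.Duration (nanoseconds) into a floating-point second count.
	duration := 3 * time.Millisecond // hypothetical send latency
	msgSentDuration.
		WithLabelValues("message", "6e3bd23ae5f1eae0", "MsgApp").
		Observe(float64(duration) / float64(time.Second))
}
```

With the previous Summary in microseconds, per-bucket quantile aggregation across members was not possible; the histogram's fixed bucket boundaries (0.5ms up to ~2s) make the latency distribution queryable and keep the unit consistent with the other `_seconds` metrics.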