From 964f6050ee7a65a15c703fd61ec74efca60e042f Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Sat, 17 Oct 2015 13:03:46 -0700
Subject: [PATCH] raft: use HistogramVec for message_sent_latency

---
 Documentation/metrics.md | 12 ++++++------
 rafthttp/metrics.go      | 13 +++++++++----
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/Documentation/metrics.md b/Documentation/metrics.md
index accc2e5fe..4469b91bc 100644
--- a/Documentation/metrics.md
+++ b/Documentation/metrics.md
@@ -24,7 +24,7 @@ etcd now exposes the following metrics:
 
 High file descriptors (`file_descriptors_used_total`) usage (near the file descriptors limitation of the process) indicates a potential out of file descriptors issue. That might cause etcd fails to create new WAL files and panics.
 
-[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you an histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
 
 Pending proposal (`pending_proposal_total`) gives you an idea about how many proposal are in the queue and waiting for commit. An increasing pending number indicates a high client load or an unstable cluster.
 
@@ -82,13 +82,13 @@ Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indi
 
 ### rafthttp
 
-| Name                              | Description                                | Type    | Labels                         |
-|-----------------------------------|--------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds | The latency distributions of messages sent | Summary | sendingType, msgType, remoteID |
-| message_sent_failed_total         | The total number of failed messages sent   | Summary | sendingType, msgType, remoteID |
+| Name                              | Description                                | Type         | Labels                         |
+|-----------------------------------|--------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds      | The latency distributions of messages sent | HistogramVec | sendingType, msgType, remoteID |
+| message_sent_failed_total         | The total number of failed messages sent   | Summary      | sendingType, msgType, remoteID |
 
-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.
 
 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
 
diff --git a/rafthttp/metrics.go b/rafthttp/metrics.go
index e1c9508dd..48eba98ee 100644
--- a/rafthttp/metrics.go
+++ b/rafthttp/metrics.go
@@ -23,12 +23,17 @@ import (
 )
 
 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// the snapshot sending metric. Snapshots can be large and
+	// take a long time to send, so they need a different
+	// time range than other types of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }
 
 func reportSentFailure(sendingType string, m raftpb.Message) {
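
For reference, below is a minimal standalone sketch (not part of the patch) of what the new collector records. It assumes the upstream `github.com/prometheus/client_golang/prometheus` import path rather than the vendored `Godeps` path used in the repository, and the label values and the 3ms sample duration are made up for illustration. `prometheus.ExponentialBuckets(0.0005, 2, 13)` yields 13 upper bounds starting at 0.5ms and doubling each step (0.0005s, 0.001s, ..., 2.048s), and observations are made in seconds to match the `_seconds` metric name.

```go
// Sketch only: mirrors the metric defined in the patch, with made-up
// label values and duration. Import path is the upstream client_golang.
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// 13 upper bounds, starting at 0.5ms and doubling: 0.0005 ... 2.048 seconds.
	buckets := prometheus.ExponentialBuckets(0.0005, 2, 13)
	fmt.Println(buckets)

	msgSentDuration := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "etcd",
			Subsystem: "rafthttp",
			Name:      "message_sent_latency_seconds",
			Help:      "message sent latency distributions.",
			Buckets:   buckets,
		},
		[]string{"sendingType", "remoteID", "msgType"},
	)
	prometheus.MustRegister(msgSentDuration)

	// Observations are in seconds: float64(duration) / float64(time.Second)
	// converts a time.Duration (nanoseconds) into a floating-point second count.
	duration := 3 * time.Millisecond // hypothetical send latency
	msgSentDuration.
		WithLabelValues("message", "6e3bd23ae5f1eae0", "MsgApp").
		Observe(float64(duration) / float64(time.Second))
}
```

With the previous Summary in microseconds, per-bucket quantile aggregation across members was not possible; the histogram's fixed bucket boundaries (0.5ms up to ~2s) make the latency distribution queryable and keep the unit consistent with the other `_seconds` metrics.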