diff --git a/Documentation/metrics.md b/Documentation/metrics.md
index a9d36bd9e..89a739934 100644
--- a/Documentation/metrics.md
+++ b/Documentation/metrics.md
@@ -15,16 +15,16 @@ etcd now exposes the following metrics:
 
 ## etcdserver
 
-| Name                                    | Description                                       | Type    |
-|-----------------------------------------|--------------------------------------------------|---------|
-| file_descriptors_used_total             | The total number of file descriptors used        | Gauge   |
-| proposal_durations_milliseconds         | The latency distributions of committing proposal | Summary |
-| pending_proposal_total                  | The total number of pending proposals            | Gauge   |
-| proposal_failed_total                   | The total number of failed proposals             | Counter |
+| Name                                    | Description                                       | Type      |
+|-----------------------------------------|--------------------------------------------------|-----------|
+| file_descriptors_used_total             | The total number of file descriptors used        | Gauge     |
+| proposal_durations_seconds              | The latency distributions of committing proposal | Histogram |
+| pending_proposal_total                  | The total number of pending proposals            | Gauge     |
+| proposal_failed_total                   | The total number of failed proposals             | Counter   |
 
 High file descriptors (`file_descriptors_used_total`) usage (near the file descriptors limitation of the process) indicates a potential out of file descriptors issue. That might cause etcd fails to create new WAL files and panics.
 
-[Proposal](glossary.md#proposal) durations (`proposal_durations_milliseconds`) give you an summary about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram of the proposal commit latency. Latency can be introduced into this process by network and disk IO.
 
 Pending proposal (`pending_proposal_total`) gives you an idea about how many proposal are in the queue and waiting for commit. An increasing pending number indicates a high client load or an unstable cluster.
 
@@ -32,12 +32,12 @@ Failed proposals (`proposal_failed_total`) are normally related to two issues: t
 
 ## wal
 
-| Name                               | Description                                       | Type    |
-|------------------------------------|--------------------------------------------------|---------|
-| fsync_durations_microseconds       | The latency distributions of fsync called by wal | Summary |
-| last_index_saved                   | The index of the last entry saved by wal         | Gauge   |
+| Name                               | Description                                       | Type      |
+|------------------------------------|--------------------------------------------------|-----------|
+| fsync_durations_seconds            | The latency distributions of fsync called by wal | Histogram |
+| last_index_saved                   | The index of the last entry saved by wal         | Gauge     |
 
-Abnormally high fsync duration (`fsync_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high fsync duration (`fsync_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.
 
 ## http requests
 
@@ -73,22 +73,22 @@ Example Prometheus queries that may be useful from these metrics (across all etc
 
 ## snapshot
 
-| Name                                        | Description                                                  | Type    |
-|--------------------------------------------|------------------------------------------------------------|---------|
-| snapshot_save_total_durations_microseconds | The total latency distributions of save called by snapshot | Summary |
+| Name                                        | Description                                                  | Type      |
+|--------------------------------------------|------------------------------------------------------------|-----------|
+| snapshot_save_total_durations_seconds      | The total latency distributions of save called by snapshot | Histogram |
 
-Abnormally high snapshot duration (`snapshot_save_total_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.
 
 ## rafthttp
 
-| Name                               | Description                                  | Type    | Labels                         |
-|-----------------------------------|--------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds | The latency distributions of messages sent | Summary | sendingType, msgType, remoteID |
-| message_sent_failed_total         | The total number of failed messages sent   | Summary | sendingType, msgType, remoteID |
+| Name                               | Description                                  | Type         | Labels                         |
+|-----------------------------------|--------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds      | The latency distributions of messages sent | HistogramVec | sendingType, msgType, remoteID |
+| message_sent_failed_total         | The total number of failed messages sent   | Summary      | sendingType, msgType, remoteID |
 
-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.
 
 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
 
diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go
index 0544f3f1a..cf87a12d1 100644
--- a/etcdserver/metrics.go
+++ b/etcdserver/metrics.go
@@ -23,11 +23,12 @@ import (
 
 var (
 	// TODO: with label in v3?
-	proposeDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
-		Name:      "proposal_durations_milliseconds",
+		Name:      "proposal_durations_seconds",
 		Help:      "The latency distributions of committing proposal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
diff --git a/etcdserver/server.go b/etcdserver/server.go
index b535bfda2..5d796ebdd 100644
--- a/etcdserver/server.go
+++ b/etcdserver/server.go
@@ -622,7 +622,7 @@ func (s *EtcdServer) Do(ctx context.Context, r pb.Request) (Response, error) {
 	select {
 	case x := <-ch:
-		proposeDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Millisecond)))
+		proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 		resp := x.(Response)
 		return resp, resp.err
 	case <-ctx.Done():
diff --git a/rafthttp/metrics.go b/rafthttp/metrics.go
index e1c9508dd..48eba98ee 100644
--- a/rafthttp/metrics.go
+++ b/rafthttp/metrics.go
@@ -23,12 +23,17 @@ import (
 )
 
 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// snapshot sending metric. snapshot can be large and
+	// take a long time to send. So it needs a different
+	// time range than other type of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }
 
 func reportSentFailure(sendingType string, m raftpb.Message) {
diff --git a/snap/metrics.go b/snap/metrics.go
index 72758499a..918baffb0 100644
--- a/snap/metrics.go
+++ b/snap/metrics.go
@@ -18,18 +18,20 @@ import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/clien
 
 var (
 	// TODO: save_fsync latency?
-	saveDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_total_durations_microseconds",
+		Name:      "save_total_durations_seconds",
 		Help:      "The total latency distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
-	marshallingDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	marshallingDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_marshalling_durations_microseconds",
+		Name:      "save_marshalling_durations_seconds",
 		Help:      "The marshalling cost distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 )
diff --git a/snap/snapshotter.go b/snap/snapshotter.go
index 4f9eb9ed8..dd4fe02af 100644
--- a/snap/snapshotter.go
+++ b/snap/snapshotter.go
@@ -74,12 +74,12 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error {
 	if err != nil {
 		return err
 	} else {
-		marshallingDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		marshallingDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 
 	err = ioutil.WriteFile(path.Join(s.dir, fname), d, 0666)
 	if err == nil {
-		saveDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		saveDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 	return err
 }
diff --git a/wal/metrics.go b/wal/metrics.go
index b792c214d..470d4a5f7 100644
--- a/wal/metrics.go
+++ b/wal/metrics.go
@@ -17,11 +17,12 @@ package wal
 import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/client_golang/prometheus"
 
 var (
-	syncDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "wal",
-		Name:      "fsync_durations_microseconds",
+		Name:      "fsync_durations_seconds",
 		Help:      "The latency distributions of fsync called by wal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	lastIndexSaved = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
diff --git a/wal/wal.go b/wal/wal.go
index d92c0619a..9a456fc7b 100644
--- a/wal/wal.go
+++ b/wal/wal.go
@@ -403,7 +403,7 @@ func (w *WAL) sync() error {
 	}
 	start := time.Now()
 	err := w.f.Sync()
-	syncDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+	syncDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	return err
 }
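For reference, below is a minimal, self-contained sketch of the pattern the patch applies in every package: replace `prometheus.NewSummary`/`NewSummaryVec` with `prometheus.NewHistogram`/`NewHistogramVec`, choose exponential buckets denominated in seconds, and record observations in seconds. The `example` subsystem, metric names, and label values here are hypothetical and not part of etcd; the import uses the upstream `github.com/prometheus/client_golang/prometheus` path rather than the vendored Godeps path in the patch, and `time.Duration.Seconds()` is used as an equivalent of the patch's `float64(d) / float64(time.Second)` conversion.

```go
// Sketch of the Summary -> Histogram migration pattern used by this patch.
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	// Plain histogram, mirroring etcd_server_proposal_durations_seconds:
	// 14 exponential buckets from 1ms doubling up to 8.192s.
	opDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "etcd",
		Subsystem: "example",
		Name:      "operation_durations_seconds", // hypothetical metric name
		Help:      "The latency distributions of an example operation.",
		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
	})

	// Labelled histogram vector, mirroring etcd_rafthttp_message_sent_latency_seconds:
	// 13 exponential buckets from 0.5ms up to 2.048s.
	msgDurations = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Namespace: "etcd",
		Subsystem: "example",
		Name:      "message_durations_seconds", // hypothetical metric name
		Help:      "Example labelled latency distributions.",
		Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
	}, []string{"sendingType", "remoteID", "msgType"})
)

func init() {
	prometheus.MustRegister(opDurations, msgDurations)
}

func timedOperation() {
	start := time.Now()
	time.Sleep(5 * time.Millisecond) // stand-in for the real work
	// Observe in seconds; equivalent to the patch's
	// float64(time.Since(start)) / float64(time.Second).
	opDurations.Observe(time.Since(start).Seconds())
	// Illustrative label values only.
	msgDurations.WithLabelValues("message", "example-remote-id", "MsgApp").Observe(time.Since(start).Seconds())
}

func main() {
	timedOperation()
	// Print the bucket upper bounds chosen for opDurations:
	// [0.001 0.002 0.004 ... 8.192]
	fmt.Println(prometheus.ExponentialBuckets(0.001, 2, 14))
}
```

Because histogram buckets are plain counters, they can be summed across members and fed to Prometheus's `histogram_quantile()` at query time, which the per-process quantiles of a Summary do not allow; that, together with the move to base-unit seconds, appears to be the point of the change.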