Merge pull request #3700 from xiang90/metrics_hi

Replace Summary with Histogram for all metrics
Xiang Li 2015-11-10 10:06:45 -08:00
commit ff36b9d9bc
8 changed files with 46 additions and 37 deletions


@@ -15,16 +15,16 @@ etcd now exposes the following metrics:
 ## etcdserver

 | Name                            | Description                                        | Type      |
-|-----------------------------------------|--------------------------------------------------|---------|
+|-----------------------------------------|--------------------------------------------------|-----------|
 | file_descriptors_used_total     | The total number of file descriptors used          | Gauge     |
-| proposal_durations_milliseconds | The latency distributions of committing proposal   | Summary   |
+| proposal_durations_seconds      | The latency distributions of committing proposal   | Histogram |
 | pending_proposal_total          | The total number of pending proposals              | Gauge     |
 | proposal_failed_total           | The total number of failed proposals               | Counter   |

 High file descriptors (`file_descriptors_used_total`) usage (near the file descriptors limitation of the process) indicates a potential out of file descriptors issue. That might cause etcd fails to create new WAL files and panics.

-[Proposal](glossary.md#proposal) durations (`proposal_durations_milliseconds`) give you an summary about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.

 Pending proposal (`pending_proposal_total`) gives you an idea about how many proposal are in the queue and waiting for commit. An increasing pending number indicates a high client load or an unstable cluster.
@@ -32,12 +32,12 @@ Failed proposals (`proposal_failed_total`) are normally related to two issues: t
 ## wal

 | Name                          | Description                                        | Type      |
-|------------------------------------|--------------------------------------------------|---------|
-| fsync_durations_microseconds  | The latency distributions of fsync called by wal   | Summary   |
+|------------------------------------|--------------------------------------------------|-----------|
+| fsync_durations_seconds       | The latency distributions of fsync called by wal   | Histogram |
 | last_index_saved              | The index of the last entry saved by wal           | Gauge     |

-Abnormally high fsync duration (`fsync_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high fsync duration (`fsync_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.

 ## http requests
@@ -73,22 +73,22 @@ Example Prometheus queries that may be useful from these metrics (across all etc
 ## snapshot

 | Name                                        | Description                                                  | Type      |
-|--------------------------------------------|------------------------------------------------------------|---------|
-| snapshot_save_total_durations_microseconds | The total latency distributions of save called by snapshot  | Summary   |
+|--------------------------------------------|------------------------------------------------------------|-----------|
+| snapshot_save_total_durations_seconds      | The total latency distributions of save called by snapshot  | Histogram |

-Abnormally high snapshot duration (`snapshot_save_total_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.

 ## rafthttp

 | Name                               | Description                                  | Type         | Labels                         |
-|-----------------------------------|--------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds | The latency distributions of messages sent  | Summary      | sendingType, msgType, remoteID |
+|-----------------------------------|--------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds      | The latency distributions of messages sent  | HistogramVec | sendingType, msgType, remoteID |
 | message_sent_failed_total         | The total number of failed messages sent    | Summary      | sendingType, msgType, remoteID |

-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.

 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
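
Note on the documentation change above: a Histogram is exported in base units (seconds) as `_bucket`, `_sum`, and `_count` series, so latency percentiles can be computed and aggregated across members on the Prometheus server (e.g. with `histogram_quantile` over the `_bucket` series) rather than taken from per-process Summary quantiles. Below is a minimal sketch of the define/register/observe pattern this PR applies, assuming a current prometheus/client_golang; the variable name, port, and `promhttp` handler are illustrative, not etcd code.

```go
// Minimal sketch of the Summary -> Histogram pattern applied in this change.
// The variable name, port, and workload below are illustrative only.
package main

import (
	"log"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var commitDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
	Namespace: "etcd",
	Subsystem: "server",
	Name:      "proposal_durations_seconds",
	Help:      "The latency distributions of committing proposal.",
	// 14 exponential buckets from 1ms up to 8.192s; slower observations
	// land in the implicit +Inf bucket.
	Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})

func main() {
	prometheus.MustRegister(commitDurations)

	start := time.Now()
	time.Sleep(5 * time.Millisecond) // stand-in for committing a proposal
	commitDurations.Observe(time.Since(start).Seconds())

	// The histogram is exposed as _bucket/_sum/_count series on /metrics.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":2381", nil))
}
```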


@@ -23,11 +23,12 @@ import (

 var (
 	// TODO: with label in v3?
-	proposeDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
-		Name:      "proposal_durations_milliseconds",
+		Name:      "proposal_durations_seconds",
 		Help:      "The latency distributions of committing proposal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
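
For reference, `prometheus.ExponentialBuckets(0.001, 2, 14)` returns 14 upper bounds that double from 1 ms, so the largest explicit bucket is 8.192 s and slower observations fall into the implicit `+Inf` bucket. A standalone snippet (not part of this diff) that prints the layout:

```go
// Prints the explicit bucket upper bounds used above.
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	fmt.Println(prometheus.ExponentialBuckets(0.001, 2, 14))
	// [0.001 0.002 0.004 0.008 0.016 0.032 0.064 0.128 0.256 0.512 1.024 2.048 4.096 8.192]
}
```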


@@ -622,7 +622,7 @@ func (s *EtcdServer) Do(ctx context.Context, r pb.Request) (Response, error) {
 	select {
 	case x := <-ch:
-		proposeDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Millisecond)))
+		proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 		resp := x.(Response)
 		return resp, resp.err
 	case <-ctx.Done():
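
The new observation records the elapsed time in seconds as a float64: `float64(time.Since(start)) / float64(time.Second)` is equivalent to `time.Since(start).Seconds()`, and unlike the old integer division by `time.Millisecond` it keeps sub-unit precision. A small illustration (not part of this diff):

```go
// Compares the new seconds-based conversion with the old truncating form.
package main

import (
	"fmt"
	"time"
)

func main() {
	d := 1500 * time.Microsecond

	asFloat := float64(d) / float64(time.Second) // 0.0015 — form used in this diff
	viaMethod := d.Seconds()                     // 0.0015 — equivalent helper

	old := float64(d.Nanoseconds() / int64(time.Millisecond)) // 1 (truncated to whole milliseconds)

	fmt.Println(asFloat, viaMethod, old)
}
```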


@@ -23,12 +23,17 @@ import (
 )

 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// snapshot sending metric. snapshot can be large and
+	// take a long time to send. So it needs a different
+	// time range than other type of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)

@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }

 func reportSentFailure(sendingType string, m raftpb.Message) {
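
Since `msgSentDuration` is now a `HistogramVec`, each distinct (sendingType, remoteID, msgType) combination gets its own set of buckets, and `WithLabelValues(...).Observe(...)` records into that child histogram, as `reportSentDuration` above shows. A minimal usage sketch, assuming a current prometheus/client_golang; the label values are made up:

```go
// Minimal sketch of observing into a labelled HistogramVec, mirroring the
// reportSentDuration call above; the label values here are invented.
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var msgSentDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "etcd",
		Subsystem: "rafthttp",
		Name:      "message_sent_latency_seconds",
		Help:      "message sent latency distributions.",
		// 13 buckets doubling from 0.5ms up to ~2.05s.
		Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13),
	},
	[]string{"sendingType", "remoteID", "msgType"},
)

func main() {
	prometheus.MustRegister(msgSentDuration)

	duration := 3 * time.Millisecond // stand-in for a measured send
	// Each distinct label combination gets its own child histogram.
	msgSentDuration.
		WithLabelValues("message", "8e9e05c52164694d", "MsgHeartbeat").
		Observe(duration.Seconds())
}
```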


@@ -18,18 +18,20 @@ import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/clien

 var (
 	// TODO: save_fsync latency?
-	saveDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_total_durations_microseconds",
+		Name:      "save_total_durations_seconds",
 		Help:      "The total latency distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})

-	marshallingDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	marshallingDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_marshalling_durations_microseconds",
+		Name:      "save_marshalling_durations_seconds",
 		Help:      "The marshalling cost distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 )


@@ -74,12 +74,12 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error {
 	if err != nil {
 		return err
 	} else {
-		marshallingDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		marshallingDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}

 	err = ioutil.WriteFile(path.Join(s.dir, fname), d, 0666)
 	if err == nil {
-		saveDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		saveDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 	return err
 }
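
Both observations in `save()` share a single `start`, so `save_marshalling_durations_seconds` covers only the encode phase while `save_total_durations_seconds` covers encode plus the file write. A sketch of that two-phase timing pattern with placeholder names (sleeps stand in for the real work):

```go
// Two-phase timing from a single start time; the histograms, function, and
// sleeps below are placeholders, not the etcd code.
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	marshalDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: "demo_marshal_durations_seconds", Help: "encode phase only",
	})
	totalDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name: "demo_total_durations_seconds", Help: "encode plus write",
	})
)

func save() error {
	start := time.Now()

	time.Sleep(2 * time.Millisecond)                      // stand-in for marshalling the snapshot
	marshalDurations.Observe(time.Since(start).Seconds()) // marshalling only

	time.Sleep(5 * time.Millisecond)                    // stand-in for writing the file
	totalDurations.Observe(time.Since(start).Seconds()) // marshalling + write
	return nil
}

func main() {
	prometheus.MustRegister(marshalDurations, totalDurations)
	fmt.Println(save())
}
```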


@@ -17,11 +17,12 @@ package wal
 import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/client_golang/prometheus"

 var (
-	syncDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "wal",
-		Name:      "fsync_durations_microseconds",
+		Name:      "fsync_durations_seconds",
 		Help:      "The latency distributions of fsync called by wal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	lastIndexSaved = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",


@@ -403,7 +403,7 @@ func (w *WAL) sync() error {
 	}
 	start := time.Now()
 	err := w.f.Sync()
-	syncDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+	syncDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	return err
 }
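
The same stopwatch pattern (capture `start`, do the work, observe the elapsed seconds) appears in `sync()` above. Newer prometheus/client_golang releases also provide a `prometheus.Timer` helper that wraps it; the copy vendored in this tree may predate that helper, so the sketch below assumes a current client_golang and uses a placeholder histogram:

```go
// Stopwatch pattern via prometheus.NewTimer; syncDemo is a placeholder
// histogram, not the wal package's syncDurations.
package main

import (
	"os"

	"github.com/prometheus/client_golang/prometheus"
)

var syncDemo = prometheus.NewHistogram(prometheus.HistogramOpts{
	Name:    "demo_fsync_durations_seconds",
	Help:    "The latency distributions of fsync.",
	Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})

func syncFile(f *os.File) error {
	// ObserveDuration records the elapsed time into the histogram in seconds.
	timer := prometheus.NewTimer(syncDemo)
	defer timer.ObserveDuration()
	return f.Sync()
}

func main() {
	prometheus.MustRegister(syncDemo)

	f, err := os.CreateTemp("", "wal-demo")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())

	if err := syncFile(f); err != nil {
		panic(err)
	}
}
```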