Merge pull request #3700 from xiang90/metrics_hi

Replace Summary with Histogram for all metrics
Xiang Li 2015-11-10 10:06:45 -08:00
commit ff36b9d9bc
8 changed files with 46 additions and 37 deletions

View File

@@ -15,16 +15,16 @@ etcd now exposes the following metrics:
## etcdserver
-| Name | Description | Type |
-|-----------------------------------------|--------------------------------------------------|---------|
-| file_descriptors_used_total | The total number of file descriptors used | Gauge |
-| proposal_durations_milliseconds | The latency distributions of committing proposal | Summary |
-| pending_proposal_total | The total number of pending proposals | Gauge |
-| proposal_failed_total | The total number of failed proposals | Counter |
+| Name | Description | Type |
+|-----------------------------------------|--------------------------------------------------|-----------|
+| file_descriptors_used_total | The total number of file descriptors used | Gauge |
+| proposal_durations_seconds | The latency distributions of committing proposal | Histogram |
+| pending_proposal_total | The total number of pending proposals | Gauge |
+| proposal_failed_total | The total number of failed proposals | Counter |
High file descriptor usage (`file_descriptors_used_total`), near the process's file descriptor limit, indicates a potential file descriptor exhaustion issue, which might cause etcd to fail to create new WAL files and panic.
-[Proposal](glossary.md#proposal) durations (`proposal_durations_milliseconds`) give you an summary about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
Pending proposals (`pending_proposal_total`) give you an idea of how many proposals are in the queue waiting for commit. An increasing pending number indicates a high client load or an unstable cluster.
@@ -32,12 +32,12 @@ Failed proposals (`proposal_failed_total`) are normally related to two issues: t
## wal

-| Name | Description | Type |
-|------------------------------------|--------------------------------------------------|---------|
-| fsync_durations_microseconds | The latency distributions of fsync called by wal | Summary |
-| last_index_saved | The index of the last entry saved by wal | Gauge |
+| Name | Description | Type |
+|------------------------------------|--------------------------------------------------|-----------|
+| fsync_durations_seconds | The latency distributions of fsync called by wal | Histogram |
+| last_index_saved | The index of the last entry saved by wal | Gauge |

-Abnormally high fsync duration (`fsync_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high fsync duration (`fsync_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.
## http requests
@@ -73,22 +73,22 @@ Example Prometheus queries that may be useful from these metrics (across all etc
## snapshot

-| Name | Description | Type |
-|--------------------------------------------|------------------------------------------------------------|---------|
-| snapshot_save_total_durations_microseconds | The total latency distributions of save called by snapshot | Summary |
+| Name | Description | Type |
+|--------------------------------------------|------------------------------------------------------------|-----------|
+| snapshot_save_total_durations_seconds | The total latency distributions of save called by snapshot | Histogram |

-Abnormally high snapshot duration (`snapshot_save_total_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.

## rafthttp

-| Name | Description | Type | Labels |
-|-----------------------------------|--------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds | The latency distributions of messages sent | Summary | sendingType, msgType, remoteID |
-| message_sent_failed_total | The total number of failed messages sent | Summary | sendingType, msgType, remoteID |
+| Name | Description | Type | Labels |
+|-----------------------------------|--------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds | The latency distributions of messages sent | HistogramVec | sendingType, msgType, remoteID |
+| message_sent_failed_total | The total number of failed messages sent | Summary | sendingType, msgType, remoteID |

-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.
An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.

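Why the Type column changes: a Summary pre-computes quantiles on the client side, per etcd member, so the values cannot be meaningfully aggregated across a cluster, while a Histogram exports cumulative `_bucket`, `_sum` and `_count` series that Prometheus can sum and turn into quantiles server-side with `histogram_quantile`. A minimal standalone sketch of the two collector types (illustrative metric names, upstream `client_golang` import path rather than etcd's vendored one; this is not etcd code):

```go
// Minimal standalone sketch (not etcd code): the same latency recorded as a
// Summary versus a Histogram. Metric names are illustrative.
package main

import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

var (
    // A Summary computes the configured quantiles on the client side;
    // per-member quantiles cannot be meaningfully aggregated by Prometheus.
    latencySummary = prometheus.NewSummary(prometheus.SummaryOpts{
        Name:       "example_request_durations_seconds_summary",
        Help:       "Request latency as a client-side summary.",
        Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
    })

    // A Histogram exports cumulative buckets (_bucket) plus _sum and _count,
    // so quantiles can be computed, and aggregated across members, on the server.
    latencyHistogram = prometheus.NewHistogram(prometheus.HistogramOpts{
        Name:    "example_request_durations_seconds",
        Help:    "Request latency as a histogram, in seconds.",
        Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), // 1ms doubling up to ~8.2s
    })
)

func main() {
    prometheus.MustRegister(latencySummary, latencyHistogram)

    start := time.Now()
    time.Sleep(10 * time.Millisecond) // stand-in for real work
    elapsed := float64(time.Since(start)) / float64(time.Second)

    latencySummary.Observe(elapsed)
    latencyHistogram.Observe(elapsed)
}
```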
View File

@@ -23,11 +23,12 @@ import (
var (
    // TODO: with label in v3?
-   proposeDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+   proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
        Namespace: "etcd",
        Subsystem: "server",
-       Name:      "proposal_durations_milliseconds",
+       Name:      "proposal_durations_seconds",
        Help:      "The latency distributions of committing proposal.",
+       Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
    })
    proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
        Namespace: "etcd",

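The `Buckets` field is the only histogram-specific option introduced here. A tiny sketch (mine, not part of the commit) of the upper bounds that this particular `ExponentialBuckets` call produces:

```go
// Tiny sketch: the bucket upper bounds produced by the call added above.
// ExponentialBuckets(start, factor, count) returns `count` bounds, starting at
// `start` and multiplying by `factor` at each step.
package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    // Prints [0.001 0.002 0.004 ... 8.192]: 1ms up to ~8.2s, a sensible range
    // for proposal commit latency now that the metric is recorded in seconds.
    fmt.Println(prometheus.ExponentialBuckets(0.001, 2, 14))
}
```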
View File

@@ -622,7 +622,7 @@ func (s *EtcdServer) Do(ctx context.Context, r pb.Request) (Response, error) {
    select {
    case x := <-ch:
-       proposeDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Millisecond)))
+       proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
        resp := x.(Response)
        return resp, resp.err
    case <-ctx.Done():

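The observation changes both unit and precision: the old expression truncated to whole milliseconds via integer division, while the new one divides the full `time.Duration` by `time.Second` as floats, which is equivalent to the standard-library helper `Duration.Seconds()`. A small sketch of the difference:

```go
// Sketch of the unit change above: dividing the whole Duration by time.Second
// as floats keeps sub-millisecond precision; the old code truncated to whole
// milliseconds before converting.
package main

import (
    "fmt"
    "time"
)

func main() {
    d := 1500 * time.Microsecond

    oldMillis := float64(d.Nanoseconds() / int64(time.Millisecond)) // 1 (truncated to whole milliseconds)
    newSeconds := float64(d) / float64(time.Second)                 // 0.0015
    viaHelper := d.Seconds()                                        // 0.0015, same value via the stdlib helper

    fmt.Println(oldMillis, newSeconds, viaHelper)
}
```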
View File

@@ -23,12 +23,17 @@ import (
)

var (
-   msgSentDuration = prometheus.NewSummaryVec(
-       prometheus.SummaryOpts{
+   // TODO: create a separate histogram for recording
+   // snapshot sending metric. snapshot can be large and
+   // take a long time to send. So it needs a different
+   // time range than other type of messages.
+   msgSentDuration = prometheus.NewHistogramVec(
+       prometheus.HistogramOpts{
            Namespace: "etcd",
            Subsystem: "rafthttp",
-           Name:      "message_sent_latency_microseconds",
+           Name:      "message_sent_latency_seconds",
            Help:      "message sent latency distributions.",
+           Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
        },
        []string{"sendingType", "remoteID", "msgType"},
    )
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
    if isLinkHeartbeatMessage(m) {
        typ = "MsgLinkHeartbeat"
    }
-   msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+   msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
}

func reportSentFailure(sendingType string, m raftpb.Message) {
View File
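The labeled form works like a plain histogram, except that every distinct label combination gets its own set of bucket series. A standalone sketch of the pattern used by `reportSentDuration` (the label values and member ID below are made up for illustration, not etcd's actual call sites):

```go
// Standalone sketch (labels and values are illustrative): observing a labeled
// histogram the way reportSentDuration does.
package main

import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

var sentDuration = prometheus.NewHistogramVec(
    prometheus.HistogramOpts{
        Namespace: "example",
        Subsystem: "rafthttp",
        Name:      "message_sent_latency_seconds",
        Help:      "Message sent latency distributions.",
        // 0.5ms doubling 13 times, i.e. up to ~2 seconds.
        Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13),
    },
    []string{"sendingType", "remoteID", "msgType"},
)

func main() {
    prometheus.MustRegister(sentDuration)

    // Every distinct (sendingType, remoteID, msgType) combination gets its own
    // set of bucket series plus _sum and _count, so label cardinality directly
    // multiplies the number of exported series.
    d := 2 * time.Millisecond
    sentDuration.WithLabelValues("message", "8e9e05c52164694d", "MsgApp").Observe(d.Seconds())
}
```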

@@ -18,18 +18,20 @@ import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/clien
var (
    // TODO: save_fsync latency?
-   saveDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+   saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
        Namespace: "etcd",
        Subsystem: "snapshot",
-       Name:      "save_total_durations_microseconds",
+       Name:      "save_total_durations_seconds",
        Help:      "The total latency distributions of save called by snapshot.",
+       Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
    })

-   marshallingDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+   marshallingDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
        Namespace: "etcd",
        Subsystem: "snapshot",
-       Name:      "save_marshalling_durations_microseconds",
+       Name:      "save_marshalling_durations_seconds",
        Help:      "The marshalling cost distributions of save called by snapshot.",
+       Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
    })
)

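For reference, exported names follow the `namespace_subsystem_name` convention, so these collectors surface as `etcd_snapshot_save_total_durations_seconds` and `etcd_snapshot_save_marshalling_durations_seconds`, each with `_bucket`, `_sum` and `_count` children. A short sketch using `client_golang`'s `BuildFQName` helper (the helper call is mine, not part of the diff):

```go
// Sketch: how Namespace/Subsystem/Name combine into the exported metric names.
// A histogram additionally exposes _bucket, _sum and _count children of this name.
package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    fmt.Println(prometheus.BuildFQName("etcd", "snapshot", "save_total_durations_seconds"))
    // etcd_snapshot_save_total_durations_seconds
    fmt.Println(prometheus.BuildFQName("etcd", "snapshot", "save_marshalling_durations_seconds"))
    // etcd_snapshot_save_marshalling_durations_seconds
}
```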
View File

@@ -74,12 +74,12 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error {
    if err != nil {
        return err
    } else {
-       marshallingDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+       marshallingDurations.Observe(float64(time.Since(start)) / float64(time.Second))
    }

    err = ioutil.WriteFile(path.Join(s.dir, fname), d, 0666)
    if err == nil {
-       saveDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+       saveDurations.Observe(float64(time.Since(start)) / float64(time.Second))
    }
    return err
}

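A newer `client_golang` convenience, `prometheus.Timer`, wraps this measure-and-observe pattern; it is assumed here and is not used by this commit. Note that a deferred `ObserveDuration` records unconditionally, whereas the code above deliberately observes only when marshalling or the write succeeded, so the helper is not a drop-in replacement:

```go
// Sketch of the prometheus.Timer helper (assumed available; a later
// client_golang convenience, not used by this commit). A deferred
// ObserveDuration always records, unlike the observe-only-on-success logic above.
package main

import (
    "time"

    "github.com/prometheus/client_golang/prometheus"
)

var saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
    Name:    "example_save_total_durations_seconds",
    Help:    "Total latency of a save operation, in seconds.",
    Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})

func save() error {
    timer := prometheus.NewTimer(saveDurations)
    defer timer.ObserveDuration() // records the elapsed time in seconds

    time.Sleep(5 * time.Millisecond) // stand-in for marshalling plus the disk write
    return nil
}

func main() {
    prometheus.MustRegister(saveDurations)
    _ = save()
}
```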
View File

@@ -17,11 +17,12 @@ package wal
import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/client_golang/prometheus"

var (
-   syncDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+   syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
        Namespace: "etcd",
        Subsystem: "wal",
-       Name:      "fsync_durations_microseconds",
+       Name:      "fsync_durations_seconds",
        Help:      "The latency distributions of fsync called by wal.",
+       Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
    })
    lastIndexSaved = prometheus.NewGauge(prometheus.GaugeOpts{
        Namespace: "etcd",

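This hunk only shows the collector definition; the collectors still have to be registered once so they appear on the metrics endpoint, which etcd presumably does elsewhere in the package and which is not part of this diff. A hedged sketch of that conventional pattern, with a hypothetical package name:

```go
// Hedged sketch, not the actual etcd file: package-level collectors are
// typically registered once in an init function. The registration code is not
// part of the hunk shown above.
package walmetrics

import "github.com/prometheus/client_golang/prometheus"

var syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
    Namespace: "etcd",
    Subsystem: "wal",
    Name:      "fsync_durations_seconds",
    Help:      "The latency distributions of fsync called by wal.",
    Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
})

func init() {
    // MustRegister panics if a collector with the same name is already
    // registered, surfacing duplicate-metric bugs at startup.
    prometheus.MustRegister(syncDurations)
}
```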
View File

@@ -403,7 +403,7 @@ func (w *WAL) sync() error {
    }
    start := time.Now()
    err := w.f.Sync()
-   syncDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+   syncDurations.Observe(float64(time.Since(start)) / float64(time.Second))
    return err
}
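
Putting it together: once registered, the converted histograms are scraped like any other collector. A minimal sketch of exposing them over HTTP with the upstream `promhttp` handler (etcd of this era already exposes `/metrics` on its client URLs; the port and sample values below are made up):

```go
// End-to-end sketch (assumed setup, not from this PR): once the histograms are
// registered, exposing them over HTTP yields _bucket/_sum/_count series such as
//
//   etcd_wal_fsync_durations_seconds_bucket{le="0.001"} 4
//   etcd_wal_fsync_durations_seconds_bucket{le="0.002"} 11
//   ...
//   etcd_wal_fsync_durations_seconds_sum 0.538
//   etcd_wal_fsync_durations_seconds_count 13
//
// (the sample numbers are made up).
package main

import (
    "log"
    "net/http"

    "github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
    // Arbitrary port for the sketch; etcd serves /metrics on its own listeners.
    http.Handle("/metrics", promhttp.Handler())
    log.Fatal(http.ListenAndServe(":8080", nil))
}
```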