Merge pull request #3700 from xiang90/metrics_hi

Replace Summary with Histogram for all metrics

commit ff36b9d9bc
@@ -15,16 +15,16 @@ etcd now exposes the following metrics:

 ## etcdserver

-| Name                            | Description                                       | Type    |
-|---------------------------------|---------------------------------------------------|---------|
-| file_descriptors_used_total     | The total number of file descriptors used         | Gauge   |
-| proposal_durations_milliseconds | The latency distributions of committing proposal  | Summary |
-| pending_proposal_total          | The total number of pending proposals             | Gauge   |
-| proposal_failed_total           | The total number of failed proposals              | Counter |
+| Name                            | Description                                       | Type      |
+|---------------------------------|---------------------------------------------------|-----------|
+| file_descriptors_used_total     | The total number of file descriptors used         | Gauge     |
+| proposal_durations_seconds      | The latency distributions of committing proposal  | Histogram |
+| pending_proposal_total          | The total number of pending proposals             | Gauge     |
+| proposal_failed_total           | The total number of failed proposals              | Counter   |

 High file descriptor usage (`file_descriptors_used_total` near the process's file descriptor limit) indicates a potential file descriptor exhaustion issue, which might cause etcd to fail to create new WAL files and panic.

-[Proposal](glossary.md#proposal) durations (`proposal_durations_milliseconds`) give you a summary of the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram of the proposal commit latency. Latency can be introduced into this process by network and disk IO.

 Pending proposals (`pending_proposal_total`) give you an idea of how many proposals are queued and waiting for commit. An increasing pending count indicates a high client load or an unstable cluster.
@@ -32,12 +32,12 @@ Failed proposals (`proposal_failed_total`) are normally related to two issues: t

 ## wal

-| Name                         | Description                                       | Type    |
-|------------------------------|---------------------------------------------------|---------|
-| fsync_durations_microseconds | The latency distributions of fsync called by wal  | Summary |
-| last_index_saved             | The index of the last entry saved by wal          | Gauge   |
+| Name                         | Description                                       | Type      |
+|------------------------------|---------------------------------------------------|-----------|
+| fsync_durations_seconds      | The latency distributions of fsync called by wal  | Histogram |
+| last_index_saved             | The index of the last entry saved by wal          | Gauge     |

-Abnormally high fsync duration (`fsync_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high fsync duration (`fsync_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.

 ## http requests
@@ -73,22 +73,22 @@ Example Prometheus queries that may be useful from these metrics (across all etc

 ## snapshot

-| Name                                        | Description                                                 | Type    |
-|---------------------------------------------|-------------------------------------------------------------|---------|
-| snapshot_save_total_durations_microseconds  | The total latency distributions of save called by snapshot  | Summary |
+| Name                                        | Description                                                 | Type      |
+|---------------------------------------------|-------------------------------------------------------------|-----------|
+| snapshot_save_total_durations_seconds       | The total latency distributions of save called by snapshot  | Histogram |

-Abnormally high snapshot duration (`snapshot_save_total_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.

 ## rafthttp

-| Name                               | Description                                 | Type    | Labels                         |
-|------------------------------------|---------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds  | The latency distributions of messages sent  | Summary | sendingType, msgType, remoteID |
-| message_sent_failed_total          | The total number of failed messages sent    | Summary | sendingType, msgType, remoteID |
+| Name                               | Description                                 | Type         | Labels                         |
+|------------------------------------|---------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds       | The latency distributions of messages sent  | HistogramVec | sendingType, msgType, remoteID |
+| message_sent_failed_total          | The total number of failed messages sent    | Summary      | sendingType, msgType, remoteID |

-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.

 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
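The switch from Summary to Histogram matters for the documentation above because a histogram exports `_bucket`, `_sum`, and `_count` series, so latency quantiles can be computed (and aggregated across members) at query time instead of being fixed at instrumentation time. As a rough, minimal sketch of how a metric like `etcd_server_proposal_durations_seconds` is declared, fed, and exposed with the Prometheus Go client — the listen address and the stand-in work are invented for illustration, not etcd's actual wiring:

```go
package main

import (
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Declared the same way the diff does; exported as
// etcd_server_proposal_durations_seconds_{bucket,sum,count}.
var proposalDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
	Namespace: "etcd",
	Subsystem: "server",
	Name:      "proposal_durations_seconds",
	Help:      "The latency distributions of committing proposal.",
	Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
})

func main() {
	prometheus.MustRegister(proposalDurations)

	// Time some work and record it in seconds, the unit the buckets use.
	start := time.Now()
	time.Sleep(3 * time.Millisecond) // stand-in for committing a proposal
	proposalDurations.Observe(time.Since(start).Seconds())

	// Serve the default registry so Prometheus can scrape the series.
	http.Handle("/metrics", promhttp.Handler())
	http.ListenAndServe(":8080", nil)
}
```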
@@ -23,11 +23,12 @@ import (

 var (
 	// TODO: with label in v3?
-	proposeDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
-		Name:      "proposal_durations_milliseconds",
+		Name:      "proposal_durations_seconds",
 		Help:      "The latency distributions of committing proposal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
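For context on the `Buckets` line added above, a throwaway sketch (not part of the commit) that prints the upper bounds `ExponentialBuckets(0.001, 2, 14)` generates:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Same arguments as the diff: 14 buckets starting at 0.001s, each
	// double the previous. Prints 0.001 0.002 0.004 ... 8.192 — bucket
	// upper bounds in seconds, covering commit latencies from 1ms to ~8.2s.
	fmt.Println(prometheus.ExponentialBuckets(0.001, 2, 14))
}
```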
@@ -622,7 +622,7 @@ func (s *EtcdServer) Do(ctx context.Context, r pb.Request) (Response, error) {
 	select {
 	case x := <-ch:
-		proposeDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Millisecond)))
+		proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 		resp := x.(Response)
 		return resp, resp.err
 	case <-ctx.Done():
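The observation change above is a unit change, not just a rename: the old expression truncated the elapsed time to whole milliseconds, while the new one records floating-point seconds, equivalent to the standard library's `Duration.Seconds()`. A small standalone sketch with an invented duration:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	d := 1500 * time.Millisecond

	// Old style: integer division down to whole milliseconds.
	oldValue := float64(d.Nanoseconds() / int64(time.Millisecond)) // 1500

	// New style: floating-point seconds, matching the histogram's bucket unit.
	newValue := float64(d) / float64(time.Second) // 1.5

	fmt.Println(oldValue, newValue, d.Seconds()) // 1500 1.5 1.5
}
```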
@@ -23,12 +23,17 @@ import (
 )

 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// snapshot sending metric. snapshot can be large and
+	// take a long time to send. So it needs a different
+	// time range than other type of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }

 func reportSentFailure(sendingType string, m raftpb.Message) {
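Because `msgSentDuration` is a `HistogramVec`, each distinct (sendingType, remoteID, msgType) label combination gets its own set of bucket series, which is what the `WithLabelValues(...).Observe(...)` call above selects. A minimal sketch of that usage, with invented label values and durations (registration is shown for completeness and is not part of this hunk):

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// Shaped like the collector in the diff.
var msgSentDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "etcd",
		Subsystem: "rafthttp",
		Name:      "message_sent_latency_seconds",
		Help:      "message sent latency distributions.",
		Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
	},
	[]string{"sendingType", "remoteID", "msgType"},
)

func main() {
	prometheus.MustRegister(msgSentDuration)

	// Each call selects (or lazily creates) the series for its label set
	// and records the latency in seconds.
	msgSentDuration.WithLabelValues("message", "1", "MsgApp").Observe((2 * time.Millisecond).Seconds())
	msgSentDuration.WithLabelValues("pipeline", "1", "MsgSnap").Observe((750 * time.Millisecond).Seconds())
}
```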
@@ -18,18 +18,20 @@ import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/clien

 var (
 	// TODO: save_fsync latency?
-	saveDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_total_durations_microseconds",
+		Name:      "save_total_durations_seconds",
 		Help:      "The total latency distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})

-	marshallingDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	marshallingDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_marshalling_durations_microseconds",
+		Name:      "save_marshalling_durations_seconds",
 		Help:      "The marshalling cost distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 )
@@ -74,12 +74,12 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error {
 	if err != nil {
 		return err
 	} else {
-		marshallingDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		marshallingDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}

 	err = ioutil.WriteFile(path.Join(s.dir, fname), d, 0666)
 	if err == nil {
-		saveDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		saveDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 	return err
 }
@@ -17,11 +17,12 @@ package wal
 import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/client_golang/prometheus"

 var (
-	syncDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "wal",
-		Name:      "fsync_durations_microseconds",
+		Name:      "fsync_durations_seconds",
 		Help:      "The latency distributions of fsync called by wal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	lastIndexSaved = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
@@ -403,7 +403,7 @@ func (w *WAL) sync() error {
 	}
 	start := time.Now()
 	err := w.f.Sync()
-	syncDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+	syncDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	return err
 }