Merge pull request #3700 from xiang90/metrics_hi

Replace Summary with Histogram for all metrics

Commit ff36b9d9bc
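
The essence of the change, as a minimal self-contained sketch (not part of the commit, and using the modern client_golang import path rather than the vendored Godeps path in this tree): a Summary with no objectives becomes a Histogram with explicit exponential buckets, and every observation moves from milliseconds or microseconds to fractional seconds.

```go
package main

import (
	"fmt"
	"time"

	// The commit itself imports this package via the vendored Godeps path.
	"github.com/prometheus/client_golang/prometheus"
)

// After the change: a Histogram with explicit buckets recording seconds,
// replacing a Summary that recorded milliseconds.
var proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
	Namespace: "etcd",
	Subsystem: "server",
	Name:      "proposal_durations_seconds",
	Help:      "The latency distributions of committing proposal.",
	Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14), // 1ms .. 8.192s
})

func main() {
	prometheus.MustRegister(proposeDurations)

	start := time.Now()
	time.Sleep(5 * time.Millisecond) // stand-in for committing a proposal

	// Same conversion the commit uses everywhere: express the duration
	// as fractional seconds.
	proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
	fmt.Println("observed", time.Since(start))
}
```
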
```diff
@@ -16,15 +16,15 @@ etcd now exposes the following metrics:
 ## etcdserver
 
 | Name                            | Description                                       | Type      |
-|---------------------------------|---------------------------------------------------|---------|
+|---------------------------------|---------------------------------------------------|-----------|
 | file_descriptors_used_total     | The total number of file descriptors used         | Gauge     |
-| proposal_durations_milliseconds | The latency distributions of committing proposal  | Summary   |
+| proposal_durations_seconds      | The latency distributions of committing proposal  | Histogram |
 | pending_proposal_total          | The total number of pending proposals             | Gauge     |
 | proposal_failed_total           | The total number of failed proposals              | Counter   |
 
 High file descriptor usage (`file_descriptors_used_total`, near the process's file descriptor limit) indicates potential file descriptor exhaustion, which might cause etcd to fail to create new WAL files and panic.
 
-[Proposal](glossary.md#proposal) durations (`proposal_durations_milliseconds`) give you a summary of the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram of the proposal commit latency. Latency can be introduced into this process by network and disk IO.
 
 Pending proposals (`pending_proposal_total`) give you an idea of how many proposals are in the queue waiting for commit. An increasing pending count indicates a high client load or an unstable cluster.
 
```

```diff
@@ -33,11 +33,11 @@ Failed proposals (`proposal_failed_total`) are normally related to two issues: t
 ## wal
 
 | Name                         | Description                                        | Type      |
-|------------------------------------|--------------------------------------------------|---------|
+|------------------------------------|--------------------------------------------------|-----------|
-| fsync_durations_microseconds | The latency distributions of fsync called by wal   | Summary   |
+| fsync_durations_seconds      | The latency distributions of fsync called by wal   | Histogram |
 | last_index_saved             | The index of the last entry saved by wal           | Gauge     |
 
-Abnormally high fsync duration (`fsync_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high fsync duration (`fsync_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.
 
 
 ## http requests
```

```diff
@@ -74,21 +74,21 @@ Example Prometheus queries that may be useful from these metrics (across all etc
 ## snapshot
 
 | Name                                       | Description                                                 | Type      |
-|--------------------------------------------|------------------------------------------------------------|---------|
+|--------------------------------------------|------------------------------------------------------------|-----------|
-| snapshot_save_total_durations_microseconds | The total latency distributions of save called by snapshot | Summary   |
+| snapshot_save_total_durations_seconds      | The total latency distributions of save called by snapshot | Histogram |
 
-Abnormally high snapshot duration (`snapshot_save_total_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.
 
 
 ## rafthttp
 
 | Name                               | Description                                 | Type         | Labels                         |
-|------------------------------------|---------------------------------------------|---------|--------------------------------|
+|------------------------------------|---------------------------------------------|--------------|--------------------------------|
-| message_sent_latency_microseconds  | The latency distributions of messages sent  | Summary      | sendingType, msgType, remoteID |
+| message_sent_latency_seconds       | The latency distributions of messages sent  | HistogramVec | sendingType, msgType, remoteID |
 | message_sent_failed_total          | The total number of failed messages sent    | Summary      | sendingType, msgType, remoteID |
 
 
-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.
 
 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
 
```

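One practical payoff of the histogram types above (an illustration, not from this commit): bucketed series can be aggregated across members and turned into quantiles on the Prometheus server side, which a Summary's precomputed per-process quantiles do not allow. The full metric name is `namespace_subsystem_name`, and Prometheus appends a `_bucket` suffix per bucket, so a hypothetical p99 query over the new proposal histogram, shown here as a Go string constant for reference, might look like this:

```go
package main

import "fmt"

func main() {
	// Hypothetical PromQL: aggregate bucket counts across members, then
	// compute the 99th percentile from the bucket bounds (the "le" label).
	const p99 = `histogram_quantile(0.99,
  sum(rate(etcd_server_proposal_durations_seconds_bucket[5m])) by (le))`
	fmt.Println(p99)
}
```
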
```diff
@@ -23,11 +23,12 @@ import (
 
 var (
 	// TODO: with label in v3?
-	proposeDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
-		Name:      "proposal_durations_milliseconds",
+		Name:      "proposal_durations_seconds",
 		Help:      "The latency distributions of committing proposal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
```

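The added `Buckets` line is what pins the histogram's resolution and range. A quick check of what `ExponentialBuckets(0.001, 2, 14)` covers (a sketch, not from the commit): 14 upper bounds doubling from 1ms, topping out at 0.001 × 2^13 = 8.192s; anything slower is still counted, in the implicit +Inf bucket.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Prints the 14 upper bounds: [0.001 0.002 0.004 ... 4.096 8.192].
	// Observations above 8.192s land in the implicit +Inf bucket.
	fmt.Println(prometheus.ExponentialBuckets(0.001, 2, 14))
}
```
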
```diff
@@ -622,7 +622,7 @@ func (s *EtcdServer) Do(ctx context.Context, r pb.Request) (Response, error) {
 
 	select {
 	case x := <-ch:
-		proposeDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Millisecond)))
+		proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 		resp := x.(Response)
 		return resp, resp.err
 	case <-ctx.Done():
```

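Why the new `Observe` expression works (an illustration, not from the commit): `time.Duration` is an int64 count of nanoseconds, so converting the whole duration to float64 before dividing by `float64(time.Second)` yields fractional seconds with no truncation, matching `Duration.Seconds()`. The old expression's integer division discarded the sub-unit remainder.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	d := 1500*time.Millisecond + 250*time.Microsecond

	oldStyle := float64(d.Nanoseconds() / int64(time.Millisecond)) // 1500, remainder truncated
	newStyle := float64(d) / float64(time.Second)                  // 1.50025

	fmt.Println(oldStyle, newStyle, d.Seconds())
}
```
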
```diff
@@ -23,12 +23,17 @@ import (
 )
 
 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// snapshot sending metric. snapshot can be large and
+	// take a long time to send. So it needs a different
+	// time range than other type of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)
```

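The rafthttp metric becomes a `HistogramVec`, one histogram per distinct (sendingType, remoteID, msgType) label combination, with a deliberately tighter range: `ExponentialBuckets(0.0005, 2, 13)` runs from 0.5ms up to 0.0005 × 2^12 = 2.048s. A minimal usage sketch (label values hypothetical):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

var msgSentDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "etcd",
		Subsystem: "rafthttp",
		Name:      "message_sent_latency_seconds",
		Help:      "message sent latency distributions.",
		Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13), // 0.5ms .. 2.048s
	},
	[]string{"sendingType", "remoteID", "msgType"},
)

func main() {
	prometheus.MustRegister(msgSentDuration)

	// Each distinct label combination gets its own set of bucket counters.
	msgSentDuration.WithLabelValues("message", "1", "MsgApp").Observe(0.003)
	fmt.Println("recorded a 3ms send for (message, 1, MsgApp)")
}
```
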
```diff
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }
 
 func reportSentFailure(sendingType string, m raftpb.Message) {
```

```diff
@@ -18,18 +18,20 @@ import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/clien
 
 var (
 	// TODO: save_fsync latency?
-	saveDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_total_durations_microseconds",
+		Name:      "save_total_durations_seconds",
 		Help:      "The total latency distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 
-	marshallingDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	marshallingDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_marshalling_durations_microseconds",
+		Name:      "save_marshalling_durations_seconds",
 		Help:      "The marshalling cost distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 )
```

```diff
@@ -74,12 +74,12 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error {
 	if err != nil {
 		return err
 	} else {
-		marshallingDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		marshallingDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 
 	err = ioutil.WriteFile(path.Join(s.dir, fname), d, 0666)
 	if err == nil {
-		saveDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		saveDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 	return err
 }
```

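Note the timing structure this hunk preserves: both `Observe` calls measure from the same `start`, so `save_total_durations_seconds` includes the marshalling cost that `save_marshalling_durations_seconds` reports separately. A toy sketch of that nesting (stand-in sleeps, not the commit's code):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	start := time.Now()

	time.Sleep(2 * time.Millisecond) // stand-in for marshalling the snapshot
	marshalling := float64(time.Since(start)) / float64(time.Second)

	time.Sleep(3 * time.Millisecond) // stand-in for writing the file
	total := float64(time.Since(start)) / float64(time.Second)

	fmt.Printf("marshalling=%.3fs total=%.3fs (total includes marshalling)\n",
		marshalling, total)
}
```
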
```diff
@@ -17,11 +17,12 @@ package wal
 import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/client_golang/prometheus"
 
 var (
-	syncDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "wal",
-		Name:      "fsync_durations_microseconds",
+		Name:      "fsync_durations_seconds",
 		Help:      "The latency distributions of fsync called by wal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	lastIndexSaved = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",
```

```diff
@@ -403,7 +403,7 @@ func (w *WAL) sync() error {
 	}
 	start := time.Now()
 	err := w.f.Sync()
-	syncDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+	syncDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	return err
 }
```