From ea1d0f3e0d53a05a1c605f92d6863e4958e71a28 Mon Sep 17 00:00:00 2001 From: Brian Brazil Date: Fri, 29 Apr 2016 23:54:50 +0100 Subject: [PATCH] etcdserver: Improve some debug metrics. The _total suffix is by convention for counters, don't use it on a gauge. Clarify help string. Tweak metric name so it'll sort with related metrics, and be a little more understandable. Remove open file descriptor metric, as Prometheus client_golang provides that out of the box as process_open_fds which is also more up to date. Both only support Linux, so there's no loss of platform support. Fixes #5229 --- Documentation/metrics.md | 18 +++++++++++++----- etcdserver/metrics.go | 13 ++----------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/Documentation/metrics.md b/Documentation/metrics.md index 9e8173660..a4d38cb50 100644 --- a/Documentation/metrics.md +++ b/Documentation/metrics.md @@ -79,16 +79,13 @@ The metrics under the `etcd_debugging` prefix are for debugging. They are very i | Name | Description | Type | |-----------------------------------------|--------------------------------------------------|-----------| -| file_descriptors_used_total | The total number of file descriptors used | Gauge | | proposal_durations_seconds | The latency distributions of committing proposal | Histogram | -| pending_proposal_total | The total number of pending proposals | Gauge | +| proposals_pending | The current number of pending proposals | Gauge | | proposal_failed_total | The total number of failed proposals | Counter | -Heavy file descriptor (`file_descriptors_used_total`) usage (i.e., near the process's file descriptor limit) indicates a potential file descriptor exhaustion issue. If the file descriptors are exhausted, etcd may panic because it cannot create new WAL files. - [Proposal][glossary-proposal] durations (`proposal_durations_seconds`) provides a proposal commit latency histogram. The reported latency reflects network and disk IO delays in etcd. 
-Pending proposal (`pending_proposal_total`) indicates how many proposals are queued for commit. A rising pending proposal total suggests there is a high client load or the cluster is unstable. +Proposals pending (`proposals_pending`) indicates how many proposals are queued for commit. Rising pending proposals suggests there is a high client load or the cluster is unstable. Failed proposals (`proposal_failed_total`) are normally related to two issues: temporary failures related to a leader election or longer duration downtime caused by a loss of quorum in the cluster. @@ -127,6 +124,17 @@ Label `msgType` is the type of raft message. `MsgApp` is log replication message Label `remoteID` is the member ID of the message destination. +## Prometheus supplied metrics + +The Prometheus client library provides a number of metrics under the `go` and `process` namespaces. There are a few that are particularly interesting. + +| Name | Description | Type | +|-----------------------------------|--------------------------------------------|--------------| +| process_open_fds | Number of open file descriptors. | Gauge | +| process_max_fds | Maximum number of open file descriptors. | Gauge | + +Heavy file descriptor (`process_open_fds`) usage (i.e., near the process's file descriptor limit, `process_max_fds`) indicates a potential file descriptor exhaustion issue. If the file descriptors are exhausted, etcd may panic because it cannot create new WAL files. 
+ [glossary-proposal]: glossary.md#proposal [prometheus]: http://prometheus.io/ [prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/ diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go index d0c623e82..e35a24cfc 100644 --- a/etcdserver/metrics.go +++ b/etcdserver/metrics.go @@ -33,8 +33,8 @@ var ( proposePending = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "etcd_debugging", Subsystem: "server", - Name: "pending_proposal_total", - Help: "The total number of pending proposals.", + Name: "proposals_pending", + Help: "The current number of pending proposals.", }) // This is number of proposal failed in client's view. // The proposal might be later got committed in raft. @@ -44,20 +44,12 @@ var ( Name: "proposal_failed_total", Help: "The total number of failed proposals.", }) - - fileDescriptorUsed = prometheus.NewGauge(prometheus.GaugeOpts{ - Namespace: "etcd_debugging", - Subsystem: "server", - Name: "file_descriptors_used_total", - Help: "The total number of file descriptors used.", - }) ) func init() { prometheus.MustRegister(proposeDurations) prometheus.MustRegister(proposePending) prometheus.MustRegister(proposeFailed) - prometheus.MustRegister(fileDescriptorUsed) } func monitorFileDescriptor(done <-chan struct{}) { @@ -69,7 +61,6 @@ func monitorFileDescriptor(done <-chan struct{}) { plog.Errorf("cannot monitor file descriptor usage (%v)", err) return } - fileDescriptorUsed.Set(float64(used)) limit, err := runtime.FDLimit() if err != nil { plog.Errorf("cannot monitor file descriptor usage (%v)", err)