Merge pull request #5706 from xiang90/app_metrics

etcdserver: add applied metrics
2024-09-27 06:25:44 +00:00 · 2016-06-17 12:26:23 -07:00 · 2016-06-17 12:26:23 -07:00 · bd8627c8ab
commit bd8627c8ab
parent e4f56c4eb6 57474697af
3 changed files with 11 additions and 0 deletions
--- a/Documentation/metrics.md
+++ b/Documentation/metrics.md
@ -25,6 +25,7 @@ All these metrics are prefixed with `etcd_server_`
 | has_leader                | Whether or not a leader exists. 1 is existence, 0 is not.| Gauge   |
 | leader_changes_seen_total | The number of leader changes seen.                       | Counter |
 | proposals_committed_total | The total number of consensus proposals committed.       | Gauge   |
+| proposals_applied_total   | The total number of consensus proposals applied.         | Gauge   |


 `has_leader` indicates whether the member has a leader. If a member does not have a leader, it is
@ -35,6 +36,8 @@ is totally unavailable.

 `proposals_committed_total` records the total number of consensus proposals committed. This gauge should increase over time if the cluster is healthy. Several healthy members of an etcd cluster may have different total committed proposals at once. This discrepancy may be due to recovering from peers after starting, lagging behind the leader, or being the leader and therefore having the most commits. It is important to monitor this metric across all the members in the cluster; a consistently large lag between a single member and its leader indicates that member is slow or unhealthy.

+`proposals_applied_total` records the total number of consensus proposals applied. The etcd server applies every committed proposal asynchronously. The difference between `proposals_committed_total` and `proposals_applied_total` should usually be small (within a few thousand, even under high load). If the difference between them continues to rise, it indicates that the etcd server is overloaded. This might happen when applying expensive queries like heavy range queries or large txn operations.
+
 ### disk

 These metrics describe the status of the disk operations.
--- a/etcdserver/metrics.go
+++ b/etcdserver/metrics.go
@ -64,6 +64,12 @@ var (
 		Name:      "proposals_committed_total",
 		Help:      "The total number of consensus proposals committed.",
 	})
+	proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
+		Namespace: "etcd",
+		Subsystem: "server",
+		Name:      "proposals_applied_total",
+		Help:      "The total number of consensus proposals applied.",
+	})
 )

 func init() {
@ -73,6 +79,7 @@ func init() {
 	prometheus.MustRegister(hasLeader)
 	prometheus.MustRegister(leaderChanges)
 	prometheus.MustRegister(proposalsCommitted)
+	prometheus.MustRegister(proposalsApplied)
 }

 func monitorFileDescriptor(done <-chan struct{}) {
--- a/etcdserver/server.go
+++ b/etcdserver/server.go
@ -604,6 +604,7 @@ func (s *EtcdServer) applyAll(ep *etcdProgress, apply *apply) {
 		plog.Warningf("apply entries took too long [%v for %d entries]", d, len(apply.entries))
 		plog.Warningf("avoid queries with large range/delete range!")
 	}
+	proposalsApplied.Set(float64(ep.appliedi))
 	// wait for the raft routine to finish the disk writes before triggering a
 	// snapshot. or applied index might be greater than the last index in raft
 	// storage, since the raft routine might be slower than apply routine.