NOTE: line breaks and indentation below were reconstructed from a
whitespace-collapsed copy of this patch. If context lines fail to match,
retry with `git apply --ignore-whitespace` and verify indentation afterwards.

diff --git a/Documentation/op-guide/etcd3_alert.rules.yml b/Documentation/op-guide/etcd3_alert.rules.yml
index 17287172b..0b10cd414 100644
--- a/Documentation/op-guide/etcd3_alert.rules.yml
+++ b/Documentation/op-guide/etcd3_alert.rules.yml
@@ -2,6 +2,21 @@
 groups:
 - name: etcd
   rules:
+  - alert: etcdMembersDown
+    annotations:
+      message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
+    expr: |
+      max by (job) (
+        sum by (job) (up{job=~".*etcd.*"} == bool 0)
+      or
+        count by (job,endpoint) (
+          sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
+        )
+      )
+      > 0
+    for: 3m
+    labels:
+      severity: critical
   - alert: etcdInsufficientMembers
     annotations:
       message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
@@ -22,11 +37,12 @@ groups:
       severity: critical
   - alert: etcdHighNumberOfLeaderChanges
     annotations:
-      message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }}
-        has seen {{ $value }} leader changes within the last hour.'
+      message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within
+        the last 15 minutes. Frequent elections may be a sign of insufficient resources,
+        high network latency, or disruptions by other components and should be investigated.'
     expr: |
-      rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
-    for: 15m
+      increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 3
+    for: 5m
     labels:
       severity: warning
   - alert: etcdHighNumberOfFailedGRPCRequests
@@ -76,7 +92,7 @@ groups:
   - alert: etcdHighNumberOfFailedProposals
     annotations:
       message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
-        the last hour on etcd instance {{ $labels.instance }}.'
+        the last 30 minutes on etcd instance {{ $labels.instance }}.'
     expr: |
       rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
     for: 15m
diff --git a/Documentation/op-guide/grafana.json b/Documentation/op-guide/grafana.json
index a9832d919..84baf15d6 100644
--- a/Documentation/op-guide/grafana.json
+++ b/Documentation/op-guide/grafana.json
@@ -9,7 +9,6 @@
         "editable": true,
         "gnetId": null,
         "hideControls": false,
-        "id": 6,
         "links": [
 
         ],
@@ -1220,6 +1219,7 @@
         },
         "timezone": "browser",
         "title": "etcd",
+        "uid": "c2f4e12cdf69feb95caa41a5a1b423d9",
         "version": 215
     }
 }