mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
Documentation/etcd-mixin: Fix etcdHighNumberOfLeaderChanges (#11448)
The `etcdHighNumberOfLeaderChanges` alert had a copy and paste error when it was converted from docs to mixin in 10244 - we moved from "increase over 15m > 3" to "rate over 15m > 3" which is not the same (rate is measured per second, so it should have been "rate over 15m > (3 / 60 / 15)"). As part of fixing that, we need to capture when prometheus starts or when new etcd clusters are captured with a high leader change - i.e. if you start a new etcd cluster and at the moment prometheus first scrapes you are already at 5 leader changes, we should fire on that transition. This alert is also now more responsive, so if you get a quick burst of 3 leader changes we'll alert within 5m rather than 15m.
This commit is contained in:
parent
7f3dd59d22
commit
322c38e169
@ -57,14 +57,14 @@
|
||||
{
|
||||
alert: 'etcdHighNumberOfLeaderChanges',
|
||||
expr: |||
|
||||
rate(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3
|
||||
increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3
|
||||
||| % $._config,
|
||||
'for': '15m',
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes.',
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -83,3 +83,33 @@ tests:
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": members are down (1).'
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
|
||||
values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0'
|
||||
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
|
||||
values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
|
||||
values: '0 0 0 0 0 0 0 0'
|
||||
alert_rule_test:
|
||||
- eval_time: 10m
|
||||
alertname: etcdHighNumberOfLeaderChanges
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: warning
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
|
||||
values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
|
||||
values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||
- series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
|
||||
values: '0 0 0 0 0 0 0 0'
|
||||
alert_rule_test:
|
||||
- eval_time: 10m
|
||||
alertname: etcdHighNumberOfLeaderChanges
|
||||
exp_alerts:
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user