mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
The `etcdHighNumberOfLeaderChanges` alert picked up a copy-and-paste error when it was converted from docs to mixin in #10244: the expression changed from "increase over 15m > 3" to "rate over 15m > 3", which is not equivalent (rate is measured per second, so it should have been "rate over 15m > (3 / 60 / 15)"). As part of fixing that, the alert also needs to cover the case where Prometheus starts, or first discovers a new etcd cluster, while the leader-change count is already high - e.g. if you start a new etcd cluster and it is already at 5 leader changes at the moment Prometheus first scrapes it, we should fire on that transition. The alert is also more responsive now: a quick burst of 3 leader changes triggers it within 5m rather than 15m.
116 lines
4.2 KiB
YAML
116 lines
4.2 KiB
YAML
# Unit tests for the etcd alerting rules, in the `promtool test rules` format.

# Rule file(s) whose alerts the scenarios below are evaluated against.
rule_files:
- mixin.yaml

# Interval at which the rules are evaluated during the simulated run.
evaluation_interval: 1m

# Each list entry is one scenario with its own input series and expectations.
tests:
# Scenario: three members whose `up` series drop to 0 one after another
# (10.10.10.2 at 4m, 10.10.10.1 at 8m, 10.10.10.0 at 12m).
- interval: 1m
  input_series:
  - series: 'up{job="etcd",instance="10.10.10.0"}'
    values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
  - series: 'up{job="etcd",instance="10.10.10.1"}'
    values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
  - series: 'up{job="etcd",instance="10.10.10.2"}'
    values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
  alert_rule_test:
  # Entries without `exp_alerts` assert that the alert is NOT firing.
  - eval_time: 3m
    alertname: etcdInsufficientMembers
  - eval_time: 5m
    alertname: etcdInsufficientMembers
  - eval_time: 5m
    alertname: etcdMembersDown
  # One member has been down since 4m.
  - eval_time: 7m
    alertname: etcdMembersDown
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        message: 'etcd cluster "etcd": members are down (1).'
  - eval_time: 7m
    alertname: etcdInsufficientMembers
  # Two members have been down since 8m; only one is still up.
  - eval_time: 11m
    alertname: etcdInsufficientMembers
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        message: 'etcd cluster "etcd": insufficient members (1).'
  # All three members have been down since 12m.
  - eval_time: 15m
    alertname: etcdInsufficientMembers
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        message: 'etcd cluster "etcd": insufficient members (0).'

# Scenario: two members reporting `up == 0` at evaluation time.
# Note that the 10.10.10.2 series carries only 8 samples (0m-7m), unlike the
# 16-sample series of the other members.
# NOTE(review): presumably the short series is intentional - confirm.
- interval: 1m
  input_series:
  - series: 'up{job="etcd",instance="10.10.10.0"}'
    values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
  - series: 'up{job="etcd",instance="10.10.10.1"}'
    values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
  - series: 'up{job="etcd",instance="10.10.10.2"}'
    values: '1 1 1 1 0 0 0 0'
  alert_rule_test:
  - eval_time: 10m
    alertname: etcdMembersDown
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        message: 'etcd cluster "etcd": members are down (2).'

# Scenario: both `up` series are still 1 at 6m, while the peer send-failure
# counter grows by one per minute from 2m onwards. The `up` inputs alone give
# no reason for etcdMembersDown to fire at 6m.
# NOTE(review): presumably the rule also counts failing peers - confirm
# against mixin.yaml.
- interval: 1m
  input_series:
  - series: 'up{job="etcd",instance="10.10.10.0"}'
    values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
  - series: 'up{job="etcd",instance="10.10.10.1"}'
    values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
  - series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
    values: '0 0 1 2 3 4 5 6 7 8 9 10'
  alert_rule_test:
  # No `exp_alerts`: the alert must not be firing yet at 4m.
  - eval_time: 4m
    alertname: etcdMembersDown
  - eval_time: 6m
    alertname: etcdMembersDown
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        message: 'etcd cluster "etcd": members are down (1).'
# Scenario: leader-change counters that jump and fall back to 0.
# 10.10.10.0 reports 2 at 2m and 1 at 5m; 10.10.10.1 reports 1 at 2m.
# The alert is expected to report 3 leader changes at 10m.
- interval: 1m
  input_series:
  - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
    values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0'
  - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
    values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
  - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
    values: '0 0 0 0 0 0 0 0'
  alert_rule_test:
  - eval_time: 10m
    alertname: etcdHighNumberOfLeaderChanges
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: warning
      exp_annotations:
        message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
# Scenario: same as the previous leader-change test, but 10.10.10.0 does not
# report the additional change at 5m. The alert must NOT fire at 10m.
- interval: 1m
  input_series:
  - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}'
    values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0'
  - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}'
    values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0'
  - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}'
    values: '0 0 0 0 0 0 0 0'
  alert_rule_test:
  - eval_time: 10m
    alertname: etcdHighNumberOfLeaderChanges
    # Explicit empty list (instead of a bare key, which parses as null):
    # no alert is expected to be firing.
    exp_alerts: []