Documentation/etcd-mixin: Add an alert for down etcd members

An etcd member being down is an important failure state - while
normal admin operations such as rotations may cause transient
outages, when any member is down the cluster is operating in a
degraded fashion. Add an alert that fires when any members are down
so that administrators know whether the next failure is fatal.

The rule is more complicated than `up{...} == 0` because in some
etcd failure modes there is no `up{...}` entry for every member.
For instance, a Kubernetes service in front of an etcd cluster
might only have 2 endpoints recorded in `up` because the third
pod is evicted by the kubelet - the cluster is degraded but
`count(up{...})` would not return the full quorum size. Instead,
use network peer send failures as a failure detector and attempt
to return the max of down services or failing peers. We may
undercount the number of total failures, but we will at least
alert that a member is down.
This commit is contained in:
Clayton Coleman 2019-07-18 16:57:44 -04:00 committed by Sam Batschelet
parent 12c049e6be
commit 465592a718
2 changed files with 76 additions and 6 deletions

View File

@ -8,6 +8,26 @@
{
name: 'etcd',
rules: [
{
// Alert that fires when at least one etcd member of a job appears to be down.
//
// The expression combines two failure detectors and keeps, per job, the
// larger of the two results:
//   1. the number of scrape targets whose `up` value is 0
//      (`== bool 0` yields 1 for a down target, which is then summed per job);
//   2. the number of distinct peers (label `To`) per job/endpoint whose
//      summed send-failure rate over the last 3 minutes exceeds 0.01/s.
// The second detector covers members that have no `up` series at all.
// `%(etcd_selector)s` is filled in from `$._config` by the `%` operator below.
alert: 'etcdMembersDown',
expr: |||
max by (job) (
sum by (job) (up{%(etcd_selector)s} == bool 0)
or
count by (job,endpoint) (
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[3m])) > 0.01
)
)
> 0
||| % $._config,
// The condition must hold for 3 minutes before the alert fires.
'for': '3m',
labels: {
severity: 'critical',
},
annotations: {
// `$value` is the result of the expression above, i.e. the estimated
// number of down members (may undercount, per the max-of-two approach).
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
},
},
{
alert: 'etcdInsufficientMembers',
expr: |||

View File

@ -14,22 +14,72 @@ tests:
values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
alert_rule_test:
- eval_time: 3m
alertname: EtcdInsufficientMembers
alertname: etcdInsufficientMembers
- eval_time: 5m
alertname: etcdInsufficientMembers
- eval_time: 5m
alertname: etcdMembersDown
- eval_time: 7m
alertname: EtcdInsufficientMembers
alertname: etcdMembersDown
exp_alerts:
- exp_labels:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (1).'
- eval_time: 7m
alertname: etcdInsufficientMembers
- eval_time: 11m
alertname: EtcdInsufficientMembers
alertname: etcdInsufficientMembers
exp_alerts:
- exp_labels:
job: etcd
severity: critical
exp_annotations:
message: 'Etcd cluster "etcd": insufficient members (1).'
message: 'etcd cluster "etcd": insufficient members (1).'
- eval_time: 15m
alertname: EtcdInsufficientMembers
alertname: etcdInsufficientMembers
exp_alerts:
- exp_labels:
job: etcd
severity: critical
exp_annotations:
message: 'Etcd cluster "etcd": insufficient members (0).'
message: 'etcd cluster "etcd": insufficient members (0).'
# Test case: members going down one after another, detected through `up` only.
# With a 1m interval, sample N of each series is taken at minute N:
#   10.10.10.2 reports 0 from 4m, 10.10.10.1 from 8m, 10.10.10.0 from 12m.
# At 10m two instances have reported 0, so etcdMembersDown is expected to
# fire with a value of 2.
- interval: 1m
input_series:
- series: 'up{job="etcd",instance="10.10.10.0"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.1"}'
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
# NOTE(review): this series has only 8 samples (last one at 7m); the expected
# value of 2 at 10m implies it is still counted as down - presumably via the
# query lookback window. Confirm if the evaluation time is ever moved later.
- series: 'up{job="etcd",instance="10.10.10.2"}'
values: '1 1 1 1 0 0 0 0'
alert_rule_test:
- eval_time: 10m
alertname: etcdMembersDown
exp_alerts:
- exp_labels:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (2).'
# Test case: a member detected as down through peer send failures rather
# than through `up`. Both `up` series still report 1 at 6m (the first 0 is at
# 8m), so the alert expected at 6m comes from the failure counter alone.
- interval: 1m
input_series:
- series: 'up{job="etcd",instance="10.10.10.0"}'
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
- series: 'up{job="etcd",instance="10.10.10.1"}'
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
# Counter of failed sends to peer "member-1"; it starts increasing by 1 per
# minute from the 2m sample onwards.
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
values: '0 0 1 2 3 4 5 6 7 8 9 10'
alert_rule_test:
# No exp_alerts here: at 4m the alert must not be firing yet.
- eval_time: 4m
alertname: etcdMembersDown
# At 6m the alert is expected to fire, reporting one member down.
- eval_time: 6m
alertname: etcdMembersDown
exp_alerts:
- exp_labels:
job: etcd
severity: critical
exp_annotations:
message: 'etcd cluster "etcd": members are down (1).'