mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
Documentation/etcd-mixin: Add an alert for down etcd members
An etcd member being down is an important failure state: while normal admin operations may cause transient outages to rotate, whenever any member is down the cluster is operating in a degraded fashion. Add an alert that records when any members are down so that administrators know whether the next failure is fatal. The rule is more complicated than `up{...} == 0` because not every etcd failure mode leaves an `up{...}` entry for each member. For instance, a Kubernetes service in front of an etcd cluster might only have 2 endpoints recorded in `up` because the third pod has been evicted by the kubelet - the cluster is degraded, but `count(up{...})` would not return the full cluster size. Instead, use network peer send failures as a failure detector and attempt to return the max of down services or failing peers. We may undercount the total number of failures, but we will at least alert that a member is down.
This commit is contained in:
parent
12c049e6be
commit
465592a718
@ -8,6 +8,26 @@
|
||||
{
|
||||
name: 'etcd',
|
||||
rules: [
|
||||
{
|
||||
alert: 'etcdMembersDown',
|
||||
expr: |||
|
||||
max by (job) (
|
||||
sum by (job) (up{%(etcd_selector)s} == bool 0)
|
||||
or
|
||||
count by (job,endpoint) (
|
||||
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[3m])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
||| % $._config,
|
||||
'for': '3m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'etcdInsufficientMembers',
|
||||
expr: |||
|
||||
|
@ -14,22 +14,72 @@ tests:
|
||||
values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||
alert_rule_test:
|
||||
- eval_time: 3m
|
||||
alertname: EtcdInsufficientMembers
|
||||
alertname: etcdInsufficientMembers
|
||||
- eval_time: 5m
|
||||
alertname: etcdInsufficientMembers
|
||||
- eval_time: 5m
|
||||
alertname: etcdMembersDown
|
||||
- eval_time: 7m
|
||||
alertname: EtcdInsufficientMembers
|
||||
alertname: etcdMembersDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": members are down (1).'
|
||||
- eval_time: 7m
|
||||
alertname: etcdInsufficientMembers
|
||||
- eval_time: 11m
|
||||
alertname: EtcdInsufficientMembers
|
||||
alertname: etcdInsufficientMembers
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'Etcd cluster "etcd": insufficient members (1).'
|
||||
message: 'etcd cluster "etcd": insufficient members (1).'
|
||||
- eval_time: 15m
|
||||
alertname: EtcdInsufficientMembers
|
||||
alertname: etcdInsufficientMembers
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'Etcd cluster "etcd": insufficient members (0).'
|
||||
message: 'etcd cluster "etcd": insufficient members (0).'
|
||||
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||
- series: 'up{job="etcd",instance="10.10.10.2"}'
|
||||
values: '1 1 1 1 0 0 0 0'
|
||||
alert_rule_test:
|
||||
- eval_time: 10m
|
||||
alertname: etcdMembersDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": members are down (2).'
|
||||
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
|
||||
values: '0 0 1 2 3 4 5 6 7 8 9 10'
|
||||
alert_rule_test:
|
||||
- eval_time: 4m
|
||||
alertname: etcdMembersDown
|
||||
- eval_time: 6m
|
||||
alertname: etcdMembersDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": members are down (1).'
|
||||
|
Loading…
x
Reference in New Issue
Block a user