Mirror of https://github.com/etcd-io/etcd.git (synced 2024-09-27 06:25:44 +00:00)
Merge pull request #12177 from ironcladlou/etcdmembersdown-tweak
Documentation: Further improve etcdMembersDown alert
This commit is contained in:
commit
1af6d61a1c
@ -7,6 +7,9 @@
|
||||
// instances are deployed on K8s, you will likely want to change
|
||||
// this to 'instance, pod'.
|
||||
etcd_instance_labels: 'instance',
|
||||
// scrape_interval_seconds is the global scrape interval which can be
|
||||
// used to dynamically adjust rate windows as a function of the interval.
|
||||
scrape_interval_seconds: 30,
|
||||
},
|
||||
|
||||
prometheusAlerts+:: {
|
||||
@ -21,12 +24,12 @@
|
||||
sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[1m])) > 0.01
|
||||
sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
||| % $._config,
|
||||
'for': '3m',
|
||||
||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4},
|
||||
'for': '10m',
|
||||
labels: {
|
||||
severity: 'critical',
|
||||
},
|
||||
|
@ -17,16 +17,16 @@ tests:
|
||||
alertname: etcdInsufficientMembers
|
||||
- eval_time: 5m
|
||||
alertname: etcdInsufficientMembers
|
||||
- eval_time: 5m
|
||||
- eval_time: 12m
|
||||
alertname: etcdMembersDown
|
||||
- eval_time: 7m
|
||||
- eval_time: 14m
|
||||
alertname: etcdMembersDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": members are down (1).'
|
||||
message: 'etcd cluster "etcd": members are down (3).'
|
||||
- eval_time: 7m
|
||||
alertname: etcdInsufficientMembers
|
||||
- eval_time: 11m
|
||||
@ -49,33 +49,31 @@ tests:
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||
- series: 'up{job="etcd",instance="10.10.10.2"}'
|
||||
values: '1 1 1 1 0 0 0 0'
|
||||
values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
|
||||
alert_rule_test:
|
||||
- eval_time: 10m
|
||||
- eval_time: 14m
|
||||
alertname: etcdMembersDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
job: etcd
|
||||
severity: critical
|
||||
exp_annotations:
|
||||
message: 'etcd cluster "etcd": members are down (2).'
|
||||
message: 'etcd cluster "etcd": members are down (3).'
|
||||
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'up{job="etcd",instance="10.10.10.0"}'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
|
||||
- series: 'up{job="etcd",instance="10.10.10.1"}'
|
||||
values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
|
||||
- series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
|
||||
values: '0 0 1 2 3 4 5 6 7 8 9 10'
|
||||
values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18'
|
||||
alert_rule_test:
|
||||
- eval_time: 4m
|
||||
alertname: etcdMembersDown
|
||||
- eval_time: 6m
|
||||
- eval_time: 13m
|
||||
alertname: etcdMembersDown
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
|
Loading…
x
Reference in New Issue
Block a user