# mirror of https://github.com/etcd-io/etcd.git
# synced 2024-09-27 06:25:44 +00:00
# 167 lines, 6.9 KiB, YAML
---
# Rule unit tests in the promtool `test rules` file layout.
# Rule file(s) whose alerts are exercised by the tests below.
rule_files: [manifests/etcd-prometheusRules.yaml]

# Interval at which the rules are evaluated during the tests.
evaluation_interval: 1m

tests:
# Three members stop reporting up=1 one after another (samples are 1m apart):
# 10.10.10.2 from 4m, 10.10.10.1 from 8m, 10.10.10.0 from 12m.
- interval: 1m
  input_series:
  - series: up{job="etcd",instance="10.10.10.0"}
    values: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
  - series: up{job="etcd",instance="10.10.10.1"}
    values: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
  - series: up{job="etcd",instance="10.10.10.2"}
    values: 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  alert_rule_test:
  # Entries without exp_alerts expect the named alert NOT to be firing.
  - eval_time: 3m
    alertname: etcdInsufficientMembers
  - eval_time: 5m
    alertname: etcdInsufficientMembers
  - eval_time: 12m
    alertname: etcdMembersDown
  - eval_time: 14m
    alertname: etcdMembersDown
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        description: 'etcd cluster "etcd": members are down (3).'
        summary: etcd cluster members are down.
  - eval_time: 7m
    alertname: etcdInsufficientMembers
  - eval_time: 11m
    alertname: etcdInsufficientMembers
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        description: 'etcd cluster "etcd": insufficient members (1).'
        summary: etcd cluster has insufficient number of members.
  - eval_time: 15m
    alertname: etcdInsufficientMembers
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        description: 'etcd cluster "etcd": insufficient members (0).'
        summary: etcd cluster has insufficient number of members.
# Same staggered outage as a 20-sample series: all three members report
# up=0 from 12m onward; etcdMembersDown is expected to fire at 14m.
- interval: 1m
  input_series:
  - series: up{job="etcd",instance="10.10.10.0"}
    values: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
  - series: up{job="etcd",instance="10.10.10.1"}
    values: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  - series: up{job="etcd",instance="10.10.10.2"}
    values: 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  alert_rule_test:
  - eval_time: 14m
    alertname: etcdMembersDown
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        description: 'etcd cluster "etcd": members are down (3).'
        summary: etcd cluster members are down.
# Two `up` series plus a steadily increasing peer send-failure counter
# towards "member-1"; etcdMembersDown is expected to report 1 at 13m.
- interval: 1m
  input_series:
  - series: up{job="etcd",instance="10.10.10.0"}
    values: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0
  - series: up{job="etcd",instance="10.10.10.1"}
    values: 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
  - series: etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}
    values: 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
  alert_rule_test:
  - eval_time: 13m
    alertname: etcdMembersDown
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: critical
      exp_annotations:
        description: 'etcd cluster "etcd": members are down (1).'
        summary: etcd cluster members are down.
# Leader-change counters for three instances; the alert at 10m is expected
# to report 4 leader changes for the cluster.
- interval: 1m
  input_series:
  - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}
    values: 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0
  - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}
    values: 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}
    values: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  alert_rule_test:
  - eval_time: 10m
    alertname: etcdHighNumberOfLeaderChanges
    exp_alerts:
    - exp_labels:
        job: etcd
        severity: warning
      exp_annotations:
        # Quoted scalar spans several lines; line breaks fold to single spaces.
        description: 'etcd cluster "etcd": 4 leader changes within the last
          15 minutes. Frequent elections may be a sign of insufficient resources,
          high network latency, or disruptions by other components and should
          be investigated.'
        summary: etcd cluster has high number of leader changes.
# Fewer leader changes than in the previous test; etcdHighNumberOfLeaderChanges
# is expected NOT to be firing at 10m.
- interval: 1m
  input_series:
  - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}
    values: 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
  - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}
    values: 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}
    values: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  alert_rule_test:
  - eval_time: 10m
    alertname: etcdHighNumberOfLeaderChanges
    # Explicit empty list (instead of a bare key, which parses as null):
    # no alerts are expected.
    exp_alerts: []
# Database size grows by 8192 bytes per sample on 10.10.10.0 and by 1024 on
# 10.10.10.1, both against a constant quota of 524288 bytes. Only
# 10.10.10.0 is expected to trigger etcdExcessiveDatabaseGrowth at 11m.
- interval: 1m
  input_series:
  - series: etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.0"}
    values: 0+8192x240
  - series: etcd_server_quota_backend_bytes{job="etcd",instance="10.10.10.0"}
    values: 524288+0x240
  - series: etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.1"}
    values: 0+1024x240
  - series: etcd_server_quota_backend_bytes{job="etcd",instance="10.10.10.1"}
    values: 524288+0x240
  alert_rule_test:
  - eval_time: 11m
    alertname: etcdExcessiveDatabaseGrowth
    exp_alerts:
    - exp_labels:
        instance: 10.10.10.0
        job: etcd
        severity: warning
      exp_annotations:
        # Quoted scalar spans several lines; line breaks fold to single spaces.
        description: 'etcd cluster "etcd": Predicting running out of disk space
          in the next four hours, based on write observations within the past
          four hours on etcd instance 10.10.10.0, please check as it might be
          disruptive.'
        summary: etcd cluster database growing very fast.
# Constant sizes: 10.10.10.0 has 300000000 bytes in use out of 1000000000
# allocated, 10.10.10.1 has 700000000 out of 1000000000. Only 10.10.10.0 is
# expected to trigger etcdDatabaseHighFragmentationRatio at 11m.
- interval: 1m
  input_series:
  - series: etcd_mvcc_db_total_size_in_use_in_bytes{job="etcd",instance="10.10.10.0"}
    values: 300000000+0x10
  - series: etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.0"}
    values: 1000000000+0x10
  - series: etcd_mvcc_db_total_size_in_use_in_bytes{job="etcd",instance="10.10.10.1"}
    values: 700000000+0x10
  - series: etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.1"}
    values: 1000000000+0x10
  alert_rule_test:
  - eval_time: 11m
    alertname: etcdDatabaseHighFragmentationRatio
    exp_alerts:
    - exp_labels:
        instance: 10.10.10.0
        job: etcd
        severity: warning
      exp_annotations:
        # Quoted scalars span several lines; line breaks fold to single spaces.
        description: 'etcd cluster "etcd": database size in use on instance
          10.10.10.0 is 30% of the actual allocated disk space, please run defragmentation
          (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
        # Quoted so the '#' fragment can never be read as a comment start.
        runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation'
        summary: 'etcd database size in use is less than 50% of the actual allocated
          storage.'