Documentation: Add etcd database quota alerts (#12249)

This commit:
- Fires a critical alert when the etcd database quota is 95% full
  at any given point in time, prompting the user to defrag or increase
  the quota before the quota alarm is triggered. That alarm blocks
  all writes to etcd, meaning no new objects can be created.
  This is needed to make sure the cluster supports running a large number
  of nodes and objects.
- Fires a warning when there is a sudden surge in etcd writes leading to
  an increase in the etcd database size at an alarming rate, as this
  is disruptive. It might be caused by a rogue process, and it's
  important to alert the admin.
This commit is contained in:
Naga Ravi Chaitanya Elluri 2020-09-25 14:03:04 -04:00 committed by GitHub
parent 8050881aaf
commit ed82418799
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 43 additions and 1 deletions

View File

@ -233,6 +233,32 @@
summary: 'etcd instance HTTP requests are slow.',
},
},
{
alert: 'etcdBackendQuotaLowSpace',
expr: |||
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.',
},
},
{
alert: 'etcdExcessiveDatabaseGrowth',
expr: |||
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.',
},
},
],
},
],

View File

@ -116,4 +116,20 @@ tests:
- eval_time: 10m
alertname: etcdHighNumberOfLeaderChanges
exp_alerts:
- interval: 1m
input_series:
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.0"}'
values: '0 10 20 0 0 10 0 0 30 0 0 0 0 0 0 0'
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.1"}'
values: '0 0 10 0 20 0 0 0 0 0 0 0 0 0 0 0'
- series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.2"}'
values: '0 0 0 0 0 0 0 0'
alert_rule_test:
- eval_time: 10m
alertname: etcdExcessiveDatabaseGrowth
exp_alerts:
- exp_labels:
job: etcd
severity: warning
exp_annotations:
message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours, please check as it might be disruptive.'