From ed8241879977333b35f3e70296085c215c7ff7bf Mon Sep 17 00:00:00 2001 From: Naga Ravi Chaitanya Elluri Date: Fri, 25 Sep 2020 14:03:04 -0400 Subject: [PATCH] Documentation: Add etcd database quota alerts (#12249) This commit: - Fires a critical alert when the etcd database quota is 95% full at any given point of time to alert the user to defrag or increase the quota in order to avoid the alarm getting triggered which blocks all the writes to etcd meaning there can't be any new objects created. This is needed to make sure the cluster supports running a large number of nodes and objects. - Fires a warning when there is a sudden surge in etcd writes leading to increase in the etcd database quota size at an alarming rate as it is disruptive. It might be because of a rogue process and it's important to alert the admin. --- Documentation/etcd-mixin/mixin.libsonnet | 26 ++++++++++++++++++++++++ Documentation/etcd-mixin/test.yaml | 18 +++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet index 96e1ae98e..3355eb407 100644 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ b/Documentation/etcd-mixin/mixin.libsonnet @@ -233,6 +233,32 @@ summary: 'etcd instance HTTP requests are slow.', }, }, + { + alert: 'etcdBackendQuotaLowSpace', + expr: ||| + (etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.', + }, + }, + { + alert: 'etcdExcessiveDatabaseGrowth', + expr: ||| + increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + 
annotations: { + message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.', + }, + }, ], }, ], diff --git a/Documentation/etcd-mixin/test.yaml b/Documentation/etcd-mixin/test.yaml index 8796339ca..2802f6502 100644 --- a/Documentation/etcd-mixin/test.yaml +++ b/Documentation/etcd-mixin/test.yaml @@ -116,4 +116,20 @@ tests: - eval_time: 10m alertname: etcdHighNumberOfLeaderChanges exp_alerts: - + - interval: 1m + input_series: + - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.0"}' + values: '0 10 20 0 0 10 0 0 30 0 0 0 0 0 0 0' + - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.1"}' + values: '0 0 10 0 20 0 0 0 0 0 0 0 0 0 0 0' + - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdExcessiveDatabaseGrowth + exp_alerts: + - exp_labels: + job: etcd + severity: warning + exp_annotations: + message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours, please check as it might be disruptive.'