diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet
index 96e1ae98e..3355eb407 100644
--- a/Documentation/etcd-mixin/mixin.libsonnet
+++ b/Documentation/etcd-mixin/mixin.libsonnet
@@ -233,6 +233,36 @@
               summary: 'etcd instance HTTP requests are slow.',
             },
           },
+          // Critical: the backend database has used more than 95% of its quota for 10 minutes.
+          {
+            alert: 'etcdBackendQuotaLowSpace',
+            expr: |||
+              (etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.',
+            },
+          },
+          // Warning: the database-size/quota ratio grew by more than 50 percentage points in 4 hours.
+          // The ratio is a gauge, so delta() is used; increase() would count every drop (e.g. after
+          // a defrag) as a counter reset and report growth that never happened.
+          {
+            alert: 'etcdExcessiveDatabaseGrowth',
+            expr: |||
+              delta(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.',
+            },
+          },
         ],
       },
     ],
diff --git a/Documentation/etcd-mixin/test.yaml b/Documentation/etcd-mixin/test.yaml
index 8796339ca..2802f6502 100644
--- a/Documentation/etcd-mixin/test.yaml
+++ b/Documentation/etcd-mixin/test.yaml
@@ -116,4 +116,25 @@ tests:
       - eval_time: 10m
         alertname: etcdHighNumberOfLeaderChanges
         exp_alerts:
-
+  # etcdExcessiveDatabaseGrowth: instance 10.10.10.0 grows by 2% of its quota per minute,
+  # instance 10.10.10.1 stays flat, so only the first instance is expected to alert.
+  - interval: 1m
+    input_series:
+      - series: 'etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.0"}'
+        values: '0+20x40'
+      - series: 'etcd_server_quota_backend_bytes{job="etcd",instance="10.10.10.0"}'
+        values: '1000+0x40'
+      - series: 'etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.1"}'
+        values: '100+0x40'
+      - series: 'etcd_server_quota_backend_bytes{job="etcd",instance="10.10.10.1"}'
+        values: '1000+0x40'
+    alert_rule_test:
+      - eval_time: 40m
+        alertname: etcdExcessiveDatabaseGrowth
+        exp_alerts:
+          - exp_labels:
+              job: etcd
+              instance: '10.10.10.0'
+              severity: warning
+            exp_annotations:
+              message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance 10.10.10.0, please check as it might be disruptive.'