From 322c38e16991754615e041f325d055cb9f673041 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Fri, 13 Dec 2019 19:00:11 -0500 Subject: [PATCH] Documentation/etcd-mixin: Fix etcdHighNumberOfLeaderChanges (#11448) The `etcdHighNumberOfLeaderChanges` alert had a copy-and-paste error when it was converted from docs to mixin in #10244 - we moved from "increase over 15m > 3" to "rate over 15m > 3" which is not the same (rate is measured per second, so it should have been "rate over 15m > (3 / 60 / 15)"). As part of fixing that, we need to capture when Prometheus starts or when new etcd clusters are captured with a high leader change count - i.e. if you start a new etcd cluster and at the moment Prometheus first scrapes you are already at 5 leader changes, we should fire on that transition. This alert is also now more responsive, so if you get a quick burst of 3 leader changes we'll alert within 5m rather than 15m. --- Documentation/etcd-mixin/mixin.libsonnet | 6 ++--- Documentation/etcd-mixin/test.yaml | 30 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet index 0653c8d0e..3d0c4b339 100644 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ b/Documentation/etcd-mixin/mixin.libsonnet @@ -57,14 +57,14 @@ { alert: 'etcdHighNumberOfLeaderChanges', expr: ||| - rate(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3 + increase((max by (job) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 3 ||| % $._config, - 'for': '15m', + 'for': '5m', labels: { severity: 'warning', }, annotations: { - message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 30 minutes.', + message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. 
Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.', }, }, { diff --git a/Documentation/etcd-mixin/test.yaml b/Documentation/etcd-mixin/test.yaml index 408cfd972..56ee613b1 100644 --- a/Documentation/etcd-mixin/test.yaml +++ b/Documentation/etcd-mixin/test.yaml @@ -83,3 +83,33 @@ tests: severity: critical exp_annotations: message: 'etcd cluster "etcd": members are down (1).' + - interval: 1m + input_series: + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' + values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' + values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdHighNumberOfLeaderChanges + exp_alerts: + - exp_labels: + job: etcd + severity: warning + exp_annotations: + message: 'etcd cluster "etcd": 3 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + - interval: 1m + input_series: + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' + values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' + values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdHighNumberOfLeaderChanges + exp_alerts: +