From 10a863aac21293b28a86c4613db8e2803613739c Mon Sep 17 00:00:00 2001 From: Frederic Branczyk Date: Fri, 10 Nov 2017 14:58:13 +0100 Subject: [PATCH] Documentation/op-guide: Add rules for Prometheus 2.0 --- Documentation/op-guide/etcd3_alert.rules.yml | 143 +++++++++++++++++++ Documentation/op-guide/monitoring.md | 2 +- Documentation/v2/etcd_alert.rules.yml | 91 ++++++++++++ 3 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 Documentation/op-guide/etcd3_alert.rules.yml create mode 100644 Documentation/v2/etcd_alert.rules.yml diff --git a/Documentation/op-guide/etcd3_alert.rules.yml b/Documentation/op-guide/etcd3_alert.rules.yml new file mode 100644 index 000000000..47a24773e --- /dev/null +++ b/Documentation/op-guide/etcd3_alert.rules.yml @@ -0,0 +1,143 @@ +groups: +- name: etcd3_alert.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[5m])) BY (grpc_method) + / sum(rate(etcd_grpc_total{job="etcd"}[5m])) BY (grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_grpc_unary_requests_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust + its file descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust + its file descriptors soon' + summary: file descriptors soon exhausted + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_member_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations diff --git a/Documentation/op-guide/monitoring.md b/Documentation/op-guide/monitoring.md index caabe6093..e2f439adf 100644 --- a/Documentation/op-guide/monitoring.md +++ b/Documentation/op-guide/monitoring.md @@ -100,7 +100,7 @@ Now Prometheus will scrape etcd metrics every 10 seconds. ### Alerting -There is a [set of default alerts for etcd v3 clusters](./etcd3_alert.rules). +There is a set of default alerts for etcd v3 clusters for [Prometheus 1.x](./etcd3_alert.rules) as well as [Prometheus 2.x](./etcd3_alert.rules). > Note: `job` labels may need to be adjusted to fit a particular need. The rules were written to apply to a single cluster so it is recommended to choose labels unique to a cluster. diff --git a/Documentation/v2/etcd_alert.rules.yml b/Documentation/v2/etcd_alert.rules.yml new file mode 100644 index 000000000..c9f8686be --- /dev/null +++ b/Documentation/v2/etcd_alert.rules.yml @@ -0,0 +1,91 @@ +groups: +- name: etcd_alert.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{code!~"^(?:4[0-9]{2})$",job="etcd"}[5m])) + BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) + > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{code!~"^(?:4[0-9]{2})$",job="etcd"}[5m])) + BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) + > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{code=~"^(?:4[0-9]{2})$",job="etcd"}[5m])) + BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method) + > 0.5 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed with + 4xx responses on etcd instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - record: instance:fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust + its file descriptors soon' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(instance:fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust + its file descriptors soon' + summary: file descriptors soon exhausted + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations