mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
Documentation/op-guide: Drop old alert_rules
Frederic says [1]:

> Side note, we can probably remove the old alerting syntax rules,
> Prometheus has removed this syntax >2.5 years ago.

[1]: https://github.com/etcd-io/etcd/pull/12080#issuecomment-649982787
This commit is contained in:
parent
429826b467
commit
4160b8396d
@ -1,165 +0,0 @@
|
||||
# general cluster availability
|
||||
|
||||
# alert if another failed member will result in an unavailable cluster
|
||||
ALERT InsufficientMembers
|
||||
IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
|
||||
FOR 3m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd cluster insufficient members",
|
||||
description = "If one more etcd member goes down the cluster will be unavailable",
|
||||
}
|
||||
|
||||
# etcd leader alerts
|
||||
# ==================
|
||||
|
||||
# alert if any etcd instance has no leader
|
||||
ALERT NoLeader
|
||||
IF etcd_server_has_leader{job="etcd"} == 0
|
||||
FOR 1m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member has no leader",
|
||||
description = "etcd member {{ $labels.instance }} has no leader",
|
||||
}
|
||||
|
||||
# alert if there are lots of leader changes
|
||||
ALERT HighNumberOfLeaderChanges
|
||||
IF increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of leader changes within the etcd cluster are happening",
|
||||
description = "etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.",
|
||||
}
|
||||
|
||||
# gRPC request alerts
|
||||
# ===================
|
||||
|
||||
# alert if more than 1% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if more than 5% of gRPC method calls have failed within the last 5 minutes
|
||||
ALERT HighNumberOfFailedGRPCRequests
|
||||
IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
|
||||
/ sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 5
|
||||
FOR 5m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of gRPC requests are failing",
|
||||
description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
|
||||
}
|
||||
|
||||
# alert if the 99th percentile of gRPC method calls take more than 150ms
|
||||
ALERT GRPCRequestsSlow
|
||||
IF histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "slow gRPC requests",
|
||||
description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
|
||||
}
|
||||
|
||||
# file descriptor alerts
|
||||
# ======================
|
||||
|
||||
instance:fd_utilization = process_open_fds / process_max_fds
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next 4 hours
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
|
||||
}
|
||||
|
||||
# alert if file descriptors are likely to exhaust within the next hour
|
||||
ALERT FdExhaustionClose
|
||||
IF predict_linear(instance:fd_utilization[10m], 3600) > 1
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "critical"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "file descriptors soon exhausted",
|
||||
description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
|
||||
}
|
||||
|
||||
# etcd member communication alerts
|
||||
# ================================
|
||||
|
||||
# alert if the 99th percentile of round trips takes more than 150ms
|
||||
ALERT EtcdMemberCommunicationSlow
|
||||
IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "etcd member communication is slow",
|
||||
description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
|
||||
}
|
||||
|
||||
# etcd proposal alerts
|
||||
# ====================
|
||||
|
||||
# alert if there are several failed proposals within an hour
|
||||
ALERT HighNumberOfFailedProposals
|
||||
IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "a high number of proposals within the etcd cluster are failing",
|
||||
description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
|
||||
}
|
||||
|
||||
# etcd disk io latency alerts
|
||||
# ===========================
|
||||
|
||||
# alert if 99th percentile of fsync durations is higher than 500ms
|
||||
ALERT HighFsyncDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high fsync durations",
|
||||
description = "etcd instance {{ $labels.instance }} fync durations are high",
|
||||
}
|
||||
|
||||
# alert if 99th percentile of commit durations is higher than 250ms
|
||||
ALERT HighCommitDurations
|
||||
IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
|
||||
FOR 10m
|
||||
LABELS {
|
||||
severity = "warning"
|
||||
}
|
||||
ANNOTATIONS {
|
||||
summary = "high commit durations",
|
||||
description = "etcd instance {{ $labels.instance }} commit durations are high",
|
||||
}
|
@ -1,150 +0,0 @@
|
||||
# these rules synced manually from https://github.com/etcd-io/etcd/blob/master/Documentation/etcd-mixin/mixin.libsonnet
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
|
||||
expr: |
|
||||
max by (job) (
|
||||
sum by (job) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count by (job,endpoint) (
|
||||
sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[3m])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
}}).'
|
||||
expr: |
|
||||
sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has
|
||||
no leader.'
|
||||
expr: |
|
||||
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within
|
||||
the last 15 minutes. Frequent elections may be a sign of insufficient resources,
|
||||
high network latency, or disruptions by other components and should be investigated.'
|
||||
expr: |
|
||||
increase((max by (job) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
|
||||
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{
|
||||
$labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
|
||||
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
|
||||
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within
|
||||
the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fync durations are
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
||||
instance {{ $labels.instance }}'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
|
||||
instance {{ $labels.instance }}.'
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
message: etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
|
||||
}} are slow.
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
@ -106,7 +106,7 @@ Now Prometheus will scrape etcd metrics every 10 seconds.
|
||||
|
||||
### Alerting
|
||||
|
||||
There is a set of default alerts for etcd v3 clusters for [Prometheus 1.x](./etcd3_alert.rules) as well as [Prometheus 2.x](./etcd3_alert.rules.yml).
|
||||
There is a set of [default alerts](../etcd-mixin) for etcd v3 clusters for Prometheus.
|
||||
|
||||
> Note: `job` labels may need to be adjusted to fit a particular need. The rules were written to apply to a single cluster so it is recommended to choose labels unique to a cluster.
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user