mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
122 lines
4.1 KiB
Plaintext
122 lines
4.1 KiB
Plaintext
### General cluster availability ###
|
|
|
|
# alert if another failed member will result in an unavailable cluster
#
# A cluster of n members needs floor(n/2) + 1 members for quorum, so it can
# tolerate floor((n - 1) / 2) failed members. The alert fires once that many
# members are already down, i.e. when the next failure would lose quorum.
# The former threshold (n / 2 - 1) gave the same result for odd n but fired
# one failure too late for even n (e.g. 4 members with 1 down did not alert).
ALERT InsufficientMembers
  IF count(up{job="etcd"} == 0) >= floor((count(up{job="etcd"}) - 1) / 2)
  FOR 3m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "etcd cluster insufficient members",
    description = "If one more etcd member goes down the cluster will be unavailable",
  }
|
|
|
|
### HTTP requests alerts ###
|
|
|
|
# alert if more than 1% of requests to an HTTP endpoint have failed with a non 4xx response
#
# The ratio is multiplied by 100 so that {{ $value }} really is the percentage
# the description prints, and `instance` is kept in the aggregation so that
# {{ $labels.instance }} is populated (sum by(method) alone drops that label).
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method, instance) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
    / sum by(method, instance) (rate(etcd_http_received_total{job="etcd"}[5m])) > 1
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
|
|
|
|
# alert if more than 5% of requests to an HTTP endpoint have failed with a non 4xx response
#
# The ratio is multiplied by 100 so that {{ $value }} really is the percentage
# the description prints, and `instance` is kept in the aggregation so that
# {{ $labels.instance }} is populated (sum by(method) alone drops that label).
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method, instance) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
    / sum by(method, instance) (rate(etcd_http_received_total{job="etcd"}[5m])) > 5
  FOR 5m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  }
|
|
|
|
# alert if more than 50% of requests get a 4xx response
#
# The ratio is multiplied by 100 so that {{ $value }} really is the percentage
# the description prints, and `instance` is kept in the aggregation so that
# {{ $labels.instance }} is populated (sum by(method) alone drops that label).
ALERT HighNumberOfFailedHTTPRequests
  IF 100 * sum by(method, instance) (rate(etcd_http_failed_total{job="etcd", code=~"4[0-9]{2}"}[5m]))
    / sum by(method, instance) (rate(etcd_http_received_total{job="etcd"}[5m])) > 50
  FOR 10m
  LABELS {
    severity = "critical"
  }
  ANNOTATIONS {
    summary = "a high number of HTTP requests are failing",
    description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses on etcd instance {{ $labels.instance }}",
  }
|
|
|
|
# Warn when the 0.99 quantile of the successful-HTTP-request duration
# histogram (5m rate) stays above 0.15 for 10 minutes.
ALERT HTTPRequestsSlow
  IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary     = "slow HTTP requests",
    description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  }
|
|
|
|
### File descriptor alerts ###
|
|
|
|
# recording rule: share of the file-descriptor limit currently in use
# (process_open_fds divided by process_max_fds); the FdExhaustionClose
# alerts extrapolate this series with predict_linear
instance:fd_utilization = process_open_fds / process_max_fds
|
|
|
|
# Warn when a linear extrapolation of the last hour of fd utilization
# exceeds 1 (the full limit) four hours from now, sustained for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary     = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }
|
|
|
|
# Page when a linear extrapolation of the last 10 minutes of fd utilization
# exceeds 1 (the full limit) one hour from now, sustained for 10 minutes.
ALERT FdExhaustionClose
  IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  FOR 10m
  LABELS { severity = "critical" }
  ANNOTATIONS {
    summary     = "file descriptors soon exhausted",
    description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  }
|
|
|
|
### etcd proposal alerts ###
|
|
|
|
# Warn as soon as the failed-proposal counter of an etcd instance has
# increased by more than 5 over the past hour (no FOR hold period).
ALERT HighNumberOfFailedProposals
  IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary     = "a high number of proposals within the etcd cluster are failing",
    description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  }
|
|
|
|
### etcd disk io latency alerts ###
|
|
|
|
# alert if 99th percentile of fsync durations is higher than 500ms
# (0.99 quantile of the WAL fsync duration histogram over a 5m rate,
# sustained for 10 minutes)
ALERT HighFsyncDurations
  IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
  FOR 10m
  LABELS {
    severity = "warning"
  }
  ANNOTATIONS {
    summary = "high fsync durations",
    description = "etcd instance {{ $labels.instance }} fsync durations are high",
  }
|