contrib/mixin/mixin.libsonnet: Adjust gRPC failed requests

OK is not the only gRPC status code that indicates a non-failed request; the
previous selector (grpc_code!="OK") also captured context canceled, NotFound,
and other non-error requests.
This commit is contained in:
Lili Cosic 2021-06-21 11:42:08 +02:00
parent f992d697ed
commit f00231951d

View File

@ -33,7 +33,7 @@
) )
) )
> 0 > 0
||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4}, ||| % { etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds * 4 },
'for': '10m', 'for': '10m',
labels: { labels: {
severity: 'critical', severity: 'critical',
@ -88,7 +88,7 @@
{ {
alert: 'etcdHighNumberOfFailedGRPCRequests', alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: ||| expr: |||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/ /
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code) sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
> 1 > 1
@ -105,7 +105,7 @@
{ {
alert: 'etcdHighNumberOfFailedGRPCRequests', alert: 'etcdHighNumberOfFailedGRPCRequests',
expr: ||| expr: |||
100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/ /
sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code) sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
> 5 > 5
@ -207,7 +207,7 @@
summary: 'etcd cluster 99th percentile commit durations are too high.', summary: 'etcd cluster 99th percentile commit durations are too high.',
}, },
}, },
{ {
alert: 'etcdBackendQuotaLowSpace', alert: 'etcdBackendQuotaLowSpace',
expr: ||| expr: |||
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95 (etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95
@ -219,8 +219,8 @@
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.', message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.',
}, },
}, },
{ {
alert: 'etcdExcessiveDatabaseGrowth', alert: 'etcdExcessiveDatabaseGrowth',
expr: ||| expr: |||
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50 increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50
@ -232,7 +232,7 @@
annotations: { annotations: {
message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.', message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.',
}, },
}, },
], ],
}, },
], ],
@ -243,7 +243,7 @@
uid: std.md5('etcd.json'), uid: std.md5('etcd.json'),
title: 'etcd', title: 'etcd',
description: 'etcd sample Grafana dashboard with Prometheus', description: 'etcd sample Grafana dashboard with Prometheus',
tags: [ 'etcd-mixin' ], tags: ['etcd-mixin'],
style: 'dark', style: 'dark',
timezone: 'browser', timezone: 'browser',
editable: true, editable: true,
@ -369,7 +369,7 @@
step: 2, step: 2,
}, },
{ {
expr: 'sum(rate(grpc_server_handled_total{job="$cluster",grpc_type="unary",grpc_code!="OK"}[5m]))', expr: 'sum(rate(grpc_server_handled_total{job="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m]))',
format: 'time_series', format: 'time_series',
intervalFactor: 2, intervalFactor: 2,
legendFormat: 'RPC Failed Rate', legendFormat: 'RPC Failed Rate',