mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
Mixin: Support configuring cluster label
This commit is contained in:
parent
4cbb949595
commit
7885f2a951
@ -15,6 +15,8 @@
|
||||
// 1 : On Dashboard Load (Will refresh Dashboards variables when dashboard are loaded)
|
||||
// 2 : On Time Range Change (Will refresh Dashboards variables when time range will be changed)
|
||||
dashboard_var_refresh: 2,
|
||||
// clusterLabel is used to identify a cluster.
|
||||
clusterLabel: 'job',
|
||||
},
|
||||
|
||||
prometheusAlerts+:: {
|
||||
@ -39,7 +41,7 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": members are down ({{ $value }}).' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster members are down.',
|
||||
},
|
||||
},
|
||||
@ -53,7 +55,7 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": insufficient members ({{ $value }}).' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster has insufficient number of members.',
|
||||
},
|
||||
},
|
||||
@ -67,7 +69,7 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": member {{ $labels.instance }} has no leader.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster has no leader.',
|
||||
},
|
||||
},
|
||||
@ -81,7 +83,7 @@
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster has high number of leader changes.',
|
||||
},
|
||||
},
|
||||
@ -98,7 +100,7 @@
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster has high number of failed grpc requests.',
|
||||
},
|
||||
},
|
||||
@ -115,7 +117,7 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster has high number of failed grpc requests.',
|
||||
},
|
||||
},
|
||||
@ -130,7 +132,7 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
summary: 'etcd grpc requests are slow',
|
||||
},
|
||||
},
|
||||
@ -145,7 +147,7 @@
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster member communication is slow.',
|
||||
},
|
||||
},
|
||||
@ -159,7 +161,7 @@
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster has high number of proposal failures.',
|
||||
},
|
||||
},
|
||||
@ -174,7 +176,7 @@
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster 99th percentile fsync durations are too high.',
|
||||
},
|
||||
},
|
||||
@ -189,7 +191,7 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -203,7 +205,7 @@
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
|
||||
summary: 'etcd cluster 99th percentile commit durations are too high.',
|
||||
},
|
||||
},
|
||||
@ -217,7 +219,7 @@
|
||||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel,
|
||||
},
|
||||
},
|
||||
{
|
||||
@ -230,7 +232,7 @@
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
description: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.',
|
||||
description: 'etcd cluster "{{ $labels.%s }}": Observed surge in etcd writes leading to 50%% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.' % $._config.clusterLabel,
|
||||
},
|
||||
},
|
||||
],
|
||||
@ -310,7 +312,7 @@
|
||||
show: false,
|
||||
},
|
||||
targets: [{
|
||||
expr: 'sum(etcd_server_has_leader{job="$cluster"})',
|
||||
expr: 'sum(etcd_server_has_leader{%s="$cluster"})' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '',
|
||||
metric: 'etcd_server_has_leader',
|
||||
@ -360,7 +362,7 @@
|
||||
steppedLine: false,
|
||||
targets: [
|
||||
{
|
||||
expr: 'sum(rate(grpc_server_started_total{job="$cluster",grpc_type="unary"}[5m]))',
|
||||
expr: 'sum(rate(grpc_server_started_total{%s="$cluster",grpc_type="unary"}[5m]))' % $._config.clusterLabel,
|
||||
format: 'time_series',
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'RPC Rate',
|
||||
@ -369,7 +371,7 @@
|
||||
step: 2,
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(grpc_server_handled_total{job="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m]))',
|
||||
expr: 'sum(rate(grpc_server_handled_total{%s="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m]))' % $._config.clusterLabel,
|
||||
format: 'time_series',
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'RPC Failed Rate',
|
||||
@ -446,7 +448,7 @@
|
||||
steppedLine: false,
|
||||
targets: [
|
||||
{
|
||||
expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})',
|
||||
expr: 'sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})' % $._config,
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'Watch Streams',
|
||||
metric: 'grpc_server_handled_total',
|
||||
@ -454,7 +456,7 @@
|
||||
step: 4,
|
||||
},
|
||||
{
|
||||
expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})',
|
||||
expr: 'sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})' % $._config,
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'Lease Streams',
|
||||
metric: 'grpc_server_handled_total',
|
||||
@ -539,7 +541,7 @@
|
||||
stack: false,
|
||||
steppedLine: false,
|
||||
targets: [{
|
||||
expr: 'etcd_mvcc_db_total_size_in_bytes{job="$cluster"}',
|
||||
expr: 'etcd_mvcc_db_total_size_in_bytes{%s="$cluster"}' % $._config.clusterLabel,
|
||||
hide: false,
|
||||
interval: '',
|
||||
intervalFactor: 2,
|
||||
@ -614,7 +616,7 @@
|
||||
steppedLine: true,
|
||||
targets: [
|
||||
{
|
||||
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))',
|
||||
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{%s="$cluster"}[5m])) by (instance, le))' % $._config.clusterLabel,
|
||||
hide: false,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '{{instance}} WAL fsync',
|
||||
@ -623,7 +625,7 @@
|
||||
step: 4,
|
||||
},
|
||||
{
|
||||
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))',
|
||||
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{%s="$cluster"}[5m])) by (instance, le))' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '{{instance}} DB fsync',
|
||||
metric: 'etcd_disk_backend_commit_duration_seconds_bucket',
|
||||
@ -696,7 +698,7 @@
|
||||
stack: false,
|
||||
steppedLine: false,
|
||||
targets: [{
|
||||
expr: 'process_resident_memory_bytes{job="$cluster"}',
|
||||
expr: 'process_resident_memory_bytes{%s="$cluster"}' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '{{instance}} Resident Memory',
|
||||
metric: 'process_resident_memory_bytes',
|
||||
@ -778,7 +780,7 @@
|
||||
stack: true,
|
||||
steppedLine: false,
|
||||
targets: [{
|
||||
expr: 'rate(etcd_network_client_grpc_received_bytes_total{job="$cluster"}[5m])',
|
||||
expr: 'rate(etcd_network_client_grpc_received_bytes_total{%s="$cluster"}[5m])' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '{{instance}} Client Traffic In',
|
||||
metric: 'etcd_network_client_grpc_received_bytes_total',
|
||||
@ -852,7 +854,7 @@
|
||||
stack: true,
|
||||
steppedLine: false,
|
||||
targets: [{
|
||||
expr: 'rate(etcd_network_client_grpc_sent_bytes_total{job="$cluster"}[5m])',
|
||||
expr: 'rate(etcd_network_client_grpc_sent_bytes_total{%s="$cluster"}[5m])' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '{{instance}} Client Traffic Out',
|
||||
metric: 'etcd_network_client_grpc_sent_bytes_total',
|
||||
@ -926,7 +928,7 @@
|
||||
stack: false,
|
||||
steppedLine: false,
|
||||
targets: [{
|
||||
expr: 'sum(rate(etcd_network_peer_received_bytes_total{job="$cluster"}[5m])) by (instance)',
|
||||
expr: 'sum(rate(etcd_network_peer_received_bytes_total{%s="$cluster"}[5m])) by (instance)' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '{{instance}} Peer Traffic In',
|
||||
metric: 'etcd_network_peer_received_bytes_total',
|
||||
@ -1001,7 +1003,7 @@
|
||||
stack: false,
|
||||
steppedLine: false,
|
||||
targets: [{
|
||||
expr: 'sum(rate(etcd_network_peer_sent_bytes_total{job="$cluster"}[5m])) by (instance)',
|
||||
expr: 'sum(rate(etcd_network_peer_sent_bytes_total{%s="$cluster"}[5m])) by (instance)' % $._config.clusterLabel,
|
||||
hide: false,
|
||||
interval: '',
|
||||
intervalFactor: 2,
|
||||
@ -1084,7 +1086,7 @@
|
||||
steppedLine: false,
|
||||
targets: [
|
||||
{
|
||||
expr: 'sum(rate(etcd_server_proposals_failed_total{job="$cluster"}[5m]))',
|
||||
expr: 'sum(rate(etcd_server_proposals_failed_total{%s="$cluster"}[5m]))' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'Proposal Failure Rate',
|
||||
metric: 'etcd_server_proposals_failed_total',
|
||||
@ -1092,7 +1094,7 @@
|
||||
step: 2,
|
||||
},
|
||||
{
|
||||
expr: 'sum(etcd_server_proposals_pending{job="$cluster"})',
|
||||
expr: 'sum(etcd_server_proposals_pending{%s="$cluster"})' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'Proposal Pending Total',
|
||||
metric: 'etcd_server_proposals_pending',
|
||||
@ -1100,7 +1102,7 @@
|
||||
step: 2,
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(etcd_server_proposals_committed_total{job="$cluster"}[5m]))',
|
||||
expr: 'sum(rate(etcd_server_proposals_committed_total{%s="$cluster"}[5m]))' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'Proposal Commit Rate',
|
||||
metric: 'etcd_server_proposals_committed_total',
|
||||
@ -1108,7 +1110,7 @@
|
||||
step: 2,
|
||||
},
|
||||
{
|
||||
expr: 'sum(rate(etcd_server_proposals_applied_total{job="$cluster"}[5m]))',
|
||||
expr: 'sum(rate(etcd_server_proposals_applied_total{%s="$cluster"}[5m]))' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: 'Proposal Apply Rate',
|
||||
refId: 'D',
|
||||
@ -1185,7 +1187,7 @@
|
||||
stack: false,
|
||||
steppedLine: false,
|
||||
targets: [{
|
||||
expr: 'changes(etcd_server_leader_changes_seen_total{job="$cluster"}[1d])',
|
||||
expr: 'changes(etcd_server_leader_changes_seen_total{%s="$cluster"}[1d])' % $._config.clusterLabel,
|
||||
intervalFactor: 2,
|
||||
legendFormat: '{{instance}} Total Leader Elections Per Day',
|
||||
metric: 'etcd_server_leader_changes_seen_total',
|
||||
@ -1291,7 +1293,7 @@
|
||||
multi: false,
|
||||
name: 'cluster',
|
||||
options: [],
|
||||
query: 'label_values(etcd_server_has_leader, job)',
|
||||
query: 'label_values(etcd_server_has_leader, %s)' % $._config.clusterLabel,
|
||||
refresh: $._config.dashboard_var_refresh,
|
||||
regex: '',
|
||||
sort: 2,
|
||||
|
Loading…
x
Reference in New Issue
Block a user