mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
1446 lines
47 KiB
Jsonnet
1446 lines
47 KiB
Jsonnet
{
|
|
// Tunable settings read by the alert rules and the dashboard defined in this file.
_config+:: {
|
|
etcd_selector: 'job=~".*etcd.*"',
|
|
// etcd_instance_labels are the label names that uniquely identify
|
|
// an instance and need to be aggregated away for alerts
|
|
// that are about an etcd cluster as a whole. For example, if etcd
|
|
// instances are deployed on K8s, you will likely want to change
|
|
// this to 'instance, pod'.
|
|
etcd_instance_labels: 'instance',
|
|
// scrape_interval_seconds is the global scrape interval which can be
|
|
// used to dynamically adjust rate windows as a function of the interval.
|
|
scrape_interval_seconds: 30,
|
|
// Dashboard variable refresh option on Grafana (https://grafana.com/docs/grafana/latest/datasources/prometheus/).
|
|
// 0 : Never (the dashboard variable values are never refreshed)
|
|
// 1 : On Dashboard Load (the dashboard variables are refreshed when the dashboard is loaded)
|
|
// 2 : On Time Range Change (the dashboard variables are refreshed when the time range changes)
|
|
dashboard_var_refresh: 2,
|
|
// clusterLabel is the label used to identify a cluster in alert descriptions and dashboard queries.
|
|
clusterLabel: 'job',
|
|
},
|
|
|
|
prometheusAlerts+:: {
|
|
groups+: [
|
|
{
|
|
name: 'etcd',
|
|
rules: [
|
|
{
  // Critical: for 10 minutes, either some scrape targets matching the etcd
  // selector report up == 0, or peers report a send-failure rate above 0.01
  // towards some member. The rate window is four scrape intervals.
  alert: 'etcdMembersDown',
  'for': '10m',
  labels: { severity: 'critical' },
  annotations: {
    summary: 'etcd cluster members are down.',
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": members are down ({{ $value }}).' % $._config,
  },
  expr: |||
    max without (endpoint) (
      sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
    or
      count without (To) (
        sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
      )
    )
    > 0
  ||| % ($._config { network_failure_range: $._config.scrape_interval_seconds * 4 }),
},
|
|
{
  // Critical: for 3 minutes, the number of members reporting up == 1 is
  // below (number of scraped members + 1) / 2.
  alert: 'etcdInsufficientMembers',
  'for': '3m',
  labels: { severity: 'critical' },
  annotations: {
    summary: 'etcd cluster has insufficient number of members.',
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": insufficient members ({{ $value }}).' % $._config,
  },
  expr: std.format(|||
    sum(up{%(etcd_selector)s} == bool 1) without (%(etcd_instance_labels)s) < ((count(up{%(etcd_selector)s}) without (%(etcd_instance_labels)s) + 1) / 2)
  |||, $._config),
},
|
|
{
  // Critical: a member has reported etcd_server_has_leader == 0 for 1 minute.
  alert: 'etcdNoLeader',
  'for': '1m',
  labels: { severity: 'critical' },
  annotations: {
    summary: 'etcd cluster has no leader.',
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": member {{ $labels.instance }} has no leader.' % $._config,
  },
  expr: |||
    etcd_server_has_leader{%(etcd_selector)s} == 0
  ||| % $._config,
},
|
|
{
  // Warning: the per-cluster maximum of etcd_server_leader_changes_seen_total
  // increased by 4 or more over a 15m subquery window (1m resolution), for
  // 5 minutes. The `or 0*absent(...)` branch yields 0 while the series is absent.
  alert: 'etcdHighNumberOfLeaderChanges',
  'for': '5m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'etcd cluster has high number of leader changes.',
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' % $._config,
  },
  expr: |||
    increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4
  ||| % $._config,
},
|
|
{
  // Warning tier: more than 1% of handled gRPC requests ended with one of the
  // listed failure codes (5m rate), sustained for 10 minutes.
  alert: 'etcdHighNumberOfFailedGRPCRequests',
  'for': '10m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'etcd cluster has high number of failed grpc requests.',
    // '%%' renders a literal percent sign after formatting.
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config,
  },
  expr: |||
    100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
      /
    sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
      > 1
  ||| % $._config,
},
|
|
{
  // Critical tier: more than 5% of handled gRPC requests ended with one of
  // the listed failure codes (5m rate), sustained for 5 minutes.
  alert: 'etcdHighNumberOfFailedGRPCRequests',
  'for': '5m',
  labels: { severity: 'critical' },
  annotations: {
    summary: 'etcd cluster has high number of failed grpc requests.',
    // '%%' renders a literal percent sign after formatting.
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config,
  },
  expr: |||
    100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
      /
    sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
      > 5
  ||| % $._config,
},
|
|
{
  // Critical: the 99th percentile handling time of unary gRPC requests
  // (Defragment excluded) has been above 0.15s for 10 minutes.
  alert: 'etcdGRPCRequestsSlow',
  expr: |||
    histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
    > 0.15
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'critical',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.' % $._config.clusterLabel,
    // Terminated with a period like every other summary in this rule group.
    summary: 'etcd grpc requests are slow.',
  },
},
|
|
{
  // Warning: the 99th percentile peer round trip time (5m rate) has been
  // above 0.15s for 10 minutes.
  alert: 'etcdMemberCommunicationSlow',
  'for': '10m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'etcd cluster member communication is slow.',
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config,
  },
  expr: |||
    histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m]))
    > 0.15
  ||| % $._config,
},
|
|
{
  // Warning: the per-second rate of failed proposals, averaged over 15
  // minutes, has been above 5 for 15 minutes.
  alert: 'etcdHighNumberOfFailedProposals',
  expr: |||
    rate(etcd_server_proposals_failed_total{%(etcd_selector)s}[15m]) > 5
  ||| % $._config,
  'for': '15m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    // $value is a per-second rate over a 15m window, not a count of failures
    // in 30 minutes, so the text states the unit and window used by the expression.
    description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} proposal failures per second over the last 15 minutes on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
    summary: 'etcd cluster has high number of proposal failures.',
  },
},
|
|
{
  // Warning tier: the 99th percentile WAL fsync duration (5m rate) has been
  // above 0.5s for 10 minutes.
  alert: 'etcdHighFsyncDurations',
  'for': '10m',
  labels: { severity: 'warning' },
  annotations: {
    summary: 'etcd cluster 99th percentile fsync durations are too high.',
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config,
  },
  expr: |||
    histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
    > 0.5
  ||| % $._config,
},
|
|
{
  // Critical tier: the 99th percentile WAL fsync duration (5m rate) has been
  // above 1s for 10 minutes.
  alert: 'etcdHighFsyncDurations',
  'for': '10m',
  labels: { severity: 'critical' },
  annotations: {
    summary: 'etcd cluster 99th percentile fsync durations are too high.',
    description: 'etcd cluster "{{ $labels.%(clusterLabel)s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config,
  },
  expr: |||
    histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
    > 1
  ||| % $._config,
},
|
|
{
  // Warning: the 99th percentile backend commit duration (5m rate) has been
  // above 0.25s for 10 minutes.
  alert: 'etcdHighCommitDurations',
  expr: |||
    histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m]))
    > 0.25
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    // "are" added so the sentence reads like the parallel fsync-duration alerts.
    description: 'etcd cluster "{{ $labels.%s }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
    summary: 'etcd cluster 99th percentile commit durations are too high.',
  },
},
|
|
{
  // Critical: the database size has been above 95% of the backend quota for
  // 10 minutes. last_over_time takes the most recent sample from the last 5m.
  alert: 'etcdDatabaseQuotaLowSpace',
  // Both series are restricted with etcd_selector, like the other rules in
  // this group, so series from unrelated jobs cannot match.
  expr: |||
    (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 95
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'critical',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel,
    summary: 'etcd cluster database is running full.',
  },
},
|
|
{
  // Warning: a linear extrapolation of the last 4h of database size, projected
  // 4h ahead (4*60*60 seconds), exceeds the backend quota, for 10 minutes.
  alert: 'etcdExcessiveDatabaseGrowth',
  // Both series are restricted with etcd_selector, like the other rules in
  // this group, so series from unrelated jobs cannot match.
  expr: |||
    predict_linear(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[4h], 4*60*60) > etcd_server_quota_backend_bytes{%(etcd_selector)s}
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.' % $._config.clusterLabel,
    summary: 'etcd cluster database growing very fast.',
  },
},
|
|
{
  // Warning: the in-use database size has been below half of the total
  // database size for 10 minutes.
  alert: 'etcdDatabaseHighFragmentationRatio',
  // Both series are restricted with etcd_selector, like the other rules in
  // this group, so series from unrelated jobs cannot match.
  expr: |||
    (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m])) < 0.5
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    description: 'etcd cluster "{{ $labels.%s }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.' % $._config.clusterLabel,
    // Not passed through '%' formatting, so the bare percent sign is literal.
    summary: 'etcd database size in use is less than 50% of the actual allocated storage.',
    runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation',
  },
},
|
|
],
|
|
},
|
|
],
|
|
},
|
|
|
|
grafanaDashboards+:: {
|
|
'etcd.json': {
|
|
uid: std.md5('etcd.json'),
|
|
title: 'etcd',
|
|
description: 'etcd sample Grafana dashboard with Prometheus',
|
|
tags: ['etcd-mixin'],
|
|
style: 'dark',
|
|
timezone: 'browser',
|
|
editable: true,
|
|
hideControls: false,
|
|
sharedCrosshair: false,
|
|
rows: [
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
  // Single-stat panel "Up": sum of etcd_server_has_leader over the selected cluster.
  type: 'singlestat',
  title: 'Up',
  id: 28,
  span: 3,
  datasource: '$datasource',
  targets: [{
    expr: 'sum(etcd_server_has_leader{%(clusterLabel)s="$cluster"})' % $._config,
    intervalFactor: 2,
    legendFormat: '',
    metric: 'etcd_server_has_leader',
    refId: 'A',
    step: 20,
  }],

  // Value rendering.
  format: 'none',
  valueName: 'avg',
  valueFontSize: '200%',
  prefix: '',
  prefixFontSize: '50%',
  postfix: '',
  postfixFontSize: '50%',
  nullText: null,
  nullPointMode: 'connected',
  maxDataPoints: 100,
  thresholds: '',

  // Null values are displayed as 'N/A'.
  mappingType: 1,
  mappingTypes: [
    { name: 'value to text', value: 1 },
    { name: 'range to text', value: 2 },
  ],
  valueMaps: [{ op: '=', text: 'N/A', value: 'null' }],
  rangeMaps: [{ from: 'null', text: 'N/A', to: 'null' }],

  // Colouring, gauge and sparkline are configured but switched off.
  colorBackground: false,
  colorValue: false,
  colors: ['rgba(245, 54, 54, 0.9)', 'rgba(237, 129, 40, 0.89)', 'rgba(50, 172, 45, 0.97)'],
  gauge: { maxValue: 100, minValue: 0, show: false, thresholdLabels: false, thresholdMarkers: true },
  sparkline: { fillColor: 'rgba(31, 118, 189, 0.18)', full: false, lineColor: 'rgb(31, 120, 193)', show: false },

  cacheTimeout: null,
  editable: true,
  'error': false,
  interval: null,
  isNew: true,
  links: [],
},
|
|
{
  // Graph "RPC Rate": rate of started unary RPCs next to the rate of unary
  // RPCs handled with one of the listed failure codes.
  type: 'graph',
  title: 'RPC Rate',
  id: 23,
  span: 5,
  datasource: '$datasource',
  targets: [
    {
      expr: 'sum(rate(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_type="unary"}[$__rate_interval]))' % $._config,
      format: 'time_series',
      intervalFactor: 2,
      legendFormat: 'RPC Rate',
      metric: 'grpc_server_started_total',
      refId: 'A',
      step: 2,
    },
    {
      expr: 'sum(rate(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[$__rate_interval]))' % $._config,
      format: 'time_series',
      intervalFactor: 2,
      legendFormat: 'RPC Failed Rate',
      metric: 'grpc_server_handled_total',
      refId: 'B',
      step: 2,
    },
  ],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'ops', label: null, logBase: 1, max: null, min: null, show: true },
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 5,
  stack: false,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual' },

  editable: true,
  'error': false,
  isNew: true,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
{
  // Graph "Active Streams": started minus handled bidi streams for the
  // Watch and Lease services, stacked.
  type: 'graph',
  title: 'Active Streams',
  id: 41,
  span: 4,
  datasource: '$datasource',
  targets: [
    {
      expr: std.format('sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})', $._config),
      intervalFactor: 2,
      legendFormat: 'Watch Streams',
      metric: 'grpc_server_handled_total',
      refId: 'A',
      step: 4,
    },
    {
      expr: std.format('sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})', $._config),
      intervalFactor: 2,
      legendFormat: 'Lease Streams',
      metric: 'grpc_server_handled_total',
      refId: 'B',
      step: 4,
    },
  ],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'short', label: '', logBase: 1, max: null, min: null, show: true },
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 5,
  stack: true,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual' },

  editable: true,
  'error': false,
  isNew: true,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
],
|
|
showTitle: false,
|
|
title: 'Row',
|
|
},
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
  // Graph "DB Size": etcd_mvcc_db_total_size_in_bytes per instance, in bytes.
  type: 'graph',
  title: 'DB Size',
  id: 1,
  span: 4,
  datasource: '$datasource',
  targets: [{
    expr: 'etcd_mvcc_db_total_size_in_bytes{%(clusterLabel)s="$cluster"}' % $._config,
    hide: false,
    interval: '',
    intervalFactor: 2,
    legendFormat: '{{instance}} DB Size',
    metric: '',
    refId: 'A',
    step: 4,
  }],

  // Axes; the second y-axis is hidden.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'bytes', logBase: 1, max: null, min: null, show: true },
    { format: 'short', logBase: 1, max: null, min: null, show: false },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 5,
  stack: false,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  decimals: null,
  grid: {},
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'cumulative' },

  editable: true,
  'error': false,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
{
  // Graph "Disk Sync Duration": per-instance 99th percentile of WAL fsync
  // and backend commit durations, drawn as stepped lines in seconds.
  type: 'graph',
  title: 'Disk Sync Duration',
  id: 3,
  span: 4,
  datasource: '$datasource',
  targets: [
    {
      expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(clusterLabel)s="$cluster"}[$__rate_interval])) by (instance, le))' % $._config,
      hide: false,
      intervalFactor: 2,
      legendFormat: '{{instance}} WAL fsync',
      metric: 'etcd_disk_wal_fsync_duration_seconds_bucket',
      refId: 'A',
      step: 4,
    },
    {
      expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{%(clusterLabel)s="$cluster"}[$__rate_interval])) by (instance, le))' % $._config,
      intervalFactor: 2,
      legendFormat: '{{instance}} DB fsync',
      metric: 'etcd_disk_backend_commit_duration_seconds_bucket',
      refId: 'B',
      step: 4,
    },
  ],

  // Axes; the second y-axis is hidden.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 's', logBase: 1, max: null, min: null, show: true },
    { format: 'short', logBase: 1, max: null, min: null, show: false },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 1,
  stack: false,
  steppedLine: true,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  grid: {},
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'cumulative' },

  editable: true,
  'error': false,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
{
  // Graph "Memory": process_resident_memory_bytes per instance, in bytes.
  type: 'graph',
  title: 'Memory',
  id: 29,
  span: 4,
  datasource: '$datasource',
  targets: [{
    expr: 'process_resident_memory_bytes{%(clusterLabel)s="$cluster"}' % $._config,
    intervalFactor: 2,
    legendFormat: '{{instance}} Resident Memory',
    metric: 'process_resident_memory_bytes',
    refId: 'A',
    step: 4,
  }],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'bytes', label: null, logBase: 1, max: null, min: null, show: true },
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 5,
  stack: false,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual' },

  editable: true,
  'error': false,
  isNew: true,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
],
|
|
title: 'New row',
|
|
},
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
  // Graph "Client Traffic In": rate of client gRPC bytes received per
  // series, stacked and filled, in bytes per second.
  type: 'graph',
  title: 'Client Traffic In',
  id: 22,
  span: 3,
  datasource: '$datasource',
  targets: [{
    expr: 'rate(etcd_network_client_grpc_received_bytes_total{%(clusterLabel)s="$cluster"}[$__rate_interval])' % $._config,
    intervalFactor: 2,
    legendFormat: '{{instance}} Client Traffic In',
    metric: 'etcd_network_client_grpc_received_bytes_total',
    refId: 'A',
    step: 4,
  }],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'Bps', label: null, logBase: 1, max: null, min: null, show: true },
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 5,
  points: false,
  pointradius: 5,
  stack: true,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual' },

  editable: true,
  'error': false,
  isNew: true,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
{
  // Graph "Client Traffic Out": rate of client gRPC bytes sent per series,
  // stacked and filled, in bytes per second.
  type: 'graph',
  title: 'Client Traffic Out',
  id: 21,
  span: 3,
  datasource: '$datasource',
  targets: [{
    expr: 'rate(etcd_network_client_grpc_sent_bytes_total{%(clusterLabel)s="$cluster"}[$__rate_interval])' % $._config,
    intervalFactor: 2,
    legendFormat: '{{instance}} Client Traffic Out',
    metric: 'etcd_network_client_grpc_sent_bytes_total',
    refId: 'A',
    step: 4,
  }],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'Bps', label: null, logBase: 1, max: null, min: null, show: true },
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 5,
  points: false,
  pointradius: 5,
  stack: true,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual' },

  editable: true,
  'error': false,
  isNew: true,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
{
  // Graph "Peer Traffic In": rate of peer bytes received, summed per
  // instance, in bytes per second.
  type: 'graph',
  title: 'Peer Traffic In',
  id: 20,
  span: 3,
  datasource: '$datasource',
  targets: [{
    expr: 'sum(rate(etcd_network_peer_received_bytes_total{%(clusterLabel)s="$cluster"}[$__rate_interval])) by (instance)' % $._config,
    intervalFactor: 2,
    legendFormat: '{{instance}} Peer Traffic In',
    metric: 'etcd_network_peer_received_bytes_total',
    refId: 'A',
    step: 4,
  }],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'Bps', label: null, logBase: 1, max: null, min: null, show: true },
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 5,
  stack: false,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual' },

  editable: true,
  'error': false,
  isNew: true,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
{
  // Graph "Peer Traffic Out": rate of peer bytes sent, summed per instance,
  // in bytes per second.
  type: 'graph',
  title: 'Peer Traffic Out',
  id: 16,
  span: 3,
  datasource: '$datasource',
  targets: [{
    expr: 'sum(rate(etcd_network_peer_sent_bytes_total{%(clusterLabel)s="$cluster"}[$__rate_interval])) by (instance)' % $._config,
    hide: false,
    interval: '',
    intervalFactor: 2,
    legendFormat: '{{instance}} Peer Traffic Out',
    metric: 'etcd_network_peer_sent_bytes_total',
    refId: 'A',
    step: 4,
  }],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'Bps', logBase: 1, max: null, min: null, show: true },
    { format: 'short', logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 5,
  stack: false,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  decimals: null,
  grid: {},
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'cumulative' },

  editable: true,
  'error': false,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
],
|
|
title: 'New row',
|
|
},
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
  // Graph "Raft Proposals": cluster-wide proposal failure, commit and apply
  // rates, plus the current number of pending proposals.
  aliasColors: {},
  bars: false,
  datasource: '$datasource',
  editable: true,
  'error': false,
  fill: 0,
  id: 40,
  isNew: true,
  legend: {
    avg: false,
    current: false,
    max: false,
    min: false,
    show: false,
    total: false,
    values: false,
  },
  lines: true,
  linewidth: 2,
  links: [],
  nullPointMode: 'connected',
  percentage: false,
  pointradius: 5,
  points: false,
  renderer: 'flot',
  seriesOverrides: [],
  span: 6,
  stack: false,
  steppedLine: false,
  targets: [
    {
      expr: 'sum(rate(etcd_server_proposals_failed_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
      intervalFactor: 2,
      legendFormat: 'Proposal Failure Rate',
      metric: 'etcd_server_proposals_failed_total',
      refId: 'A',
      step: 2,
    },
    {
      expr: 'sum(etcd_server_proposals_pending{%s="$cluster"})' % $._config.clusterLabel,
      intervalFactor: 2,
      legendFormat: 'Proposal Pending Total',
      metric: 'etcd_server_proposals_pending',
      refId: 'B',
      step: 2,
    },
    {
      expr: 'sum(rate(etcd_server_proposals_committed_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
      intervalFactor: 2,
      legendFormat: 'Proposal Commit Rate',
      metric: 'etcd_server_proposals_committed_total',
      refId: 'C',
      step: 2,
    },
    {
      expr: 'sum(rate(etcd_server_proposals_applied_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
      intervalFactor: 2,
      legendFormat: 'Proposal Apply Rate',
      // Metric hint added so target D matches targets A-C of this panel.
      metric: 'etcd_server_proposals_applied_total',
      refId: 'D',
      step: 2,
    },
  ],
  thresholds: [],
  timeFrom: null,
  timeShift: null,
  title: 'Raft Proposals',
  tooltip: {
    msResolution: false,
    shared: true,
    sort: 0,
    value_type: 'individual',
  },
  type: 'graph',
  xaxis: {
    mode: 'time',
    name: null,
    show: true,
    values: [],
  },
  yaxes: [
    {
      format: 'short',
      label: '',
      logBase: 1,
      max: null,
      min: null,
      show: true,
    },
    {
      format: 'short',
      label: null,
      logBase: 1,
      max: null,
      min: null,
      show: true,
    },
  ],
},
|
|
{
  // Graph "Total Leader Elections Per Day": number of changes of
  // etcd_server_leader_changes_seen_total over a 1d window, per series.
  type: 'graph',
  title: 'Total Leader Elections Per Day',
  id: 19,
  span: 6,
  datasource: '$datasource',
  targets: [{
    expr: 'changes(etcd_server_leader_changes_seen_total{%(clusterLabel)s="$cluster"}[1d])' % $._config,
    intervalFactor: 2,
    legendFormat: '{{instance}} Total Leader Elections Per Day',
    metric: 'etcd_server_leader_changes_seen_total',
    refId: 'A',
    step: 2,
  }],

  // Axes.
  xaxis: { mode: 'time', name: null, show: true, values: [] },
  yaxes: [
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
    { format: 'short', label: null, logBase: 1, max: null, min: null, show: true },
  ],

  // Drawing options.
  bars: false,
  lines: true,
  linewidth: 2,
  fill: 0,
  points: false,
  pointradius: 5,
  stack: false,
  steppedLine: false,
  percentage: false,
  nullPointMode: 'connected',
  renderer: 'flot',
  decimals: 0,
  aliasColors: {},
  seriesOverrides: [],
  thresholds: [],

  legend: {
    alignAsTable: false,
    rightSide: false,
    show: false,
    avg: false,
    current: false,
    max: false,
    min: false,
    total: false,
    values: false,
  },
  tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual' },

  editable: true,
  'error': false,
  isNew: true,
  links: [],
  timeFrom: null,
  timeShift: null,
},
|
|
{
  // Graph "Peer round trip time": per-instance 99th percentile of
  // etcd_network_peer_round_trip_time_seconds, in seconds.
  aliasColors: {},
  bars: false,
  dashLength: 10,
  dashes: false,
  datasource: '$datasource',
  decimals: 0,
  editable: true,
  'error': false,
  fieldConfig: {
    defaults: {
      custom: {},
    },
    overrides: [],
  },
  fill: 0,
  fillGradient: 0,
  gridPos: {
    h: 7,
    w: 12,
    x: 0,
    y: 28,
  },
  hiddenSeries: false,
  id: 42,
  isNew: true,
  legend: {
    alignAsTable: false,
    avg: false,
    current: false,
    max: false,
    min: false,
    rightSide: false,
    show: false,
    total: false,
    values: false,
  },
  lines: true,
  linewidth: 2,
  links: [],
  nullPointMode: 'connected',
  options: {
    alertThreshold: true,
  },
  percentage: false,
  pluginVersion: '7.4.3',
  pointradius: 5,
  points: false,
  renderer: 'flot',
  seriesOverrides: [],
  spaceLength: 10,
  stack: false,
  steppedLine: false,
  targets: [
    {
      expr: 'histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{%s="$cluster"}[$__rate_interval])))' % $._config.clusterLabel,
      interval: '',
      intervalFactor: 2,
      legendFormat: '{{instance}} Peer round trip time',
      metric: 'etcd_network_peer_round_trip_time_seconds_bucket',
      refId: 'A',
      step: 2,
    },
  ],
  thresholds: [],
  timeFrom: null,
  timeRegions: [],
  timeShift: null,
  title: 'Peer round trip time',
  tooltip: {
    msResolution: false,
    shared: true,
    sort: 0,
    value_type: 'individual',
  },
  type: 'graph',
  xaxis: {
    buckets: null,
    mode: 'time',
    name: null,
    show: true,
    values: [],
  },
  // The '$$hashKey' entries ('object:925' / 'object:926') were dropped from
  // both axes: they are residue of a UI export, not axis settings.
  yaxes: [
    {
      decimals: null,
      format: 's',
      label: null,
      logBase: 1,
      max: null,
      min: null,
      show: true,
    },
    {
      format: 'short',
      label: null,
      logBase: 1,
      max: null,
      min: null,
      show: true,
    },
  ],
  yaxis: {
    align: false,
    alignLevel: null,
  },
},
|
|
],
|
|
title: 'New row',
|
|
},
|
|
],
|
|
time: {
|
|
from: 'now-15m',
|
|
to: 'now',
|
|
},
|
|
timepicker: {
|
|
now: true,
|
|
refresh_intervals: [
|
|
'5s',
|
|
'10s',
|
|
'30s',
|
|
'1m',
|
|
'5m',
|
|
'15m',
|
|
'30m',
|
|
'1h',
|
|
'2h',
|
|
'1d',
|
|
],
|
|
time_options: [
|
|
'5m',
|
|
'15m',
|
|
'1h',
|
|
'6h',
|
|
'12h',
|
|
'24h',
|
|
'2d',
|
|
'7d',
|
|
'30d',
|
|
],
|
|
},
|
|
templating: {
|
|
list: [
|
|
{
|
|
current: {
|
|
text: 'Prometheus',
|
|
value: 'Prometheus',
|
|
},
|
|
hide: 0,
|
|
label: 'Data Source',
|
|
name: 'datasource',
|
|
options: [],
|
|
query: 'prometheus',
|
|
refresh: 1,
|
|
regex: '',
|
|
type: 'datasource',
|
|
},
|
|
{
  // Query variable "cluster": values of the configured cluster label found
  // on etcd_server_has_leader; single-select, no "All" option.
  type: 'query',
  name: 'cluster',
  label: 'cluster',
  datasource: '$datasource',
  query: 'label_values(etcd_server_has_leader, %(clusterLabel)s)' % $._config,
  // Refresh behaviour comes from _config.dashboard_var_refresh.
  refresh: $._config.dashboard_var_refresh,
  regex: '',
  sort: 2,
  current: { text: 'prod', value: 'prod' },
  options: [],
  hide: 0,
  multi: false,
  includeAll: false,
  allValue: null,
  useTags: false,
  tags: [],
  tagsQuery: '',
  tagValuesQuery: '',
},
|
|
],
|
|
},
|
|
annotations: {
|
|
list: [],
|
|
},
|
|
refresh: '10s',
|
|
schemaVersion: 13,
|
|
version: 215,
|
|
links: [],
|
|
gnetId: null,
|
|
},
|
|
},
|
|
}
|