mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00

An etcd member being down is an important failure state: while normal admin operations (for example, a rotation) may cause transient outages, whenever any member is down the cluster is operating in a degraded fashion. Add an alert that records when any members are down so that administrators know whether the next failure is fatal. The rule is more complicated than `up{...} == 0` because not all failure modes for etcd leave an `up{...}` entry for each member. For instance, a Kubernetes service in front of an etcd cluster might have only 2 endpoints recorded in `up` because the third pod was evicted by the kubelet - the cluster is degraded, but `count(up{...})` would not return the full quorum size. Instead, use network peer send failures as a failure detector and attempt to return the max of down services or failing peers. We may undercount the total number of failures, but we will at least alert that a member is down.
1294 lines
39 KiB
Jsonnet
1294 lines
39 KiB
Jsonnet
{
|
|
// Hidden configuration for this mixin. `etcd_selector` is a PromQL label
// matcher that the alert expressions below interpolate via
// `%(etcd_selector)s`; the default matches any job whose name contains "etcd".
_config+:: {
|
|
etcd_selector: 'job=~".*etcd.*"',
|
|
},
|
|
|
|
prometheusAlerts+:: {
|
|
groups+: [
|
|
{
|
|
name: 'etcd',
|
|
rules: [
|
|
{
  // Critical: at least one etcd member appears to be down for 3 minutes.
  // Per job, takes the max over (a) the number of scrape targets whose `up`
  // is 0 and (b) the number of peers (`To`) with a send-failure rate above
  // 0.01/s over 3m, so a member that has no `up` series can still be caught.
  alert: 'etcdMembersDown',
  expr: |||
    max by (job) (
    sum by (job) (up{%(etcd_selector)s} == bool 0)
    or
    count by (job,endpoint) (
    sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[3m])) > 0.01
    )
    )
    > 0
  ||| % $._config,
  'for': '3m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
  },
  labels: { severity: 'critical' },
},
|
|
{
  // Critical: the number of targets with `up == 1` is below (N + 1) / 2,
  // where N is the number of `up` series for the job, i.e. fewer members
  // are up than a majority requires. Must hold for 3 minutes.
  alert: 'etcdInsufficientMembers',
  expr: |||
    sum(up{%(etcd_selector)s} == bool 1) by (job) < ((count(up{%(etcd_selector)s}) by (job) + 1) / 2)
  ||| % $._config,
  'for': '3m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).',
  },
  labels: { severity: 'critical' },
},
|
|
{
  // Critical: a member has reported `etcd_server_has_leader == 0`
  // continuously for 1 minute.
  alert: 'etcdNoLeader',
  expr: |||
    etcd_server_has_leader{%(etcd_selector)s} == 0
  ||| % $._config,
  'for': '1m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.',
  },
  labels: { severity: 'critical' },
},
|
|
{
  // Warning: a member has seen more than 3 leader changes in a 15-minute
  // window, sustained for 15 minutes.
  // `increase` is used (not `rate`) so that the threshold and the reported
  // `$value` are a count of leader changes rather than a per-second rate.
  alert: 'etcdHighNumberOfLeaderChanges',
  expr: |||
    increase(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[15m]) > 3
  ||| % $._config,
  'for': '15m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    // `$value` is the increase over the 15m range used in the expression.
    message: 'etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last 15 minutes.',
  },
},
|
|
{
  // Warning: more than 1% of gRPC requests (per job, instance, service and
  // method, measured over 5m) completed with a code other than "OK",
  // sustained for 10 minutes.
  alert: 'etcdHighNumberOfFailedGRPCRequests',
  expr: |||
    100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
    /
    sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
    > 1
  ||| % $._config,
  'for': '10m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
  },
  labels: { severity: 'warning' },
},
|
|
{
  // Critical: more than 5% of gRPC requests (per job, instance, service and
  // method, measured over 5m) completed with a code other than "OK",
  // sustained for 5 minutes.
  alert: 'etcdHighNumberOfFailedGRPCRequests',
  expr: |||
    100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
    /
    sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
    > 5
  ||| % $._config,
  'for': '5m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
  },
  labels: { severity: 'critical' },
},
|
|
{
  // Critical: the 99th percentile handling time of unary gRPC requests
  // (per job, instance, service and method, over 5m) stays above 0.15s
  // for 10 minutes.
  alert: 'etcdGRPCRequestsSlow',
  expr: |||
    histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
    > 0.15
  ||| % $._config,
  'for': '10m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
  },
  labels: { severity: 'critical' },
},
|
|
{
  // Warning: the 99th percentile of the peer round-trip time histogram
  // (over 5m) stays above 0.15s for 10 minutes.
  alert: 'etcdMemberCommunicationSlow',
  expr: |||
    histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m]))
    > 0.15
  ||| % $._config,
  'for': '10m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
  },
  labels: { severity: 'warning' },
},
|
|
{
  // Warning: a member has recorded more than 5 failed proposals in a
  // 15-minute window, sustained for 15 minutes.
  // `increase` is used (not `rate`) so that the threshold and the reported
  // `$value` are a count of failures rather than a per-second rate.
  alert: 'etcdHighNumberOfFailedProposals',
  expr: |||
    increase(etcd_server_proposals_failed_total{%(etcd_selector)s}[15m]) > 5
  ||| % $._config,
  'for': '15m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    // `$value` is the increase over the 15m range used in the expression.
    message: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 15 minutes on etcd instance {{ $labels.instance }}.',
  },
},
|
|
{
  // Warning: the 99th percentile of the WAL fsync duration histogram
  // (over 5m) stays above 0.5s for 10 minutes.
  alert: 'etcdHighFsyncDurations',
  expr: |||
    histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
    > 0.5
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    // Spelling corrected: "fync" -> "fsync".
    message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
  },
},
|
|
{
  // Warning: the 99th percentile of the backend commit duration histogram
  // (over 5m) stays above 0.25s for 10 minutes.
  alert: 'etcdHighCommitDurations',
  expr: |||
    histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m]))
    > 0.25
  ||| % $._config,
  'for': '10m',
  annotations: {
    message: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.',
  },
  labels: { severity: 'warning' },
},
|
|
{
  // Warning: more than 1% of HTTP requests (excluding 404s from the failure
  // count), per job, instance and method over 5m, failed for 10 minutes.
  // The ratio is multiplied by 100 so `$value` really is a percentage, and
  // `instance` is kept in the grouping because the message refers to it.
  alert: 'etcdHighNumberOfFailedHTTPRequests',
  expr: |||
    100 * sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (job, instance, method)
      /
    sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
      > 1
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
  },
},
|
|
{
  // Critical: more than 5% of HTTP requests (excluding 404s from the failure
  // count), per job, instance and method over 5m, failed for 10 minutes.
  // The ratio is multiplied by 100 so `$value` really is a percentage, and
  // `instance` is kept in the grouping because the message refers to it.
  alert: 'etcdHighNumberOfFailedHTTPRequests',
  expr: |||
    100 * sum(rate(etcd_http_failed_total{%(etcd_selector)s, code!="404"}[5m])) BY (job, instance, method)
      /
    sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
      > 5
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'critical',
  },
  annotations: {
    message: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
  },
},
|
|
{
  // Warning: the 99th percentile of the successful-HTTP-request duration
  // histogram (over 5m) stays above 0.15s for 10 minutes.
  // The metric is restricted with `etcd_selector`; without it the rule would
  // match this histogram from any job.
  alert: 'etcdHTTPRequestsSlow',
  expr: |||
    histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket{%(etcd_selector)s}[5m]))
    > 0.15
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    message: 'etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow.',
  },
},
|
|
],
|
|
},
|
|
],
|
|
},
|
|
|
|
grafanaDashboards+:: {
|
|
'etcd.json': {
|
|
id: 6,
|
|
title: 'etcd',
|
|
description: 'etcd sample Grafana dashboard with Prometheus',
|
|
tags: [],
|
|
style: 'dark',
|
|
timezone: 'browser',
|
|
editable: true,
|
|
hideControls: false,
|
|
sharedCrosshair: false,
|
|
rows: [
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
|
|
cacheTimeout: null,
|
|
colorBackground: false,
|
|
colorValue: false,
|
|
colors: [
|
|
'rgba(245, 54, 54, 0.9)',
|
|
'rgba(237, 129, 40, 0.89)',
|
|
'rgba(50, 172, 45, 0.97)',
|
|
],
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
format: 'none',
|
|
gauge: {
|
|
maxValue: 100,
|
|
minValue: 0,
|
|
show: false,
|
|
thresholdLabels: false,
|
|
thresholdMarkers: true,
|
|
},
|
|
id: 28,
|
|
interval: null,
|
|
isNew: true,
|
|
links: [],
|
|
mappingType: 1,
|
|
mappingTypes: [
|
|
{
|
|
name: 'value to text',
|
|
value: 1,
|
|
},
|
|
{
|
|
name: 'range to text',
|
|
value: 2,
|
|
},
|
|
],
|
|
maxDataPoints: 100,
|
|
nullPointMode: 'connected',
|
|
nullText: null,
|
|
postfix: '',
|
|
postfixFontSize: '50%',
|
|
prefix: '',
|
|
prefixFontSize: '50%',
|
|
rangeMaps: [{
|
|
from: 'null',
|
|
text: 'N/A',
|
|
to: 'null',
|
|
}],
|
|
span: 3,
|
|
sparkline: {
|
|
fillColor: 'rgba(31, 118, 189, 0.18)',
|
|
full: false,
|
|
lineColor: 'rgb(31, 120, 193)',
|
|
show: false,
|
|
},
|
|
targets: [{
|
|
expr: 'sum(etcd_server_has_leader{job="$cluster"})',
|
|
intervalFactor: 2,
|
|
legendFormat: '',
|
|
metric: 'etcd_server_has_leader',
|
|
refId: 'A',
|
|
step: 20,
|
|
}],
|
|
thresholds: '',
|
|
title: 'Up',
|
|
type: 'singlestat',
|
|
valueFontSize: '200%',
|
|
valueMaps: [{
|
|
op: '=',
|
|
text: 'N/A',
|
|
value: 'null',
|
|
}],
|
|
valueName: 'avg',
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
id: 23,
|
|
isNew: true,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 5,
|
|
stack: false,
|
|
steppedLine: false,
|
|
targets: [
|
|
{
|
|
expr: 'sum(rate(grpc_server_started_total{job="$cluster",grpc_type="unary"}[5m]))',
|
|
format: 'time_series',
|
|
intervalFactor: 2,
|
|
legendFormat: 'RPC Rate',
|
|
metric: 'grpc_server_started_total',
|
|
refId: 'A',
|
|
step: 2,
|
|
},
|
|
{
|
|
expr: 'sum(rate(grpc_server_handled_total{job="$cluster",grpc_type="unary",grpc_code!="OK"}[5m]))',
|
|
format: 'time_series',
|
|
intervalFactor: 2,
|
|
legendFormat: 'RPC Failed Rate',
|
|
metric: 'grpc_server_handled_total',
|
|
refId: 'B',
|
|
step: 2,
|
|
},
|
|
],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'RPC Rate',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'ops',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
id: 41,
|
|
isNew: true,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 4,
|
|
stack: true,
|
|
steppedLine: false,
|
|
targets: [
|
|
{
|
|
expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})',
|
|
intervalFactor: 2,
|
|
legendFormat: 'Watch Streams',
|
|
metric: 'grpc_server_handled_total',
|
|
refId: 'A',
|
|
step: 4,
|
|
},
|
|
{
|
|
expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})',
|
|
intervalFactor: 2,
|
|
legendFormat: 'Lease Streams',
|
|
metric: 'grpc_server_handled_total',
|
|
refId: 'B',
|
|
step: 4,
|
|
},
|
|
],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Active Streams',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'short',
|
|
label: '',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
],
|
|
showTitle: false,
|
|
title: 'Row',
|
|
},
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
decimals: null,
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
grid: {},
|
|
id: 1,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 4,
|
|
stack: false,
|
|
steppedLine: false,
|
|
targets: [{
|
|
expr: 'etcd_debugging_mvcc_db_total_size_in_bytes{job="$cluster"}',
|
|
hide: false,
|
|
interval: '',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} DB Size',
|
|
metric: '',
|
|
refId: 'A',
|
|
step: 4,
|
|
}],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'DB Size',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'cumulative',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'bytes',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: false,
|
|
},
|
|
],
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
grid: {},
|
|
id: 3,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 1,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 4,
|
|
stack: false,
|
|
steppedLine: true,
|
|
targets: [
|
|
{
|
|
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))',
|
|
hide: false,
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} WAL fsync',
|
|
metric: 'etcd_disk_wal_fsync_duration_seconds_bucket',
|
|
refId: 'A',
|
|
step: 4,
|
|
},
|
|
{
|
|
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} DB fsync',
|
|
metric: 'etcd_disk_backend_commit_duration_seconds_bucket',
|
|
refId: 'B',
|
|
step: 4,
|
|
},
|
|
],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Disk Sync Duration',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'cumulative',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 's',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: false,
|
|
},
|
|
],
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
id: 29,
|
|
isNew: true,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 4,
|
|
stack: false,
|
|
steppedLine: false,
|
|
targets: [{
|
|
expr: 'process_resident_memory_bytes{job="$cluster"}',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} Resident Memory',
|
|
metric: 'process_resident_memory_bytes',
|
|
refId: 'A',
|
|
step: 4,
|
|
}],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Memory',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'bytes',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
],
|
|
title: 'New row',
|
|
},
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 5,
|
|
id: 22,
|
|
isNew: true,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 3,
|
|
stack: true,
|
|
steppedLine: false,
|
|
targets: [{
|
|
expr: 'rate(etcd_network_client_grpc_received_bytes_total{job="$cluster"}[5m])',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} Client Traffic In',
|
|
metric: 'etcd_network_client_grpc_received_bytes_total',
|
|
refId: 'A',
|
|
step: 4,
|
|
}],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Client Traffic In',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'Bps',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 5,
|
|
id: 21,
|
|
isNew: true,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 3,
|
|
stack: true,
|
|
steppedLine: false,
|
|
targets: [{
|
|
expr: 'rate(etcd_network_client_grpc_sent_bytes_total{job="$cluster"}[5m])',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} Client Traffic Out',
|
|
metric: 'etcd_network_client_grpc_sent_bytes_total',
|
|
refId: 'A',
|
|
step: 4,
|
|
}],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Client Traffic Out',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'Bps',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
id: 20,
|
|
isNew: true,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 3,
|
|
stack: false,
|
|
steppedLine: false,
|
|
targets: [{
|
|
expr: 'sum(rate(etcd_network_peer_received_bytes_total{job="$cluster"}[5m])) by (instance)',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} Peer Traffic In',
|
|
metric: 'etcd_network_peer_received_bytes_total',
|
|
refId: 'A',
|
|
step: 4,
|
|
}],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Peer Traffic In',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'Bps',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
decimals: null,
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
grid: {},
|
|
id: 16,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 3,
|
|
stack: false,
|
|
steppedLine: false,
|
|
targets: [{
|
|
expr: 'sum(rate(etcd_network_peer_sent_bytes_total{job="$cluster"}[5m])) by (instance)',
|
|
hide: false,
|
|
interval: '',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} Peer Traffic Out',
|
|
metric: 'etcd_network_peer_sent_bytes_total',
|
|
refId: 'A',
|
|
step: 4,
|
|
}],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Peer Traffic Out',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'cumulative',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'Bps',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
],
|
|
title: 'New row',
|
|
},
|
|
{
|
|
collapse: false,
|
|
editable: true,
|
|
height: '250px',
|
|
panels: [
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
id: 40,
|
|
isNew: true,
|
|
legend: {
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 6,
|
|
stack: false,
|
|
steppedLine: false,
|
|
targets: [
|
|
{
|
|
expr: 'sum(rate(etcd_server_proposals_failed_total{job="$cluster"}[5m]))',
|
|
intervalFactor: 2,
|
|
legendFormat: 'Proposal Failure Rate',
|
|
metric: 'etcd_server_proposals_failed_total',
|
|
refId: 'A',
|
|
step: 2,
|
|
},
|
|
{
|
|
expr: 'sum(etcd_server_proposals_pending{job="$cluster"})',
|
|
intervalFactor: 2,
|
|
legendFormat: 'Proposal Pending Total',
|
|
metric: 'etcd_server_proposals_pending',
|
|
refId: 'B',
|
|
step: 2,
|
|
},
|
|
{
|
|
expr: 'sum(rate(etcd_server_proposals_committed_total{job="$cluster"}[5m]))',
|
|
intervalFactor: 2,
|
|
legendFormat: 'Proposal Commit Rate',
|
|
metric: 'etcd_server_proposals_committed_total',
|
|
refId: 'C',
|
|
step: 2,
|
|
},
|
|
{
|
|
expr: 'sum(rate(etcd_server_proposals_applied_total{job="$cluster"}[5m]))',
|
|
intervalFactor: 2,
|
|
legendFormat: 'Proposal Apply Rate',
|
|
refId: 'D',
|
|
step: 2,
|
|
},
|
|
],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Raft Proposals',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'short',
|
|
label: '',
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
{
|
|
aliasColors: {},
|
|
bars: false,
|
|
datasource: '$datasource',
|
|
decimals: 0,
|
|
editable: true,
|
|
'error': false,
|
|
fill: 0,
|
|
id: 19,
|
|
isNew: true,
|
|
legend: {
|
|
alignAsTable: false,
|
|
avg: false,
|
|
current: false,
|
|
max: false,
|
|
min: false,
|
|
rightSide: false,
|
|
show: false,
|
|
total: false,
|
|
values: false,
|
|
},
|
|
lines: true,
|
|
linewidth: 2,
|
|
links: [],
|
|
nullPointMode: 'connected',
|
|
percentage: false,
|
|
pointradius: 5,
|
|
points: false,
|
|
renderer: 'flot',
|
|
seriesOverrides: [],
|
|
span: 6,
|
|
stack: false,
|
|
steppedLine: false,
|
|
targets: [{
|
|
expr: 'changes(etcd_server_leader_changes_seen_total{job="$cluster"}[1d])',
|
|
intervalFactor: 2,
|
|
legendFormat: '{{instance}} Total Leader Elections Per Day',
|
|
metric: 'etcd_server_leader_changes_seen_total',
|
|
refId: 'A',
|
|
step: 2,
|
|
}],
|
|
thresholds: [],
|
|
timeFrom: null,
|
|
timeShift: null,
|
|
title: 'Total Leader Elections Per Day',
|
|
tooltip: {
|
|
msResolution: false,
|
|
shared: true,
|
|
sort: 0,
|
|
value_type: 'individual',
|
|
},
|
|
type: 'graph',
|
|
xaxis: {
|
|
mode: 'time',
|
|
name: null,
|
|
show: true,
|
|
values: [],
|
|
},
|
|
yaxes: [
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
{
|
|
format: 'short',
|
|
label: null,
|
|
logBase: 1,
|
|
max: null,
|
|
min: null,
|
|
show: true,
|
|
},
|
|
],
|
|
},
|
|
],
|
|
title: 'New row',
|
|
},
|
|
],
|
|
time: {
|
|
from: 'now-15m',
|
|
to: 'now',
|
|
},
|
|
timepicker: {
|
|
now: true,
|
|
refresh_intervals: [
|
|
'5s',
|
|
'10s',
|
|
'30s',
|
|
'1m',
|
|
'5m',
|
|
'15m',
|
|
'30m',
|
|
'1h',
|
|
'2h',
|
|
'1d',
|
|
],
|
|
time_options: [
|
|
'5m',
|
|
'15m',
|
|
'1h',
|
|
'6h',
|
|
'12h',
|
|
'24h',
|
|
'2d',
|
|
'7d',
|
|
'30d',
|
|
],
|
|
},
|
|
templating: {
|
|
list: [
|
|
{
|
|
current: {
|
|
text: 'Prometheus',
|
|
value: 'Prometheus',
|
|
},
|
|
hide: 0,
|
|
label: null,
|
|
name: 'datasource',
|
|
options: [],
|
|
query: 'prometheus',
|
|
refresh: 1,
|
|
regex: '',
|
|
type: 'datasource',
|
|
},
|
|
{
|
|
allValue: null,
|
|
current: {
|
|
text: 'prod',
|
|
value: 'prod',
|
|
},
|
|
datasource: '$datasource',
|
|
hide: 0,
|
|
includeAll: false,
|
|
label: 'cluster',
|
|
multi: false,
|
|
name: 'cluster',
|
|
options: [],
|
|
query: 'label_values(etcd_server_has_leader, job)',
|
|
refresh: 1,
|
|
regex: '',
|
|
sort: 2,
|
|
tagValuesQuery: '',
|
|
tags: [],
|
|
tagsQuery: '',
|
|
type: 'query',
|
|
useTags: false,
|
|
},
|
|
],
|
|
},
|
|
annotations: {
|
|
list: [],
|
|
},
|
|
refresh: '10s',
|
|
schemaVersion: 13,
|
|
version: 215,
|
|
links: [],
|
|
gnetId: null,
|
|
},
|
|
},
|
|
}
|