{
  // Tunables shared by the alert rules and the Grafana dashboard defined below.
  _config+:: {
    // etcd_selector is the PromQL label matcher that the alert expressions
    // splice in (as %(etcd_selector)s) to pick the etcd series.
    etcd_selector: 'job=~".*etcd.*"',

    // etcd_instance_labels are the label names that uniquely identify
    // an instance and need to be aggregated away for alerts that are
    // about an etcd cluster as a whole. For example, if etcd instances
    // are deployed on K8s, you will likely want to change this to
    // 'instance, pod'.
    etcd_instance_labels: 'instance',

    // scrape_interval_seconds is the global scrape interval which can be
    // used to dynamically adjust rate windows as a function of the interval.
    scrape_interval_seconds: 30,

    // Dashboard variable refresh option on Grafana (https://grafana.com/docs/grafana/latest/datasources/prometheus/).
    // 0 : Never (dashboard variable values are never refreshed)
    // 1 : On Dashboard Load (dashboard variables are refreshed when the dashboard is loaded)
    // 2 : On Time Range Change (dashboard variables are refreshed when the time range changes)
    dashboard_var_refresh: 2,

    // clusterLabel is the label used to identify a cluster. It is substituted
    // into the alert descriptions and into the dashboard queries.
    clusterLabel: 'job',
  },

  // Alerting rules; everything lives in one rule group named 'etcd'.
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'etcd',
        rules: [
          // Critical when, with the instance labels aggregated away, either the
          // sum of `up == bool 0` or the count of peers (label `To`) whose send
          // failure rate exceeds 0.01 is above zero for 10 minutes. The rate
          // window is four times scrape_interval_seconds.
          {
            alert: 'etcdMembersDown',
            expr: |||
              max without (endpoint) (
                sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0)
              or
                count without (To) (
                  sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01
                )
              )
              > 0
            ||| % { etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds * 4 },
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": members are down ({{ $value }}).'
% $._config.clusterLabel,
              summary: 'etcd cluster members are down.',
            },
          },
          // Critical when the number of members reporting up == 1 is below
          // (member count + 1) / 2 for 3 minutes, with the instance labels
          // aggregated away.
          {
            alert: 'etcdInsufficientMembers',
            expr: |||
              sum(up{%(etcd_selector)s} == bool 1) without (%(etcd_instance_labels)s) < ((count(up{%(etcd_selector)s}) without (%(etcd_instance_labels)s) + 1) / 2)
            ||| % $._config,
            'for': '3m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": insufficient members ({{ $value }}).' % $._config.clusterLabel,
              summary: 'etcd cluster has insufficient number of members.',
            },
          },
          // Critical when a member reports etcd_server_has_leader == 0 for 1 minute.
          {
            alert: 'etcdNoLeader',
            expr: |||
              etcd_server_has_leader{%(etcd_selector)s} == 0
            ||| % $._config,
            'for': '1m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": member {{ $labels.instance }} has no leader.' % $._config.clusterLabel,
              summary: 'etcd cluster has no leader.',
            },
          },
          // Warns when the per-cluster maximum of the leader-change counter
          // increased by 4 or more over a 15m subquery (1m resolution), held
          // for 5 minutes. The `or 0*absent(...)` arm supplies a zero sample
          // when the metric is absent.
          {
            alert: 'etcdHighNumberOfLeaderChanges',
            expr: |||
              increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
% $._config.clusterLabel,
              summary: 'etcd cluster has high number of leader changes.',
            },
          },
          // Warning tier: more than 1% of handled gRPC requests finished with
          // one of the listed error codes over 5m, held for 10 minutes.
          // `%%` in the description is a literal percent sign after formatting.
          {
            alert: 'etcdHighNumberOfFailedGRPCRequests',
            expr: |||
              100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
                /
              sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
                > 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
              summary: 'etcd cluster has high number of failed grpc requests.',
            },
          },
          // Critical tier of the same alert: threshold 5%, held for 5 minutes.
          {
            alert: 'etcdHighNumberOfFailedGRPCRequests',
            expr: |||
              100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
                /
              sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code)
                > 5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": {{ $value }}%% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
              summary: 'etcd cluster has high number of failed grpc requests.',
            },
          },
          // Critical when the 99th percentile handling time of unary gRPC
          // requests (Defragment excluded) stays above 0.15s for 10 minutes.
          {
            alert: 'etcdGRPCRequestsSlow',
            expr: |||
              histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
              > 0.15
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.'
% $._config.clusterLabel,
              summary: 'etcd grpc requests are slow',
            },
          },
          // Warns when the 99th percentile peer round trip time stays above
          // 0.15s for 10 minutes.
          {
            alert: 'etcdMemberCommunicationSlow',
            expr: |||
              histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m]))
              > 0.15
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
              summary: 'etcd cluster member communication is slow.',
            },
          },
          // Warns when the failed-proposal rate over a 15m window stays above
          // 5 for 15 minutes. The description now matches the expression: the
          // window is 15 minutes (it previously said 30), and {{ $value }} is
          // the per-second average that rate() returns, not a count.
          {
            alert: 'etcdHighNumberOfFailedProposals',
            expr: |||
              rate(etcd_server_proposals_failed_total{%(etcd_selector)s}[15m]) > 5
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} proposal failures per second over the last 15 minutes on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
              summary: 'etcd cluster has high number of proposal failures.',
            },
          },
          // Warning tier: 99th percentile WAL fsync duration above 0.5s for
          // 10 minutes.
          {
            alert: 'etcdHighFsyncDurations',
            expr: |||
              histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
              > 0.5
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
              summary: 'etcd cluster 99th percentile fsync durations are too high.',
            },
          },
          // Critical tier of the same alert: threshold 1s.
          {
            alert: 'etcdHighFsyncDurations',
            expr: |||
              histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
              > 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
% $._config.clusterLabel,
              summary: 'etcd cluster 99th percentile fsync durations are too high.',
            },
          },
          // Warns when the 99th percentile backend commit duration stays above
          // 0.25s for 10 minutes. The description gained its missing verb so
          // it reads like the fsync alert's description.
          {
            alert: 'etcdHighCommitDurations',
            expr: |||
              histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m]))
              > 0.25
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
              summary: 'etcd cluster 99th percentile commit durations are too high.',
            },
          },
          // Critical when the database size is above 95% of the backend quota
          // for 10 minutes. Both metrics now carry %(etcd_selector)s: the
          // expression was already formatted with $._config but had no
          // placeholder, so the configured selector was never applied.
          {
            alert: 'etcdDatabaseQuotaLowSpace',
            expr: |||
              (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 95
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel,
              summary: 'etcd cluster database is running full.',
            },
          },
          // Warns when a linear prediction over the last 4h of database size
          // exceeds the backend quota 4h ahead. %(etcd_selector)s is applied
          // to both metrics for the same reason as in the alert above.
          {
            alert: 'etcdExcessiveDatabaseGrowth',
            expr: |||
              predict_linear(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[4h], 4*60*60) > etcd_server_quota_backend_bytes{%(etcd_selector)s}
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.'
% $._config.clusterLabel,
              summary: 'etcd cluster database growing very fast.',
            },
          },
          // Warns when the in-use database size is below 50% of the allocated
          // size for 10 minutes. Both metrics now carry %(etcd_selector)s: the
          // expression was already formatted with $._config but had no
          // placeholder, so the configured selector was never applied.
          {
            alert: 'etcdDatabaseHighFragmentationRatio',
            expr: |||
              (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m])) < 0.5
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              description: 'etcd cluster "{{ $labels.%s }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.' % $._config.clusterLabel,
              // Plain string (not passed through %), so the bare '%' is literal.
              summary: 'etcd database size in use is less than 50% of the actual allocated storage.',
              runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation',
            },
          },
        ],
      },
    ],
  },

  // Grafana dashboard definition, keyed by its output file name.
  grafanaDashboards+:: {
    'etcd.json': {
      // The uid is the MD5 of the file name, so it is the same on every build.
      uid: std.md5('etcd.json'),
      title: 'etcd',
      description: 'etcd sample Grafana dashboard with Prometheus',
      tags: ['etcd-mixin'],
      style: 'dark',
      timezone: 'browser',
      editable: true,
      hideControls: false,
      sharedCrosshair: false,
      rows: [
        {
          collapse: false,
          editable: true,
          height: '250px',
          panels: [
            // 'Up' singlestat: sum of etcd_server_has_leader for the selected
            // cluster.
            {
              cacheTimeout: null,
              colorBackground: false,
              colorValue: false,
              colors: [
                'rgba(245, 54, 54, 0.9)',
                'rgba(237, 129, 40, 0.89)',
                'rgba(50, 172, 45, 0.97)',
              ],
              datasource: '$datasource',
              editable: true,
              'error': false,
              format: 'none',
              gauge: {
                maxValue: 100,
                minValue: 0,
                show: false,
                thresholdLabels: false,
                thresholdMarkers: true,
              },
              id: 28,
              interval: null,
              isNew: true,
              links: [],
              mappingType: 1,
              mappingTypes: [
                { name: 'value to text', value: 1, },
                { name: 'range to text', value: 2, },
              ],
              maxDataPoints: 100,
              nullPointMode: 'connected',
              nullText: null,
              postfix: '',
              postfixFontSize: '50%',
              prefix: '',
              prefixFontSize: '50%',
              rangeMaps: [{ from: 'null', text: 'N/A', to: 'null', }],
              span: 3,
              sparkline: {
                fillColor: 'rgba(31, 118, 189, 0.18)',
                full: false,
                lineColor: 'rgb(31, 120, 193)',
                show: false,
              },
              targets: [{
                expr:
'sum(etcd_server_has_leader{%s="$cluster"})' % $._config.clusterLabel,
                intervalFactor: 2,
                legendFormat: '',
                metric: 'etcd_server_has_leader',
                refId: 'A',
                step: 20,
              }],
              thresholds: '',
              title: 'Up',
              type: 'singlestat',
              valueFontSize: '200%',
              valueMaps: [{ op: '=', text: 'N/A', value: 'null', }],
              valueName: 'avg',
            },
            // 'RPC Rate' graph: rate of unary RPCs started (series A) next to
            // the rate of unary RPCs handled with one of the listed error
            // codes (series B), both summed over the selected cluster.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 0,
              id: 23,
              isNew: true,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 5,
              stack: false,
              steppedLine: false,
              targets: [
                {
                  expr: 'sum(rate(grpc_server_started_total{%s="$cluster",grpc_type="unary"}[$__rate_interval]))' % $._config.clusterLabel,
                  format: 'time_series',
                  intervalFactor: 2,
                  legendFormat: 'RPC Rate',
                  metric: 'grpc_server_started_total',
                  refId: 'A',
                  step: 2,
                },
                {
                  expr: 'sum(rate(grpc_server_handled_total{%s="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[$__rate_interval]))' % $._config.clusterLabel,
                  format: 'time_series',
                  intervalFactor: 2,
                  legendFormat: 'RPC Failed Rate',
                  metric: 'grpc_server_handled_total',
                  refId: 'B',
                  step: 2,
                },
              ],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'RPC Rate',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'ops', label: null, logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
            // Next panel (id 41); its targets and title follow the legend.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 0,
              id: 41,
              isNew: true,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values:
false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 4,
              stack: true,
              steppedLine: false,
              // 'Active Streams': started minus handled bidi_stream RPCs, once
              // for the Watch service and once for the Lease service. These
              // two targets format with the whole $._config object via the
              // named %(clusterLabel)s placeholder.
              targets: [
                {
                  expr: 'sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})' % $._config,
                  intervalFactor: 2,
                  legendFormat: 'Watch Streams',
                  metric: 'grpc_server_handled_total',
                  refId: 'A',
                  step: 4,
                },
                {
                  expr: 'sum(grpc_server_started_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})' % $._config,
                  intervalFactor: 2,
                  legendFormat: 'Lease Streams',
                  metric: 'grpc_server_handled_total',
                  refId: 'B',
                  step: 4,
                },
              ],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Active Streams',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'short', label: '', logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
          ],
          showTitle: false,
          title: 'Row',
        },
        // Second row: DB size, disk sync duration and memory panels.
        {
          collapse: false,
          editable: true,
          height: '250px',
          panels: [
            // 'DB Size' graph: etcd_mvcc_db_total_size_in_bytes per instance.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              decimals: null,
              editable: true,
              'error': false,
              fill: 0,
              grid: {},
              id: 1,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 4,
              stack: false,
              steppedLine: false,
              targets: [{
                expr: 'etcd_mvcc_db_total_size_in_bytes{%s="$cluster"}' %
$._config.clusterLabel,
                hide: false,
                interval: '',
                intervalFactor: 2,
                legendFormat: '{{instance}} DB Size',
                metric: '',
                refId: 'A',
                step: 4,
              }],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'DB Size',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'cumulative', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'bytes', logBase: 1, max: null, min: null, show: true, },
                { format: 'short', logBase: 1, max: null, min: null, show: false, },
              ],
            },
            // 'Disk Sync Duration' graph: 99th percentile per instance of the
            // WAL fsync histogram (series A) and of the backend commit
            // histogram (series B, shown with the legend 'DB fsync').
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 0,
              grid: {},
              id: 3,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 1,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 4,
              stack: false,
              steppedLine: true,
              targets: [
                {
                  expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{%s="$cluster"}[$__rate_interval])) by (instance, le))' % $._config.clusterLabel,
                  hide: false,
                  intervalFactor: 2,
                  legendFormat: '{{instance}} WAL fsync',
                  metric: 'etcd_disk_wal_fsync_duration_seconds_bucket',
                  refId: 'A',
                  step: 4,
                },
                {
                  expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{%s="$cluster"}[$__rate_interval])) by (instance, le))' % $._config.clusterLabel,
                  intervalFactor: 2,
                  legendFormat: '{{instance}} DB fsync',
                  metric: 'etcd_disk_backend_commit_duration_seconds_bucket',
                  refId: 'B',
                  step: 4,
                },
              ],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Disk Sync Duration',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'cumulative', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 's', logBase: 1, max: null, min: null, show: true, },
                { format: 'short', logBase: 1, max: null, min: null, show: false, },
              ],
            },
            // Next panel; its fields continue below.
            {
              aliasColors: {},
bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 0,
              id: 29,
              isNew: true,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 4,
              stack: false,
              steppedLine: false,
              // 'Memory' graph: process_resident_memory_bytes per instance.
              targets: [{
                expr: 'process_resident_memory_bytes{%s="$cluster"}' % $._config.clusterLabel,
                intervalFactor: 2,
                legendFormat: '{{instance}} Resident Memory',
                metric: 'process_resident_memory_bytes',
                refId: 'A',
                step: 4,
              }],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Memory',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'bytes', label: null, logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
          ],
          title: 'New row',
        },
        // Third row: client and peer network traffic panels.
        {
          collapse: false,
          editable: true,
          height: '250px',
          panels: [
            // 'Client Traffic In' graph: per-series rate of bytes received
            // from gRPC clients, drawn stacked.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 5,
              id: 22,
              isNew: true,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 3,
              stack: true,
              steppedLine: false,
              targets: [{
                expr: 'rate(etcd_network_client_grpc_received_bytes_total{%s="$cluster"}[$__rate_interval])' % $._config.clusterLabel,
                intervalFactor: 2,
                legendFormat: '{{instance}} Client Traffic In',
                metric: 'etcd_network_client_grpc_received_bytes_total',
                refId: 'A',
                step: 4,
              }],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Client Traffic In',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'Bps', label: null, logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
            // 'Client Traffic Out' graph: per-series rate of bytes sent to
            // gRPC clients, drawn stacked.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 5,
              id: 21,
              isNew: true,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 3,
              stack: true,
              steppedLine: false,
              targets: [{
                expr: 'rate(etcd_network_client_grpc_sent_bytes_total{%s="$cluster"}[$__rate_interval])' % $._config.clusterLabel,
                intervalFactor: 2,
                legendFormat: '{{instance}} Client Traffic Out',
                metric: 'etcd_network_client_grpc_sent_bytes_total',
                refId: 'A',
                step: 4,
              }],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Client Traffic Out',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'Bps', label: null, logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
            // 'Peer Traffic In' graph: rate of bytes received from peers,
            // summed by instance.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 0,
              id: 20,
              isNew: true,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 3,
              stack: false,
              steppedLine: false,
              targets: [{
                expr: 'sum(rate(etcd_network_peer_received_bytes_total{%s="$cluster"}[$__rate_interval])) by (instance)' % $._config.clusterLabel,
                intervalFactor: 2,
                legendFormat: '{{instance}} Peer Traffic In',
                metric:
'etcd_network_peer_received_bytes_total',
                refId: 'A',
                step: 4,
              }],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Peer Traffic In',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'Bps', label: null, logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
            // 'Peer Traffic Out' graph: rate of bytes sent to peers, summed by
            // instance.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              decimals: null,
              editable: true,
              'error': false,
              fill: 0,
              grid: {},
              id: 16,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 3,
              stack: false,
              steppedLine: false,
              targets: [{
                expr: 'sum(rate(etcd_network_peer_sent_bytes_total{%s="$cluster"}[$__rate_interval])) by (instance)' % $._config.clusterLabel,
                hide: false,
                interval: '',
                intervalFactor: 2,
                legendFormat: '{{instance}} Peer Traffic Out',
                metric: 'etcd_network_peer_sent_bytes_total',
                refId: 'A',
                step: 4,
              }],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Peer Traffic Out',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'cumulative', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'Bps', logBase: 1, max: null, min: null, show: true, },
                { format: 'short', logBase: 1, max: null, min: null, show: true, },
              ],
            },
          ],
          title: 'New row',
        },
        // Fourth row: Raft proposals, leader elections and peer round trip time.
        {
          collapse: false,
          editable: true,
          height: '250px',
          panels: [
            // Panel id 40; its targets and title follow below.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              editable: true,
              'error': false,
              fill: 0,
              id: 40,
              isNew: true,
              legend: { avg: false, current: false, max: false, min: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage:
false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 6,
              stack: false,
              steppedLine: false,
              // 'Raft Proposals': cluster-wide sums of the proposal failure
              // rate (A), pending proposals (B), commit rate (C) and apply
              // rate (D). Target D is the only one without a `metric` field.
              targets: [
                {
                  expr: 'sum(rate(etcd_server_proposals_failed_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
                  intervalFactor: 2,
                  legendFormat: 'Proposal Failure Rate',
                  metric: 'etcd_server_proposals_failed_total',
                  refId: 'A',
                  step: 2,
                },
                {
                  expr: 'sum(etcd_server_proposals_pending{%s="$cluster"})' % $._config.clusterLabel,
                  intervalFactor: 2,
                  legendFormat: 'Proposal Pending Total',
                  metric: 'etcd_server_proposals_pending',
                  refId: 'B',
                  step: 2,
                },
                {
                  expr: 'sum(rate(etcd_server_proposals_committed_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
                  intervalFactor: 2,
                  legendFormat: 'Proposal Commit Rate',
                  metric: 'etcd_server_proposals_committed_total',
                  refId: 'C',
                  step: 2,
                },
                {
                  expr: 'sum(rate(etcd_server_proposals_applied_total{%s="$cluster"}[$__rate_interval]))' % $._config.clusterLabel,
                  intervalFactor: 2,
                  legendFormat: 'Proposal Apply Rate',
                  refId: 'D',
                  step: 2,
                },
              ],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Raft Proposals',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'short', label: '', logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
            // Panel id 19; its query and title follow below.
            {
              aliasColors: {},
              bars: false,
              datasource: '$datasource',
              decimals: 0,
              editable: true,
              'error': false,
              fill: 0,
              id: 19,
              isNew: true,
              legend: { alignAsTable: false, avg: false, current: false, max: false, min: false, rightSide: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              percentage: false,
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              span: 6,
              stack: false,
              steppedLine: false,
              targets: [{
                expr:
'changes(etcd_server_leader_changes_seen_total{%s="$cluster"}[1d])' % $._config.clusterLabel,
                intervalFactor: 2,
                legendFormat: '{{instance}} Total Leader Elections Per Day',
                metric: 'etcd_server_leader_changes_seen_total',
                refId: 'A',
                step: 2,
              }],
              thresholds: [],
              timeFrom: null,
              timeShift: null,
              title: 'Total Leader Elections Per Day',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { mode: 'time', name: null, show: true, values: [], },
              yaxes: [
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
                { format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
            },
            // 'Peer round trip time' graph: 99th percentile per instance of the
            // peer round trip time histogram. Unlike the other panels this one
            // carries gridPos, fieldConfig, options and pluginVersion fields
            // and no `span`.
            {
              aliasColors: {},
              bars: false,
              dashLength: 10,
              dashes: false,
              datasource: '$datasource',
              decimals: 0,
              editable: true,
              'error': false,
              fieldConfig: {
                defaults: { custom: {}, },
                overrides: [],
              },
              fill: 0,
              fillGradient: 0,
              gridPos: { h: 7, w: 12, x: 0, y: 28, },
              hiddenSeries: false,
              id: 42,
              isNew: true,
              legend: { alignAsTable: false, avg: false, current: false, max: false, min: false, rightSide: false, show: false, total: false, values: false, },
              lines: true,
              linewidth: 2,
              links: [],
              nullPointMode: 'connected',
              options: { alertThreshold: true, },
              percentage: false,
              pluginVersion: '7.4.3',
              pointradius: 5,
              points: false,
              renderer: 'flot',
              seriesOverrides: [],
              spaceLength: 10,
              stack: false,
              steppedLine: false,
              targets: [
                {
                  expr: 'histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{%s="$cluster"}[$__rate_interval])))' % $._config.clusterLabel,
                  interval: '',
                  intervalFactor: 2,
                  legendFormat: '{{instance}} Peer round trip time',
                  metric: 'etcd_network_peer_round_trip_time_seconds_bucket',
                  refId: 'A',
                  step: 2,
                },
              ],
              thresholds: [],
              timeFrom: null,
              timeRegions: [],
              timeShift: null,
              title: 'Peer round trip time',
              tooltip: { msResolution: false, shared: true, sort: 0, value_type: 'individual', },
              type: 'graph',
              xaxis: { buckets: null, mode: 'time', name: null, show:
true, values: [], },
              // NOTE(review): the '$$hashKey' entries look like UI-export
              // residue rather than intentional configuration - confirm
              // before removing them.
              yaxes: [
                { '$$hashKey': 'object:925', decimals: null, format: 's', label: null, logBase: 1, max: null, min: null, show: true, },
                { '$$hashKey': 'object:926', format: 'short', label: null, logBase: 1, max: null, min: null, show: true, },
              ],
              yaxis: { align: false, alignLevel: null, },
            },
          ],
          title: 'New row',
        },
      ],
      // Default time window and the options offered by the time picker.
      time: { from: 'now-15m', to: 'now', },
      timepicker: {
        now: true,
        refresh_intervals: [ '5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d', ],
        time_options: [ '5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d', ],
      },
      // Dashboard variables: `datasource` and `cluster`. Panels reference
      // them as '$datasource' and "$cluster".
      templating: {
        list: [
          {
            current: { text: 'Prometheus', value: 'Prometheus', },
            hide: 0,
            label: 'Data Source',
            name: 'datasource',
            options: [],
            query: 'prometheus',
            refresh: 1,
            regex: '',
            type: 'datasource',
          },
          {
            allValue: null,
            current: { text: 'prod', value: 'prod', },
            datasource: '$datasource',
            hide: 0,
            includeAll: false,
            label: 'cluster',
            multi: false,
            name: 'cluster',
            options: [],
            // Values come from the configured cluster label on
            // etcd_server_has_leader; the refresh mode is taken from
            // $._config.dashboard_var_refresh.
            query: 'label_values(etcd_server_has_leader, %s)' % $._config.clusterLabel,
            refresh: $._config.dashboard_var_refresh,
            regex: '',
            sort: 2,
            tagValuesQuery: '',
            tags: [],
            tagsQuery: '',
            type: 'query',
            useTags: false,
          },
        ],
      },
      annotations: { list: [], },
      refresh: '10s',
      schemaVersion: 13,
      version: 215,
      links: [],
      gnetId: null,
    },
  },
}