diff --git a/contrib/mixin/README.md b/contrib/mixin/README.md new file mode 100644 index 000000000..224066f45 --- /dev/null +++ b/contrib/mixin/README.md @@ -0,0 +1,25 @@ +# Prometheus Monitoring Mixin for etcd + +> NOTE: This project is in the *alpha* stage. Flags, configuration, behaviour and design may change significantly in future releases. + +A set of customisable Prometheus alerts and a Grafana dashboard for etcd. + +Instructions for use are the same as the [kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin). + +## Background + +* For more information about monitoring mixins, see this [design doc](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit#). + +## Testing alerts + +Make sure to have [jsonnet](https://jsonnet.org/) and [gojsontoyaml](https://github.com/brancz/gojsontoyaml) installed. + +First compile the mixin to a YAML file, which promtool will read: +``` +jsonnet -e '(import "mixin.libsonnet").prometheusAlerts' | gojsontoyaml > mixin.yaml +``` + +Then run the unit test: +``` +promtool test rules test.yaml +``` diff --git a/contrib/mixin/mixin.libsonnet b/contrib/mixin/mixin.libsonnet new file mode 100644 index 000000000..ac087934a --- /dev/null +++ b/contrib/mixin/mixin.libsonnet @@ -0,0 +1,1311 @@ +{ + _config+:: { + etcd_selector: 'job=~".*etcd.*"', + // etcd_instance_labels are the label names that uniquely + // identify an instance and need to be aggregated away for alerts + // that are about an etcd cluster as a whole. For example, if etcd + // instances are deployed on K8s, you will likely want to change + // this to 'instance, pod'. + etcd_instance_labels: 'instance', + // scrape_interval_seconds is the global scrape interval which can be + // used to dynamically adjust rate windows as a function of the interval. 
+ scrape_interval_seconds: 30, + }, + + prometheusAlerts+:: { + groups+: [ + { + name: 'etcd', + rules: [ + { + alert: 'etcdMembersDown', + expr: ||| + max without (endpoint) ( + sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0) + or + count without (To) ( + sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01 + ) + ) + > 0 + ||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4}, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).', + summary: 'etcd cluster members are down.', + }, + }, + { + alert: 'etcdInsufficientMembers', + expr: ||| + sum(up{%(etcd_selector)s} == bool 1) without (%(etcd_instance_labels)s) < ((count(up{%(etcd_selector)s}) without (%(etcd_instance_labels)s) + 1) / 2) + ||| % $._config, + 'for': '3m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).', + summary: 'etcd cluster has insufficient number of members.', + }, + }, + { + alert: 'etcdNoLeader', + expr: ||| + etcd_server_has_leader{%(etcd_selector)s} == 0 + ||| % $._config, + 'for': '1m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.', + summary: 'etcd cluster has no leader.', + }, + }, + { + alert: 'etcdHighNumberOfLeaderChanges', + expr: ||| + increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": {{ 
$value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.', + summary: 'etcd cluster has high number of leader changes.', + }, + }, + { + alert: 'etcdHighNumberOfFailedGRPCRequests', + expr: ||| + 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code) + > 1 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster has high number of failed grpc requests.', + }, + }, + { + alert: 'etcdHighNumberOfFailedGRPCRequests', + expr: ||| + 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code) + > 5 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster has high number of failed grpc requests.', + }, + }, + { + alert: 'etcdGRPCRequestsSlow', + expr: ||| + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) without(grpc_type)) + > 0.15 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd grpc requests are slow', + }, + }, + { + 
alert: 'etcdMemberCommunicationSlow', + expr: ||| + histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m])) + > 0.15 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster member communication is slow.', + }, + }, + { + alert: 'etcdHighNumberOfFailedProposals', + expr: ||| + rate(etcd_server_proposals_failed_total{%(etcd_selector)s}[15m]) > 5 + ||| % $._config, + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { // $value is the per-second rate() over the 15m window used in expr. + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures per second over the last 15 minutes on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster has high number of proposal failures.', + }, + }, + { + alert: 'etcdHighFsyncDurations', + expr: ||| + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) + > 0.5 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster 99th percentile fsync durations are too high.', + }, + }, + { + alert: 'etcdHighFsyncDurations', + expr: ||| + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) + > 1 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { // Same 'description' key as the warning-severity rule with this alert name. + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', + }, + }, + { + alert: 'etcdHighCommitDurations', + expr: ||| + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m])) + > 0.25 + ||| % $._config, + 
'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', + summary: 'etcd cluster 99th percentile commit durations are too high.', + }, + }, + { + alert: 'etcdBackendQuotaLowSpace', // Both metrics are scoped by etcd_selector, like the other rules in this group. + expr: ||| + (etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}/etcd_server_quota_backend_bytes{%(etcd_selector)s})*100 > 95 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.', + }, + }, + { + alert: 'etcdExcessiveDatabaseGrowth', // The quota-usage ratio is a gauge, so use delta(); increase() would treat a drop in the ratio as a counter reset and count it as growth. + expr: ||| + delta(((etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}/etcd_server_quota_backend_bytes{%(etcd_selector)s})*100)[240m:1m]) > 50 + ||| % $._config, + 'for': '10m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.', + }, + }, + ], + }, + ], + }, + + grafanaDashboards+:: { + 'etcd.json': { + uid: std.md5('etcd.json'), + title: 'etcd', + description: 'etcd sample Grafana dashboard with Prometheus', + tags: [ 'etcd-mixin' ], + style: 'dark', + timezone: 'browser', + editable: true, + hideControls: false, + sharedCrosshair: false, + rows: [ + { + collapse: false, + editable: true, + height: '250px', + panels: [ + { + cacheTimeout: null, + colorBackground: false, + colorValue: false, + colors: [ + 'rgba(245, 54, 54, 0.9)', + 'rgba(237, 129, 40, 0.89)', + 'rgba(50, 172, 45, 0.97)', + ], + datasource: '$datasource', + editable: true, + 'error': false, + format: 'none', + gauge: { + maxValue: 100, + minValue: 0, + show: false, + thresholdLabels: false, + thresholdMarkers: true, + }, + 
id: 28, + interval: null, + isNew: true, + links: [], + mappingType: 1, + mappingTypes: [ + { + name: 'value to text', + value: 1, + }, + { + name: 'range to text', + value: 2, + }, + ], + maxDataPoints: 100, + nullPointMode: 'connected', + nullText: null, + postfix: '', + postfixFontSize: '50%', + prefix: '', + prefixFontSize: '50%', + rangeMaps: [{ + from: 'null', + text: 'N/A', + to: 'null', + }], + span: 3, + sparkline: { + fillColor: 'rgba(31, 118, 189, 0.18)', + full: false, + lineColor: 'rgb(31, 120, 193)', + show: false, + }, + targets: [{ + expr: 'sum(etcd_server_has_leader{job="$cluster"})', + intervalFactor: 2, + legendFormat: '', + metric: 'etcd_server_has_leader', + refId: 'A', + step: 20, + }], + thresholds: '', + title: 'Up', + type: 'singlestat', + valueFontSize: '200%', + valueMaps: [{ + op: '=', + text: 'N/A', + value: 'null', + }], + valueName: 'avg', + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 0, + id: 23, + isNew: true, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 5, + stack: false, + steppedLine: false, + targets: [ + { + expr: 'sum(rate(grpc_server_started_total{job="$cluster",grpc_type="unary"}[5m]))', + format: 'time_series', + intervalFactor: 2, + legendFormat: 'RPC Rate', + metric: 'grpc_server_started_total', + refId: 'A', + step: 2, + }, + { + expr: 'sum(rate(grpc_server_handled_total{job="$cluster",grpc_type="unary",grpc_code!="OK"}[5m]))', + format: 'time_series', + intervalFactor: 2, + legendFormat: 'RPC Failed Rate', + metric: 'grpc_server_handled_total', + refId: 'B', + step: 2, + }, + ], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'RPC Rate', + tooltip: { + msResolution: false, + 
shared: true, + sort: 0, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'ops', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + ], + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 0, + id: 41, + isNew: true, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 4, + stack: true, + steppedLine: false, + targets: [ + { + expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})', + intervalFactor: 2, + legendFormat: 'Watch Streams', + metric: 'grpc_server_handled_total', + refId: 'A', + step: 4, + }, + { + expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})', + intervalFactor: 2, + legendFormat: 'Lease Streams', + metric: 'grpc_server_handled_total', + refId: 'B', + step: 4, + }, + ], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Active Streams', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'short', + label: '', + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 1, + max: 
null, + min: null, + show: true, + }, + ], + }, + ], + showTitle: false, + title: 'Row', + }, + { + collapse: false, + editable: true, + height: '250px', + panels: [ + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + decimals: null, + editable: true, + 'error': false, + fill: 0, + grid: {}, + id: 1, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 4, + stack: false, + steppedLine: false, + targets: [{ + expr: 'etcd_mvcc_db_total_size_in_bytes{job="$cluster"}', + hide: false, + interval: '', + intervalFactor: 2, + legendFormat: '{{instance}} DB Size', + metric: '', + refId: 'A', + step: 4, + }], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'DB Size', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'cumulative', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'bytes', + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + logBase: 1, + max: null, + min: null, + show: false, + }, + ], + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 0, + grid: {}, + id: 3, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 1, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 4, + stack: false, + steppedLine: true, + targets: [ + { + expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))', + hide: false, + intervalFactor: 2, + 
legendFormat: '{{instance}} WAL fsync', + metric: 'etcd_disk_wal_fsync_duration_seconds_bucket', + refId: 'A', + step: 4, + }, + { + expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))', + intervalFactor: 2, + legendFormat: '{{instance}} DB fsync', + metric: 'etcd_disk_backend_commit_duration_seconds_bucket', + refId: 'B', + step: 4, + }, + ], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Disk Sync Duration', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'cumulative', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 's', + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + logBase: 1, + max: null, + min: null, + show: false, + }, + ], + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 0, + id: 29, + isNew: true, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 4, + stack: false, + steppedLine: false, + targets: [{ + expr: 'process_resident_memory_bytes{job="$cluster"}', + intervalFactor: 2, + legendFormat: '{{instance}} Resident Memory', + metric: 'process_resident_memory_bytes', + refId: 'A', + step: 4, + }], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Memory', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'bytes', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 
1, + max: null, + min: null, + show: true, + }, + ], + }, + ], + title: 'New row', + }, + { + collapse: false, + editable: true, + height: '250px', + panels: [ + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 5, + id: 22, + isNew: true, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 3, + stack: true, + steppedLine: false, + targets: [{ + expr: 'rate(etcd_network_client_grpc_received_bytes_total{job="$cluster"}[5m])', + intervalFactor: 2, + legendFormat: '{{instance}} Client Traffic In', + metric: 'etcd_network_client_grpc_received_bytes_total', + refId: 'A', + step: 4, + }], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Client Traffic In', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'Bps', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + ], + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 5, + id: 21, + isNew: true, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 3, + stack: true, + steppedLine: false, + targets: [{ + expr: 'rate(etcd_network_client_grpc_sent_bytes_total{job="$cluster"}[5m])', + intervalFactor: 2, + 
legendFormat: '{{instance}} Client Traffic Out', + metric: 'etcd_network_client_grpc_sent_bytes_total', + refId: 'A', + step: 4, + }], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Client Traffic Out', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'Bps', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + ], + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 0, + id: 20, + isNew: true, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 3, + stack: false, + steppedLine: false, + targets: [{ + expr: 'sum(rate(etcd_network_peer_received_bytes_total{job="$cluster"}[5m])) by (instance)', + intervalFactor: 2, + legendFormat: '{{instance}} Peer Traffic In', + metric: 'etcd_network_peer_received_bytes_total', + refId: 'A', + step: 4, + }], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Peer Traffic In', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'Bps', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + ], + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + decimals: null, + editable: true, + 'error': false, + fill: 0, + grid: {}, + 
id: 16, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 3, + stack: false, + steppedLine: false, + targets: [{ + expr: 'sum(rate(etcd_network_peer_sent_bytes_total{job="$cluster"}[5m])) by (instance)', + hide: false, + interval: '', + intervalFactor: 2, + legendFormat: '{{instance}} Peer Traffic Out', + metric: 'etcd_network_peer_sent_bytes_total', + refId: 'A', + step: 4, + }], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Peer Traffic Out', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'cumulative', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'Bps', + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + logBase: 1, + max: null, + min: null, + show: true, + }, + ], + }, + ], + title: 'New row', + }, + { + collapse: false, + editable: true, + height: '250px', + panels: [ + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + editable: true, + 'error': false, + fill: 0, + id: 40, + isNew: true, + legend: { + avg: false, + current: false, + max: false, + min: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 6, + stack: false, + steppedLine: false, + targets: [ + { + expr: 'sum(rate(etcd_server_proposals_failed_total{job="$cluster"}[5m]))', + intervalFactor: 2, + legendFormat: 'Proposal Failure Rate', + metric: 'etcd_server_proposals_failed_total', + refId: 'A', + step: 2, + }, + { + expr: 'sum(etcd_server_proposals_pending{job="$cluster"})', + intervalFactor: 
2, + legendFormat: 'Proposal Pending Total', + metric: 'etcd_server_proposals_pending', + refId: 'B', + step: 2, + }, + { + expr: 'sum(rate(etcd_server_proposals_committed_total{job="$cluster"}[5m]))', + intervalFactor: 2, + legendFormat: 'Proposal Commit Rate', + metric: 'etcd_server_proposals_committed_total', + refId: 'C', + step: 2, + }, + { + expr: 'sum(rate(etcd_server_proposals_applied_total{job="$cluster"}[5m]))', + intervalFactor: 2, + legendFormat: 'Proposal Apply Rate', + refId: 'D', + step: 2, + }, + ], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Raft Proposals', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'individual', + }, + type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'short', + label: '', + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + ], + }, + { + aliasColors: {}, + bars: false, + datasource: '$datasource', + decimals: 0, + editable: true, + 'error': false, + fill: 0, + id: 19, + isNew: true, + legend: { + alignAsTable: false, + avg: false, + current: false, + max: false, + min: false, + rightSide: false, + show: false, + total: false, + values: false, + }, + lines: true, + linewidth: 2, + links: [], + nullPointMode: 'connected', + percentage: false, + pointradius: 5, + points: false, + renderer: 'flot', + seriesOverrides: [], + span: 6, + stack: false, + steppedLine: false, + targets: [{ + expr: 'changes(etcd_server_leader_changes_seen_total{job="$cluster"}[1d])', + intervalFactor: 2, + legendFormat: '{{instance}} Total Leader Elections Per Day', + metric: 'etcd_server_leader_changes_seen_total', + refId: 'A', + step: 2, + }], + thresholds: [], + timeFrom: null, + timeShift: null, + title: 'Total Leader Elections Per Day', + tooltip: { + msResolution: false, + shared: true, + sort: 0, + value_type: 'individual', + }, 
+ type: 'graph', + xaxis: { + mode: 'time', + name: null, + show: true, + values: [], + }, + yaxes: [ + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + { + format: 'short', + label: null, + logBase: 1, + max: null, + min: null, + show: true, + }, + ], + }, + ], + title: 'New row', + }, + ], + time: { + from: 'now-15m', + to: 'now', + }, + timepicker: { + now: true, + refresh_intervals: [ + '5s', + '10s', + '30s', + '1m', + '5m', + '15m', + '30m', + '1h', + '2h', + '1d', + ], + time_options: [ + '5m', + '15m', + '1h', + '6h', + '12h', + '24h', + '2d', + '7d', + '30d', + ], + }, + templating: { + list: [ + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + { + allValue: null, + current: { + text: 'prod', + value: 'prod', + }, + datasource: '$datasource', + hide: 0, + includeAll: false, + label: 'cluster', + multi: false, + name: 'cluster', + options: [], + query: 'label_values(etcd_server_has_leader, job)', + refresh: 1, + regex: '', + sort: 2, + tagValuesQuery: '', + tags: [], + tagsQuery: '', + type: 'query', + useTags: false, + }, + ], + }, + annotations: { + list: [], + }, + refresh: '10s', + schemaVersion: 13, + version: 215, + links: [], + gnetId: null, + }, + }, +} diff --git a/contrib/mixin/test.yaml b/contrib/mixin/test.yaml new file mode 100644 index 000000000..24162bd4d --- /dev/null +++ b/contrib/mixin/test.yaml @@ -0,0 +1,135 @@ +rule_files: + - mixin.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: 'up{job="etcd",instance="10.10.10.0"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' + - series: 'up{job="etcd",instance="10.10.10.1"}' + values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' + - series: 'up{job="etcd",instance="10.10.10.2"}' + values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 3m + alertname: 
etcdInsufficientMembers + - eval_time: 5m + alertname: etcdInsufficientMembers + - eval_time: 12m + alertname: etcdMembersDown + - eval_time: 14m + alertname: etcdMembersDown + exp_alerts: + - exp_labels: + job: etcd + severity: critical + exp_annotations: + description: 'etcd cluster "etcd": members are down (3).' + summary: 'etcd cluster members are down.' + - eval_time: 7m + alertname: etcdInsufficientMembers + - eval_time: 11m + alertname: etcdInsufficientMembers + exp_alerts: + - exp_labels: + job: etcd + severity: critical + exp_annotations: + description: 'etcd cluster "etcd": insufficient members (1).' + summary: 'etcd cluster has insufficient number of members.' + - eval_time: 15m + alertname: etcdInsufficientMembers + exp_alerts: + - exp_labels: + job: etcd + severity: critical + exp_annotations: + description: 'etcd cluster "etcd": insufficient members (0).' + summary: 'etcd cluster has insufficient number of members.' + + - interval: 1m + input_series: + - series: 'up{job="etcd",instance="10.10.10.0"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' + - series: 'up{job="etcd",instance="10.10.10.1"}' + values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'up{job="etcd",instance="10.10.10.2"}' + values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 14m + alertname: etcdMembersDown + exp_alerts: + - exp_labels: + job: etcd + severity: critical + exp_annotations: + description: 'etcd cluster "etcd": members are down (3).' + summary: 'etcd cluster members are down.' 
+ + - interval: 1m + input_series: + - series: 'up{job="etcd",instance="10.10.10.0"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' + - series: 'up{job="etcd",instance="10.10.10.1"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' + - series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}' + values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18' + alert_rule_test: + - eval_time: 13m + alertname: etcdMembersDown + exp_alerts: + - exp_labels: + job: etcd + severity: critical + exp_annotations: + description: 'etcd cluster "etcd": members are down (1).' + summary: 'etcd cluster members are down.' + - interval: 1m + input_series: + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' + values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' + values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdHighNumberOfLeaderChanges + exp_alerts: + - exp_labels: + job: etcd + severity: warning + exp_annotations: + description: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + summary: 'etcd cluster has high number of leader changes.' 
+ - interval: 1m + input_series: + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' + values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' + values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' + values: '0 0 0 0 0 0 0 0' + alert_rule_test: + - eval_time: 10m + alertname: etcdHighNumberOfLeaderChanges + exp_alerts: + - interval: 1m + input_series: + # Quota usage grows by 3 percentage points per minute, exceeds the 50-point threshold from 17m on, and the alert fires once its 10m 'for' has elapsed (27m). + - series: 'etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.0"}' + values: '0+3x30' + - series: 'etcd_server_quota_backend_bytes{job="etcd",instance="10.10.10.0"}' + values: '100+0x30' + alert_rule_test: + - eval_time: 30m + alertname: etcdExcessiveDatabaseGrowth + exp_alerts: + - exp_labels: + job: etcd + instance: 10.10.10.0 + severity: warning + exp_annotations: + message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance 10.10.10.0, please check as it might be disruptive.'