From 7fca120587c26eab0469acef7f21476f3da5e7fd Mon Sep 17 00:00:00 2001 From: Sahdev Zala Date: Thu, 18 Feb 2021 18:44:19 -0500 Subject: [PATCH] Remove mixin from doc The kube-prometheus team has made the necessary changes and is no longer using the mixin from doc. --- Documentation/etcd-mixin/README.md | 25 - Documentation/etcd-mixin/mixin.libsonnet | 1311 ---------------------- Documentation/etcd-mixin/test.yaml | 135 --- 3 files changed, 1471 deletions(-) delete mode 100644 Documentation/etcd-mixin/README.md delete mode 100644 Documentation/etcd-mixin/mixin.libsonnet delete mode 100644 Documentation/etcd-mixin/test.yaml diff --git a/Documentation/etcd-mixin/README.md b/Documentation/etcd-mixin/README.md deleted file mode 100644 index 224066f45..000000000 --- a/Documentation/etcd-mixin/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Prometheus Monitoring Mixin for etcd - -> NOTE: This project is *alpha* stage. Flags, configuration, behaviour and design may change significantly in following releases. - -A set of customisable Prometheus alerts for etcd. - -Instructions for use are the same as the [kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin). - -## Background - -* For more information about monitoring mixins, see this [design doc](https://docs.google.com/document/d/1A9xvzwqnFVSOZ5fD3blKODXfsat5fg6ZhnKu9LK3lB4/edit#). - -## Testing alerts - -Make sure to have [jsonnet](https://jsonnet.org/) and [gojsontoyaml](https://github.com/brancz/gojsontoyaml) installed. 
- -First compile the mixin to a YAML file, which the promtool will read: -``` -jsonnet -e '(import "mixin.libsonnet").prometheusAlerts' | gojsontoyaml > mixin.yaml -``` - -Then run the unit test: -``` -promtool test rules test.yaml -``` diff --git a/Documentation/etcd-mixin/mixin.libsonnet b/Documentation/etcd-mixin/mixin.libsonnet deleted file mode 100644 index ac087934a..000000000 --- a/Documentation/etcd-mixin/mixin.libsonnet +++ /dev/null @@ -1,1311 +0,0 @@ -{ - _config+:: { - etcd_selector: 'job=~".*etcd.*"', - // etcd_instance_labels are the label names that are uniquely - // identifying an instance and need to be aggreated away for alerts - // that are about an etcd cluster as a whole. For example, if etcd - // instances are deployed on K8s, you will likely want to change - // this to 'instance, pod'. - etcd_instance_labels: 'instance', - // scrape_interval_seconds is the global scrape interval which can be - // used to dynamically adjust rate windows as a function of the interval. 
- scrape_interval_seconds: 30, - }, - - prometheusAlerts+:: { - groups+: [ - { - name: 'etcd', - rules: [ - { - alert: 'etcdMembersDown', - expr: ||| - max without (endpoint) ( - sum without (%(etcd_instance_labels)s) (up{%(etcd_selector)s} == bool 0) - or - count without (To) ( - sum without (%(etcd_instance_labels)s) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[%(network_failure_range)ss])) > 0.01 - ) - ) - > 0 - ||| % {etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds*4}, - 'for': '10m', - labels: { - severity: 'critical', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).', - summary: 'etcd cluster members are down.', - }, - }, - { - alert: 'etcdInsufficientMembers', - expr: ||| - sum(up{%(etcd_selector)s} == bool 1) without (%(etcd_instance_labels)s) < ((count(up{%(etcd_selector)s}) without (%(etcd_instance_labels)s) + 1) / 2) - ||| % $._config, - 'for': '3m', - labels: { - severity: 'critical', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).', - summary: 'etcd cluster has insufficient number of members.', - }, - }, - { - alert: 'etcdNoLeader', - expr: ||| - etcd_server_has_leader{%(etcd_selector)s} == 0 - ||| % $._config, - 'for': '1m', - labels: { - severity: 'critical', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.', - summary: 'etcd cluster has no leader.', - }, - }, - { - alert: 'etcdHighNumberOfLeaderChanges', - expr: ||| - increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": {{ 
$value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.', - summary: 'etcd cluster has high number of leader changes.', - }, - }, - { - alert: 'etcdHighNumberOfFailedGRPCRequests', - expr: ||| - 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) - / - sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code) - > 1 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', - summary: 'etcd cluster has high number of failed grpc requests.', - }, - }, - { - alert: 'etcdHighNumberOfFailedGRPCRequests', - expr: ||| - 100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) - / - sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) without (grpc_type, grpc_code) - > 5 - ||| % $._config, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.', - summary: 'etcd cluster has high number of failed grpc requests.', - }, - }, - { - alert: 'etcdGRPCRequestsSlow', - expr: ||| - histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) without(grpc_type)) - > 0.15 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'critical', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.', - summary: 'etcd grpc requests are slow', - }, - }, - { - 
alert: 'etcdMemberCommunicationSlow', - expr: ||| - histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m])) - > 0.15 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.', - summary: 'etcd cluster member communication is slow.', - }, - }, - { - alert: 'etcdHighNumberOfFailedProposals', - expr: ||| - rate(etcd_server_proposals_failed_total{%(etcd_selector)s}[15m]) > 5 - ||| % $._config, - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.', - summary: 'etcd cluster has high number of proposal failures.', - }, - }, - { - alert: 'etcdHighFsyncDurations', - expr: ||| - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) - > 0.5 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', - summary: 'etcd cluster 99th percentile fsync durations are too high.', - }, - }, - { - alert: 'etcdHighFsyncDurations', - expr: ||| - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) - > 1 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'critical', - }, - annotations: { - message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.', - }, - }, - { - alert: 'etcdHighCommitDurations', - expr: ||| - histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m])) - > 0.25 - ||| % $._config, - 
'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.', - summary: 'etcd cluster 99th percentile commit durations are too high.', - }, - }, - { - alert: 'etcdBackendQuotaLowSpace', - expr: ||| - (etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'critical', - }, - annotations: { - message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.', - }, - }, - { - alert: 'etcdExcessiveDatabaseGrowth', - expr: ||| - increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50 - ||| % $._config, - 'for': '10m', - labels: { - severity: 'warning', - }, - annotations: { - message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes leading to 50% increase in database size over the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.', - }, - }, - ], - }, - ], - }, - - grafanaDashboards+:: { - 'etcd.json': { - uid: std.md5('etcd.json'), - title: 'etcd', - description: 'etcd sample Grafana dashboard with Prometheus', - tags: [ 'etcd-mixin' ], - style: 'dark', - timezone: 'browser', - editable: true, - hideControls: false, - sharedCrosshair: false, - rows: [ - { - collapse: false, - editable: true, - height: '250px', - panels: [ - { - cacheTimeout: null, - colorBackground: false, - colorValue: false, - colors: [ - 'rgba(245, 54, 54, 0.9)', - 'rgba(237, 129, 40, 0.89)', - 'rgba(50, 172, 45, 0.97)', - ], - datasource: '$datasource', - editable: true, - 'error': false, - format: 'none', - gauge: { - maxValue: 100, - minValue: 0, - show: false, - thresholdLabels: false, - thresholdMarkers: true, - }, - 
id: 28, - interval: null, - isNew: true, - links: [], - mappingType: 1, - mappingTypes: [ - { - name: 'value to text', - value: 1, - }, - { - name: 'range to text', - value: 2, - }, - ], - maxDataPoints: 100, - nullPointMode: 'connected', - nullText: null, - postfix: '', - postfixFontSize: '50%', - prefix: '', - prefixFontSize: '50%', - rangeMaps: [{ - from: 'null', - text: 'N/A', - to: 'null', - }], - span: 3, - sparkline: { - fillColor: 'rgba(31, 118, 189, 0.18)', - full: false, - lineColor: 'rgb(31, 120, 193)', - show: false, - }, - targets: [{ - expr: 'sum(etcd_server_has_leader{job="$cluster"})', - intervalFactor: 2, - legendFormat: '', - metric: 'etcd_server_has_leader', - refId: 'A', - step: 20, - }], - thresholds: '', - title: 'Up', - type: 'singlestat', - valueFontSize: '200%', - valueMaps: [{ - op: '=', - text: 'N/A', - value: 'null', - }], - valueName: 'avg', - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 0, - id: 23, - isNew: true, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 5, - stack: false, - steppedLine: false, - targets: [ - { - expr: 'sum(rate(grpc_server_started_total{job="$cluster",grpc_type="unary"}[5m]))', - format: 'time_series', - intervalFactor: 2, - legendFormat: 'RPC Rate', - metric: 'grpc_server_started_total', - refId: 'A', - step: 2, - }, - { - expr: 'sum(rate(grpc_server_handled_total{job="$cluster",grpc_type="unary",grpc_code!="OK"}[5m]))', - format: 'time_series', - intervalFactor: 2, - legendFormat: 'RPC Failed Rate', - metric: 'grpc_server_handled_total', - refId: 'B', - step: 2, - }, - ], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'RPC Rate', - tooltip: { - msResolution: false, - 
shared: true, - sort: 0, - value_type: 'individual', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'ops', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - ], - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 0, - id: 41, - isNew: true, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 4, - stack: true, - steppedLine: false, - targets: [ - { - expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})', - intervalFactor: 2, - legendFormat: 'Watch Streams', - metric: 'grpc_server_handled_total', - refId: 'A', - step: 4, - }, - { - expr: 'sum(grpc_server_started_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})', - intervalFactor: 2, - legendFormat: 'Lease Streams', - metric: 'grpc_server_handled_total', - refId: 'B', - step: 4, - }, - ], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Active Streams', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'individual', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'short', - label: '', - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 1, - max: 
null, - min: null, - show: true, - }, - ], - }, - ], - showTitle: false, - title: 'Row', - }, - { - collapse: false, - editable: true, - height: '250px', - panels: [ - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - decimals: null, - editable: true, - 'error': false, - fill: 0, - grid: {}, - id: 1, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 4, - stack: false, - steppedLine: false, - targets: [{ - expr: 'etcd_mvcc_db_total_size_in_bytes{job="$cluster"}', - hide: false, - interval: '', - intervalFactor: 2, - legendFormat: '{{instance}} DB Size', - metric: '', - refId: 'A', - step: 4, - }], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'DB Size', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'cumulative', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'bytes', - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - logBase: 1, - max: null, - min: null, - show: false, - }, - ], - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 0, - grid: {}, - id: 3, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 1, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 4, - stack: false, - steppedLine: true, - targets: [ - { - expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))', - hide: false, - intervalFactor: 2, - 
legendFormat: '{{instance}} WAL fsync', - metric: 'etcd_disk_wal_fsync_duration_seconds_bucket', - refId: 'A', - step: 4, - }, - { - expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{job="$cluster"}[5m])) by (instance, le))', - intervalFactor: 2, - legendFormat: '{{instance}} DB fsync', - metric: 'etcd_disk_backend_commit_duration_seconds_bucket', - refId: 'B', - step: 4, - }, - ], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Disk Sync Duration', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'cumulative', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 's', - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - logBase: 1, - max: null, - min: null, - show: false, - }, - ], - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 0, - id: 29, - isNew: true, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 4, - stack: false, - steppedLine: false, - targets: [{ - expr: 'process_resident_memory_bytes{job="$cluster"}', - intervalFactor: 2, - legendFormat: '{{instance}} Resident Memory', - metric: 'process_resident_memory_bytes', - refId: 'A', - step: 4, - }], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Memory', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'individual', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'bytes', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 
1, - max: null, - min: null, - show: true, - }, - ], - }, - ], - title: 'New row', - }, - { - collapse: false, - editable: true, - height: '250px', - panels: [ - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 5, - id: 22, - isNew: true, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 3, - stack: true, - steppedLine: false, - targets: [{ - expr: 'rate(etcd_network_client_grpc_received_bytes_total{job="$cluster"}[5m])', - intervalFactor: 2, - legendFormat: '{{instance}} Client Traffic In', - metric: 'etcd_network_client_grpc_received_bytes_total', - refId: 'A', - step: 4, - }], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Client Traffic In', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'individual', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'Bps', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - ], - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 5, - id: 21, - isNew: true, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 3, - stack: true, - steppedLine: false, - targets: [{ - expr: 'rate(etcd_network_client_grpc_sent_bytes_total{job="$cluster"}[5m])', - intervalFactor: 2, - 
legendFormat: '{{instance}} Client Traffic Out', - metric: 'etcd_network_client_grpc_sent_bytes_total', - refId: 'A', - step: 4, - }], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Client Traffic Out', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'individual', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'Bps', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - ], - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 0, - id: 20, - isNew: true, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 3, - stack: false, - steppedLine: false, - targets: [{ - expr: 'sum(rate(etcd_network_peer_received_bytes_total{job="$cluster"}[5m])) by (instance)', - intervalFactor: 2, - legendFormat: '{{instance}} Peer Traffic In', - metric: 'etcd_network_peer_received_bytes_total', - refId: 'A', - step: 4, - }], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Peer Traffic In', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'individual', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'Bps', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - ], - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - decimals: null, - editable: true, - 'error': false, - fill: 0, - grid: {}, - 
id: 16, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 3, - stack: false, - steppedLine: false, - targets: [{ - expr: 'sum(rate(etcd_network_peer_sent_bytes_total{job="$cluster"}[5m])) by (instance)', - hide: false, - interval: '', - intervalFactor: 2, - legendFormat: '{{instance}} Peer Traffic Out', - metric: 'etcd_network_peer_sent_bytes_total', - refId: 'A', - step: 4, - }], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Peer Traffic Out', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'cumulative', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'Bps', - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - logBase: 1, - max: null, - min: null, - show: true, - }, - ], - }, - ], - title: 'New row', - }, - { - collapse: false, - editable: true, - height: '250px', - panels: [ - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - editable: true, - 'error': false, - fill: 0, - id: 40, - isNew: true, - legend: { - avg: false, - current: false, - max: false, - min: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 6, - stack: false, - steppedLine: false, - targets: [ - { - expr: 'sum(rate(etcd_server_proposals_failed_total{job="$cluster"}[5m]))', - intervalFactor: 2, - legendFormat: 'Proposal Failure Rate', - metric: 'etcd_server_proposals_failed_total', - refId: 'A', - step: 2, - }, - { - expr: 'sum(etcd_server_proposals_pending{job="$cluster"})', - intervalFactor: 
2, - legendFormat: 'Proposal Pending Total', - metric: 'etcd_server_proposals_pending', - refId: 'B', - step: 2, - }, - { - expr: 'sum(rate(etcd_server_proposals_committed_total{job="$cluster"}[5m]))', - intervalFactor: 2, - legendFormat: 'Proposal Commit Rate', - metric: 'etcd_server_proposals_committed_total', - refId: 'C', - step: 2, - }, - { - expr: 'sum(rate(etcd_server_proposals_applied_total{job="$cluster"}[5m]))', - intervalFactor: 2, - legendFormat: 'Proposal Apply Rate', - refId: 'D', - step: 2, - }, - ], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Raft Proposals', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'individual', - }, - type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'short', - label: '', - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - ], - }, - { - aliasColors: {}, - bars: false, - datasource: '$datasource', - decimals: 0, - editable: true, - 'error': false, - fill: 0, - id: 19, - isNew: true, - legend: { - alignAsTable: false, - avg: false, - current: false, - max: false, - min: false, - rightSide: false, - show: false, - total: false, - values: false, - }, - lines: true, - linewidth: 2, - links: [], - nullPointMode: 'connected', - percentage: false, - pointradius: 5, - points: false, - renderer: 'flot', - seriesOverrides: [], - span: 6, - stack: false, - steppedLine: false, - targets: [{ - expr: 'changes(etcd_server_leader_changes_seen_total{job="$cluster"}[1d])', - intervalFactor: 2, - legendFormat: '{{instance}} Total Leader Elections Per Day', - metric: 'etcd_server_leader_changes_seen_total', - refId: 'A', - step: 2, - }], - thresholds: [], - timeFrom: null, - timeShift: null, - title: 'Total Leader Elections Per Day', - tooltip: { - msResolution: false, - shared: true, - sort: 0, - value_type: 'individual', - }, 
- type: 'graph', - xaxis: { - mode: 'time', - name: null, - show: true, - values: [], - }, - yaxes: [ - { - format: 'short', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - { - format: 'short', - label: null, - logBase: 1, - max: null, - min: null, - show: true, - }, - ], - }, - ], - title: 'New row', - }, - ], - time: { - from: 'now-15m', - to: 'now', - }, - timepicker: { - now: true, - refresh_intervals: [ - '5s', - '10s', - '30s', - '1m', - '5m', - '15m', - '30m', - '1h', - '2h', - '1d', - ], - time_options: [ - '5m', - '15m', - '1h', - '6h', - '12h', - '24h', - '2d', - '7d', - '30d', - ], - }, - templating: { - list: [ - { - current: { - text: 'Prometheus', - value: 'Prometheus', - }, - hide: 0, - label: null, - name: 'datasource', - options: [], - query: 'prometheus', - refresh: 1, - regex: '', - type: 'datasource', - }, - { - allValue: null, - current: { - text: 'prod', - value: 'prod', - }, - datasource: '$datasource', - hide: 0, - includeAll: false, - label: 'cluster', - multi: false, - name: 'cluster', - options: [], - query: 'label_values(etcd_server_has_leader, job)', - refresh: 1, - regex: '', - sort: 2, - tagValuesQuery: '', - tags: [], - tagsQuery: '', - type: 'query', - useTags: false, - }, - ], - }, - annotations: { - list: [], - }, - refresh: '10s', - schemaVersion: 13, - version: 215, - links: [], - gnetId: null, - }, - }, -} diff --git a/Documentation/etcd-mixin/test.yaml b/Documentation/etcd-mixin/test.yaml deleted file mode 100644 index 24162bd4d..000000000 --- a/Documentation/etcd-mixin/test.yaml +++ /dev/null @@ -1,135 +0,0 @@ -rule_files: - - mixin.yaml - -evaluation_interval: 1m - -tests: - - interval: 1m - input_series: - - series: 'up{job="etcd",instance="10.10.10.0"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' - - series: 'up{job="etcd",instance="10.10.10.1"}' - values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' - - series: 'up{job="etcd",instance="10.10.10.2"}' - values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0' - 
alert_rule_test: - - eval_time: 3m - alertname: etcdInsufficientMembers - - eval_time: 5m - alertname: etcdInsufficientMembers - - eval_time: 12m - alertname: etcdMembersDown - - eval_time: 14m - alertname: etcdMembersDown - exp_alerts: - - exp_labels: - job: etcd - severity: critical - exp_annotations: - description: 'etcd cluster "etcd": members are down (3).' - summary: 'etcd cluster members are down.' - - eval_time: 7m - alertname: etcdInsufficientMembers - - eval_time: 11m - alertname: etcdInsufficientMembers - exp_alerts: - - exp_labels: - job: etcd - severity: critical - exp_annotations: - description: 'etcd cluster "etcd": insufficient members (1).' - summary: 'etcd cluster has insufficient number of members.' - - eval_time: 15m - alertname: etcdInsufficientMembers - exp_alerts: - - exp_labels: - job: etcd - severity: critical - exp_annotations: - description: 'etcd cluster "etcd": insufficient members (0).' - summary: 'etcd cluster has insufficient number of members.' - - - interval: 1m - input_series: - - series: 'up{job="etcd",instance="10.10.10.0"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' - - series: 'up{job="etcd",instance="10.10.10.1"}' - values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0' - - series: 'up{job="etcd",instance="10.10.10.2"}' - values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' - alert_rule_test: - - eval_time: 14m - alertname: etcdMembersDown - exp_alerts: - - exp_labels: - job: etcd - severity: critical - exp_annotations: - description: 'etcd cluster "etcd": members are down (3).' - summary: 'etcd cluster members are down.' 
- - - interval: 1m - input_series: - - series: 'up{job="etcd",instance="10.10.10.0"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0' - - series: 'up{job="etcd",instance="10.10.10.1"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0' - - series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}' - values: '0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18' - alert_rule_test: - - eval_time: 13m - alertname: etcdMembersDown - exp_alerts: - - exp_labels: - job: etcd - severity: critical - exp_annotations: - description: 'etcd cluster "etcd": members are down (1).' - summary: 'etcd cluster members are down.' - - interval: 1m - input_series: - - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' - values: '0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0' - - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' - values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' - - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' - values: '0 0 0 0 0 0 0 0' - alert_rule_test: - - eval_time: 10m - alertname: etcdHighNumberOfLeaderChanges - exp_alerts: - - exp_labels: - job: etcd - severity: warning - exp_annotations: - description: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' - summary: 'etcd cluster has high number of leader changes.' 
- - interval: 1m - input_series: - - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}' - values: '0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0' - - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}' - values: '0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0' - - series: 'etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}' - values: '0 0 0 0 0 0 0 0' - alert_rule_test: - - eval_time: 10m - alertname: etcdHighNumberOfLeaderChanges - exp_alerts: - - interval: 1m - input_series: - - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.0"}' - values: '0 10 20 0 0 10 0 0 30 0 0 0 0 0 0 0' - - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.1"}' - values: '0 0 10 0 20 0 0 0 0 0 0 0 0 0 0 0' - - series: '((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100){job="etcd",instance="10.10.10.2"}' - values: '0 0 0 0 0 0 0 0' - alert_rule_test: - - eval_time: 10m - alertname: etcdExcessiveDatabaseGrowth - exp_alerts: - - exp_labels: - job: etcd - severity: warning - exp_annotations: - message: 'etcd cluster "etcd": Observed surge in etcd writes leading to 50% increase in database size over the past four hours, please check as it might be disruptive.'