Refactor monitoring mixin's dashboard

Uses the new grafonnet library to declare the dashboard. The generated dashboard keeps the same layout, but now uses timeseries panels instead of the deprecated graph panels.

Signed-off-by: Vitaly Zhuravlev <v-zhuravlev@users.noreply.github.com>
Vitaly Zhuravlev 2023-07-15 00:03:48 +00:00
parent a3bd22beef
commit f5644361d0
10 changed files with 329 additions and 1184 deletions
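
For orientation, a minimal sketch (not part of this diff) of the grafonnet pattern the new files below follow; the `$datasource` variable name and the example query are illustrative only:

local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';

// A grafonnet timeseries panel replaces the deprecated graph panel.
g.panel.timeSeries.new('RPC rate')
+ g.panel.timeSeries.queryOptions.withTargets([
    g.query.prometheus.new(
      '$datasource',
      'sum(rate(grpc_server_started_total{grpc_type="unary"}[$__rate_interval]))'
    ),
  ])
+ g.panel.timeSeries.standardOptions.withUnit('ops')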

contrib/mixin/.gitignore

@@ -0,0 +1 @@
vendor


@@ -1,4 +1,4 @@
.PHONY: tools manifests test clean
.PHONY: tools manifests test clean jb_install
OS := linux
ARCH ?= amd64
@@ -7,6 +7,7 @@ PROMETHEUS_VERSION := 2.33.1
tools:
go install github.com/google/go-jsonnet/cmd/jsonnet@latest
go install github.com/brancz/gojsontoyaml@latest
go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
wget -qO- "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.${OS}-${ARCH}.tar.gz" |\
tar xvz --strip-components=1 -C "$$(go env GOPATH)/bin" prometheus-${PROMETHEUS_VERSION}.${OS}-${ARCH}/promtool
@@ -19,5 +20,8 @@ manifests/etcd-prometheusRules.yaml:
test: manifests/etcd-prometheusRules.yaml
promtool test rules test.yaml
jb_install:
jb install
clean:
rm -rf manifests/*.yaml


@@ -2,7 +2,7 @@
> NOTE: This project is *alpha* stage. Flags, configuration, behaviour and design may change significantly in following releases.
A set of customisable Prometheus alerts for etcd.
A customisable set of Grafana dashboard and Prometheus alerts for etcd.
Instructions for use are the same as the [kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin).


@@ -0,0 +1 @@
import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'


@@ -0,0 +1,15 @@
{
"version": 1,
"dependencies": [
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet.git",
"subdir": "gen/grafonnet-latest"
}
},
"version": "main"
}
],
"legacyImports": true
}


@@ -0,0 +1,86 @@
{
"version": 1,
"dependencies": [
{
"source": {
"git": {
"remote": "https://github.com/crdsonnet/crdsonnet.git",
"subdir": "crdsonnet"
}
},
"version": "3dab27c8c9119eb14ecc19592240caf5f7330d23",
"sum": "Vh6Wo/7IIyIkmDE2XAJfWZDlTHClRrdMa9CCBPrcF1U="
},
{
"source": {
"git": {
"remote": "https://github.com/crdsonnet/validate-libsonnet.git",
"subdir": ""
}
},
"version": "a78ca15fbfece3110c4807d1f059132ece01d97b",
"sum": "qYLH56MqvmgxE4YMNeTbuJ1XSsCpl1sumHN5x8IQv2I="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet.git",
"subdir": "gen/grafonnet-latest"
}
},
"version": "441b95e5a16ee5e511d54beb5f07e9b05f2bb5a2",
"sum": "sVzVlSLbxPkAurwO19YERigLMmRfVsViMcWC0gkTTqU="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet.git",
"subdir": "gen/grafonnet-v10.0.0"
}
},
"version": "441b95e5a16ee5e511d54beb5f07e9b05f2bb5a2",
"sum": "Lv5DM6d93sW3q2v9l5A6V0Etnnrzy5t19uYTEjO77Ag="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet.git",
"subdir": "grafonnet-base"
}
},
"version": "441b95e5a16ee5e511d54beb5f07e9b05f2bb5a2",
"sum": "5zmj5Vn/cKhKrD+jA5BfLliR4t74upkM4PfnuJC5t4M="
},
{
"source": {
"git": {
"remote": "https://github.com/grafana/grok.git",
"subdir": "jsonnet/v10.0.0"
}
},
"version": "8a4d53df8dc53b005a22125bc199366787dd271a",
"sum": "cpxEEjeaQMUYA938vuVie3s5SSRv+O9IWwL9F1m44Qk="
},
{
"source": {
"git": {
"remote": "https://github.com/jsonnet-libs/docsonnet.git",
"subdir": "doc-util"
}
},
"version": "7c865ec0606f2b68c0f6b2721f101e6a99cd2593",
"sum": "zjjufxN4yAIevldYEERiZEp27vK0BJKj1VvZcVtWiOo="
},
{
"source": {
"git": {
"remote": "https://github.com/jsonnet-libs/xtd.git",
"subdir": ""
}
},
"version": "d1929c3c1728274424a21a617f1b82095685d640",
"sum": "DHqlMNfQH/eCWHbHD2Ouc/bH00AwTkCS86z6cgzQ7UI="
}
],
"legacyImports": false
}

File diff suppressed because it is too large.


@@ -0,0 +1,59 @@
local g = import 'g.libsonnet';
{
stat: {
local stat = g.panel.stat,
base(title, targets):
stat.new(title)
+ stat.queryOptions.withTargets(targets)
+ stat.queryOptions.withInterval('1m'),
up(title, targets):
self.base(title, targets)
+ stat.options.withColorMode('none')
+ stat.options.withGraphMode('none')
+ stat.options.reduceOptions.withCalcs([
'lastNotNull',
]),
},
timeSeries: {
local timeSeries = g.panel.timeSeries,
local fieldOverride = g.panel.timeSeries.fieldOverride,
local custom = timeSeries.fieldConfig.defaults.custom,
local defaults = timeSeries.fieldConfig.defaults,
local options = timeSeries.options,
base(title, targets):
timeSeries.new(title)
+ timeSeries.queryOptions.withTargets(targets)
+ timeSeries.queryOptions.withInterval('1m')
+ custom.withLineWidth(2)
+ custom.withFillOpacity(0)
+ custom.withShowPoints('never'),
rpcRate(title, targets):
self.base(title, targets)
+ timeSeries.standardOptions.withUnit('ops'),
activeStreams(title, targets):
self.base(title, targets),
dbSize(title, targets):
self.base(title, targets)
+ timeSeries.standardOptions.withUnit('bytes'),
diskSync(title, targets):
self.base(title, targets)
+ timeSeries.standardOptions.withUnit('s'),
memory(title, targets):
self.base(title, targets)
+ timeSeries.standardOptions.withUnit('bytes'),
traffic(title, targets):
self.base(title, targets)
+ timeSeries.standardOptions.withUnit('Bps'),
raftProposals(title, targets):
self.base(title, targets),
leaderElections(title, targets):
self.base(title, targets),
peerRtt(title, targets):
self.base(title, targets)
+ timeSeries.standardOptions.withUnit('s'),
},
}
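
A hypothetical usage sketch of the panel builders above; the import paths and the `$datasource` variable are assumptions, and the etcd/cluster label selectors are omitted for brevity:

local g = import 'g.libsonnet';
local panels = import 'panels.libsonnet';  // filename assumed for illustration

// Build one timeseries panel from two Prometheus query targets.
panels.timeSeries.rpcRate('RPC Rate', [
  g.query.prometheus.new('$datasource', 'sum(rate(grpc_server_started_total{grpc_type="unary"}[$__rate_interval]))')
  + g.query.prometheus.withLegendFormat('RPC rate'),
  g.query.prometheus.new('$datasource', 'sum(rate(grpc_server_handled_total{grpc_type="unary",grpc_code!="OK"}[$__rate_interval]))')
  + g.query.prometheus.withLegendFormat('RPC failed rate'),
])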


@@ -0,0 +1,104 @@
local g = import './g.libsonnet';
local prometheusQuery = g.query.prometheus;
function(variables, config) {
up:
prometheusQuery.new(
'$' + variables.datasource.name,
'sum(etcd_server_has_leader{%s, %s="$cluster"})' % [config.etcd_selector, config.clusterLabel]
)
+ prometheusQuery.withLegendFormat(|||
{{cluster}} - {{namespace}}
|||),
rpcRate:
prometheusQuery.new(
'$' + variables.datasource.name,
'sum(rate(grpc_server_started_total{%s, %s="$cluster",grpc_type="unary"}[$__rate_interval]))' % [config.etcd_selector, config.clusterLabel]
)
+ prometheusQuery.withLegendFormat('RPC rate'),
rpcFailedRate:
prometheusQuery.new(
'$' + variables.datasource.name,
'sum(rate(grpc_server_handled_total{%s, %s="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[$__rate_interval]))' % [config.etcd_selector, config.clusterLabel]
)
+ prometheusQuery.withLegendFormat('RPC failed rate'),
watchStreams:
prometheusQuery.new(
'$' + variables.datasource.name,
'sum(grpc_server_started_total{%(etcd_selector)s,%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})' % config
)
+ prometheusQuery.withLegendFormat('Watch streams'),
leaseStreams:
prometheusQuery.new(
'$' + variables.datasource.name,
'sum(grpc_server_started_total{%(etcd_selector)s,%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})' % config
)
+ prometheusQuery.withLegendFormat('Lease streams'),
dbSize:
prometheusQuery.new(
'$' + variables.datasource.name,
'etcd_mvcc_db_total_size_in_bytes{%s, %s="$cluster"}' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} DB size'),
walFsync:
prometheusQuery.new(
'$' + variables.datasource.name,
'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{%s, %s="$cluster"}[$__rate_interval])) by (instance, le))' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} WAL fsync'),
dbFsync:
prometheusQuery.new(
'$' + variables.datasource.name,
'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{%s, %s="$cluster"}[$__rate_interval])) by (instance, le))' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} DB fsync'),
memory:
prometheusQuery.new(
'$' + variables.datasource.name,
'process_resident_memory_bytes{%s, %s="$cluster"}' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} resident memory'),
clientTrafficIn:
prometheusQuery.new(
'$' + variables.datasource.name,
'rate(etcd_network_client_grpc_received_bytes_total{%s, %s="$cluster"}[$__rate_interval])' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} client traffic in'),
clientTrafficOut:
prometheusQuery.new(
'$' + variables.datasource.name,
'rate(etcd_network_client_grpc_sent_bytes_total{%s, %s="$cluster"}[$__rate_interval])' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} client traffic out'),
peerTrafficIn:
prometheusQuery.new(
'$' + variables.datasource.name,
'sum(rate(etcd_network_peer_received_bytes_total{%s, %s="$cluster"}[$__rate_interval])) by (instance)' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} peer traffic in'),
peerTrafficOut:
prometheusQuery.new(
'$' + variables.datasource.name,
'sum(rate(etcd_network_peer_sent_bytes_total{%s, %s="$cluster"}[$__rate_interval])) by (instance)' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} peer traffic out'),
raftProposals:
prometheusQuery.new(
'$' + variables.datasource.name,
'changes(etcd_server_leader_changes_seen_total{%s, %s="$cluster"}[1d])' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} total leader elections per day'),
leaderElections:
prometheusQuery.new(
'$' + variables.datasource.name,
'changes(etcd_server_leader_changes_seen_total{%s, %s="$cluster"}[1d])' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} total leader elections per day'),
peerRtt:
prometheusQuery.new(
'$' + variables.datasource.name,
'histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{%s, %s="$cluster"}[$__rate_interval])))' % [config.etcd_selector, config.clusterLabel],
)
+ prometheusQuery.withLegendFormat('{{instance}} peer round trip time'),
}
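
For reference, a small worked example of the `%` formatting used above; the config values are assumptions, not part of this diff:

local config = {
  etcd_selector: 'job=~".*etcd.*"',  // assumed example value
  clusterLabel: 'job',               // assumed example value
};

// The dbSize expression template
//   'etcd_mvcc_db_total_size_in_bytes{%s, %s="$cluster"}' % [config.etcd_selector, config.clusterLabel]
// evaluates to the PromQL selector:
//   etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*", job="$cluster"}
'etcd_mvcc_db_total_size_in_bytes{%s, %s="$cluster"}' % [config.etcd_selector, config.clusterLabel]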


@@ -0,0 +1,21 @@
// variables.libsonnet
local g = import './g.libsonnet';
local var = g.dashboard.variable;
function(config) {
datasource:
var.datasource.new('datasource', 'prometheus')
+ var.datasource.generalOptions.withLabel('Data Source'),
cluster:
var.query.new(config.clusterLabel)
+ var.query.generalOptions.withLabel('cluster')
+ var.query.withDatasourceFromVariable(self.datasource)
+ { refresh: config.dashboard_var_refresh }
+ var.query.queryTypes.withLabelValues(
config.clusterLabel,
'etcd_server_has_leader{%s}' % [config.etcd_selector]
),
}
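
Finally, a minimal sketch, under assumed file names and example config values, of how the variable, query, and panel libraries in this commit could be wired into a dashboard with grafonnet's grid layout helper. The suppressed large diff above presumably contains the actual dashboard definition, which may differ:

local g = import 'g.libsonnet';
local panelsLib = import 'panels.libsonnet';        // filename assumed for illustration
local targetsFn = import 'targets.libsonnet';       // filename assumed for illustration
local variablesFn = import 'variables.libsonnet';   // filename assumed for illustration

// Example config: field names come from this commit, values are assumptions.
local config = {
  etcd_selector: 'job=~".*etcd.*"',
  clusterLabel: 'job',
  dashboard_var_refresh: 2,
};

local variables = variablesFn(config);
local targets = targetsFn(variables, config);

g.dashboard.new('etcd')
+ g.dashboard.withVariables([variables.datasource, variables.cluster])
+ g.dashboard.withPanels(
  g.util.grid.makeGrid([
    panelsLib.stat.up('Up', [targets.up]),
    panelsLib.timeSeries.rpcRate('RPC Rate', [targets.rpcRate, targets.rpcFailedRate]),
    panelsLib.timeSeries.dbSize('DB Size', [targets.dbSize]),
  ], panelWidth=8)
)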