mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
Refactor monitoring mixin's dashboard
Uses the new grafonnet lib to declare the dashboard. The generated dashboard has the same layout, but now uses timeseries panels instead of the deprecated graph panels. Signed-off-by: Vitaly Zhuravlev <v-zhuravlev@users.noreply.github.com>
This commit is contained in:
parent
a3bd22beef
commit
f5644361d0
1
contrib/mixin/.gitignore
vendored
Normal file
1
contrib/mixin/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
vendor
|
@ -1,4 +1,4 @@
|
||||
.PHONY: tools manifests test clean
|
||||
.PHONY: tools manifests test clean jb_install
|
||||
|
||||
OS := linux
|
||||
ARCH ?= amd64
|
||||
@ -7,6 +7,7 @@ PROMETHEUS_VERSION := 2.33.1
|
||||
tools:
|
||||
go install github.com/google/go-jsonnet/cmd/jsonnet@latest
|
||||
go install github.com/brancz/gojsontoyaml@latest
|
||||
# jb (jsonnet-bundler) is required by the jb_install target, which runs `jb install`.
go install -a github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
|
||||
wget -qO- "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.${OS}-${ARCH}.tar.gz" |\
|
||||
tar xvz --strip-components=1 -C "$$(go env GOPATH)/bin" prometheus-${PROMETHEUS_VERSION}.${OS}-${ARCH}/promtool
|
||||
|
||||
@ -19,5 +20,8 @@ manifests/etcd-prometheusRules.yaml:
|
||||
test: manifests/etcd-prometheusRules.yaml
|
||||
promtool test rules test.yaml
|
||||
|
||||
jb_install:
|
||||
jb install
|
||||
|
||||
clean:
|
||||
rm -rf manifests/*.yaml
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
> NOTE: This project is in the *alpha* stage. Flags, configuration, behaviour and design may change significantly in following releases.
|
||||
|
||||
A set of customisable Prometheus alerts for etcd.
|
||||
A customisable set of Grafana dashboards and Prometheus alerts for etcd.
|
||||
|
||||
Instructions for use are the same as the [kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin).
|
||||
|
||||
|
1
contrib/mixin/g.libsonnet
Normal file
1
contrib/mixin/g.libsonnet
Normal file
@ -0,0 +1 @@
|
||||
import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'
|
15
contrib/mixin/jsonnetfile.json
Normal file
15
contrib/mixin/jsonnetfile.json
Normal file
@ -0,0 +1,15 @@
|
||||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet.git",
|
||||
"subdir": "gen/grafonnet-latest"
|
||||
}
|
||||
},
|
||||
"version": "main"
|
||||
}
|
||||
],
|
||||
"legacyImports": true
|
||||
}
|
86
contrib/mixin/jsonnetfile.lock.json
Normal file
86
contrib/mixin/jsonnetfile.lock.json
Normal file
@ -0,0 +1,86 @@
|
||||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/crdsonnet/crdsonnet.git",
|
||||
"subdir": "crdsonnet"
|
||||
}
|
||||
},
|
||||
"version": "3dab27c8c9119eb14ecc19592240caf5f7330d23",
|
||||
"sum": "Vh6Wo/7IIyIkmDE2XAJfWZDlTHClRrdMa9CCBPrcF1U="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/crdsonnet/validate-libsonnet.git",
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "a78ca15fbfece3110c4807d1f059132ece01d97b",
|
||||
"sum": "qYLH56MqvmgxE4YMNeTbuJ1XSsCpl1sumHN5x8IQv2I="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet.git",
|
||||
"subdir": "gen/grafonnet-latest"
|
||||
}
|
||||
},
|
||||
"version": "441b95e5a16ee5e511d54beb5f07e9b05f2bb5a2",
|
||||
"sum": "sVzVlSLbxPkAurwO19YERigLMmRfVsViMcWC0gkTTqU="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet.git",
|
||||
"subdir": "gen/grafonnet-v10.0.0"
|
||||
}
|
||||
},
|
||||
"version": "441b95e5a16ee5e511d54beb5f07e9b05f2bb5a2",
|
||||
"sum": "Lv5DM6d93sW3q2v9l5A6V0Etnnrzy5t19uYTEjO77Ag="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet.git",
|
||||
"subdir": "grafonnet-base"
|
||||
}
|
||||
},
|
||||
"version": "441b95e5a16ee5e511d54beb5f07e9b05f2bb5a2",
|
||||
"sum": "5zmj5Vn/cKhKrD+jA5BfLliR4t74upkM4PfnuJC5t4M="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grok.git",
|
||||
"subdir": "jsonnet/v10.0.0"
|
||||
}
|
||||
},
|
||||
"version": "8a4d53df8dc53b005a22125bc199366787dd271a",
|
||||
"sum": "cpxEEjeaQMUYA938vuVie3s5SSRv+O9IWwL9F1m44Qk="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/jsonnet-libs/docsonnet.git",
|
||||
"subdir": "doc-util"
|
||||
}
|
||||
},
|
||||
"version": "7c865ec0606f2b68c0f6b2721f101e6a99cd2593",
|
||||
"sum": "zjjufxN4yAIevldYEERiZEp27vK0BJKj1VvZcVtWiOo="
|
||||
},
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/jsonnet-libs/xtd.git",
|
||||
"subdir": ""
|
||||
}
|
||||
},
|
||||
"version": "d1929c3c1728274424a21a617f1b82095685d640",
|
||||
"sum": "DHqlMNfQH/eCWHbHD2Ouc/bH00AwTkCS86z6cgzQ7UI="
|
||||
}
|
||||
],
|
||||
"legacyImports": false
|
||||
}
|
File diff suppressed because it is too large
Load Diff
59
contrib/mixin/panels.libsonnet
Normal file
59
contrib/mixin/panels.libsonnet
Normal file
@ -0,0 +1,59 @@
|
||||
local g = import 'g.libsonnet';
|
||||
|
||||
{
  // Panel constructors built on grafonnet (`g`). Every constructor takes a
  // panel title and the query targets to attach, and returns a grafonnet
  // panel object.

  // Stat panels.
  stat: {
    local stat = g.panel.stat,

    // Common stat panel: attaches `targets` and sets the query interval to '1m'.
    base(title, targets):
      stat.new(title)
      + stat.queryOptions.withTargets(targets)
      + stat.queryOptions.withInterval('1m'),

    // `base` with color mode and graph mode both set to 'none', reduced with
    // the 'lastNotNull' calculation.
    up(title, targets):
      self.base(title, targets)
      + stat.options.withColorMode('none')
      + stat.options.withGraphMode('none')
      + stat.options.reduceOptions.withCalcs([
        'lastNotNull',
      ]),
  },

  // Time series panels. The variants differ from `base` only in the unit
  // they set (if any).
  timeSeries: {
    local timeSeries = g.panel.timeSeries,
    local custom = timeSeries.fieldConfig.defaults.custom,

    // Common time series panel: attaches `targets`, sets the query interval
    // to '1m', line width 2, fill opacity 0 and never shows points.
    base(title, targets):
      timeSeries.new(title)
      + timeSeries.queryOptions.withTargets(targets)
      + timeSeries.queryOptions.withInterval('1m')
      + custom.withLineWidth(2)
      + custom.withFillOpacity(0)
      + custom.withShowPoints('never'),

    // Unit: 'ops'.
    rpcRate(title, targets):
      self.base(title, targets)
      + timeSeries.standardOptions.withUnit('ops'),

    // No unit override.
    activeStreams(title, targets):
      self.base(title, targets),

    // Unit: 'bytes'.
    dbSize(title, targets):
      self.base(title, targets)
      + timeSeries.standardOptions.withUnit('bytes'),

    // Unit: 's'.
    diskSync(title, targets):
      self.base(title, targets)
      + timeSeries.standardOptions.withUnit('s'),

    // Unit: 'bytes'.
    memory(title, targets):
      self.base(title, targets)
      + timeSeries.standardOptions.withUnit('bytes'),

    // Unit: 'Bps'.
    traffic(title, targets):
      self.base(title, targets)
      + timeSeries.standardOptions.withUnit('Bps'),

    // No unit override.
    raftProposals(title, targets):
      self.base(title, targets),

    // No unit override.
    leaderElections(title, targets):
      self.base(title, targets),

    // Unit: 's'.
    peerRtt(title, targets):
      self.base(title, targets)
      + timeSeries.standardOptions.withUnit('s'),
  },
}
|
104
contrib/mixin/targets.libsonnet
Normal file
104
contrib/mixin/targets.libsonnet
Normal file
@ -0,0 +1,104 @@
|
||||
local g = import './g.libsonnet';
|
||||
local prometheusQuery = g.query.prometheus;
|
||||
|
||||
function(variables, config) {
  // Prometheus query targets for the etcd dashboard.
  //
  // variables: object whose `datasource.name` is the name of the dashboard's
  //            datasource template variable.
  // config:    mixin config; this file reads `etcd_selector` and `clusterLabel`.
  //            NOTE(review): etcd_selector is presumably a PromQL label matcher
  //            list (it is spliced inside `{...}`) - confirm against the config.
  //
  // Every query filters on `<clusterLabel>="$cluster"`, i.e. it relies on a
  // dashboard template variable literally named `cluster`.

  // Reference to the datasource template variable, e.g. "$datasource".
  local datasource = '$' + variables.datasource.name,
  // Positional arguments for the '{%s, %s="$cluster"}' selector template.
  local selectorArgs = [config.etcd_selector, config.clusterLabel],

  up:
    prometheusQuery.new(
      datasource,
      'sum(etcd_server_has_leader{%s, %s="$cluster"})' % selectorArgs
    )
    + prometheusQuery.withLegendFormat(|||
      {{cluster}} - {{namespace}}
    |||),

  rpcRate:
    prometheusQuery.new(
      datasource,
      'sum(rate(grpc_server_started_total{%s, %s="$cluster",grpc_type="unary"}[$__rate_interval]))' % selectorArgs
    )
    + prometheusQuery.withLegendFormat('RPC rate'),

  rpcFailedRate:
    prometheusQuery.new(
      datasource,
      'sum(rate(grpc_server_handled_total{%s, %s="$cluster",grpc_type="unary",grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[$__rate_interval]))' % selectorArgs
    )
    + prometheusQuery.withLegendFormat('RPC failed rate'),

  // Started minus handled bidi streams. Both sides must carry the same
  // selector, otherwise the subtraction mixes series from different jobs.
  watchStreams:
    prometheusQuery.new(
      datasource,
      'sum(grpc_server_started_total{%(etcd_selector)s,%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(etcd_selector)s,%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})' % config
    )
    + prometheusQuery.withLegendFormat('Watch streams'),

  // Same started-minus-handled computation for the Lease service.
  leaseStreams:
    prometheusQuery.new(
      datasource,
      'sum(grpc_server_started_total{%(etcd_selector)s,%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{%(etcd_selector)s,%(clusterLabel)s="$cluster",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})' % config
    )
    + prometheusQuery.withLegendFormat('Lease streams'),

  dbSize:
    prometheusQuery.new(
      datasource,
      'etcd_mvcc_db_total_size_in_bytes{%s, %s="$cluster"}' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} DB size'),

  // 0.99 quantile per instance.
  walFsync:
    prometheusQuery.new(
      datasource,
      'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket{%s, %s="$cluster"}[$__rate_interval])) by (instance, le))' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} WAL fsync'),

  // 0.99 quantile per instance.
  dbFsync:
    prometheusQuery.new(
      datasource,
      'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket{%s, %s="$cluster"}[$__rate_interval])) by (instance, le))' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} DB fsync'),

  memory:
    prometheusQuery.new(
      datasource,
      'process_resident_memory_bytes{%s, %s="$cluster"}' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} resident memory'),

  clientTrafficIn:
    prometheusQuery.new(
      datasource,
      'rate(etcd_network_client_grpc_received_bytes_total{%s, %s="$cluster"}[$__rate_interval])' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} client traffic in'),

  clientTrafficOut:
    prometheusQuery.new(
      datasource,
      'rate(etcd_network_client_grpc_sent_bytes_total{%s, %s="$cluster"}[$__rate_interval])' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} client traffic out'),

  peerTrafficIn:
    prometheusQuery.new(
      datasource,
      'sum(rate(etcd_network_peer_received_bytes_total{%s, %s="$cluster"}[$__rate_interval])) by (instance)' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} peer traffic in'),

  peerTrafficOut:
    prometheusQuery.new(
      datasource,
      'sum(rate(etcd_network_peer_sent_bytes_total{%s, %s="$cluster"}[$__rate_interval])) by (instance)' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} peer traffic out'),

  // Previously this was an exact copy of `leaderElections`, so the target
  // reported leader changes instead of proposals.
  // NOTE(review): proposal commit rate was chosen as the single series for
  // this target; confirm whether failed/pending/applied series are wanted too.
  raftProposals:
    prometheusQuery.new(
      datasource,
      'sum(rate(etcd_server_proposals_committed_total{%s, %s="$cluster"}[$__rate_interval]))' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('Proposal commit rate'),

  leaderElections:
    prometheusQuery.new(
      datasource,
      'changes(etcd_server_leader_changes_seen_total{%s, %s="$cluster"}[1d])' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} total leader elections per day'),

  // 0.99 quantile per instance.
  peerRtt:
    prometheusQuery.new(
      datasource,
      'histogram_quantile(0.99, sum by (instance, le) (rate(etcd_network_peer_round_trip_time_seconds_bucket{%s, %s="$cluster"}[$__rate_interval])))' % selectorArgs,
    )
    + prometheusQuery.withLegendFormat('{{instance}} peer round trip time'),
}
|
21
contrib/mixin/variables.libsonnet
Normal file
21
contrib/mixin/variables.libsonnet
Normal file
@ -0,0 +1,21 @@
|
||||
// variables.libsonnet
|
||||
local g = import './g.libsonnet';
|
||||
local var = g.dashboard.variable;
|
||||
|
||||
|
||||
function(config) {
  // Dashboard template variables.
  //
  // config: mixin config; this file reads `clusterLabel`, `etcd_selector`
  //         and `dashboard_var_refresh`.

  // Datasource picker named `datasource`, of type 'prometheus'.
  datasource:
    var.datasource.new('datasource', 'prometheus')
    + var.datasource.generalOptions.withLabel('Data Source'),

  // Cluster picker. The variable must be named exactly 'cluster' because the
  // dashboard queries reference it as "$cluster"; `config.clusterLabel` only
  // selects which label's values are listed.
  cluster:
    var.query.new('cluster')
    + var.query.generalOptions.withLabel('cluster')
    + var.query.withDatasourceFromVariable(self.datasource)
    + { refresh: config.dashboard_var_refresh }
    + var.query.queryTypes.withLabelValues(
      config.clusterLabel,
      'etcd_server_has_leader{%s}' % [config.etcd_selector]
    ),
}
|
Loading…
x
Reference in New Issue
Block a user