diff --git a/Documentation/metrics.md b/Documentation/metrics.md index 4109dd166..58e526394 100644 --- a/Documentation/metrics.md +++ b/Documentation/metrics.md @@ -82,31 +82,7 @@ All these metrics are prefixed with `etcd_network_` ### gRPC requests -These metrics describe the requests served by a specific etcd member: total received requests, total failed requests, and processing latency. They are useful for tracking user-generated traffic hitting the etcd cluster. - -All these metrics are prefixed with `etcd_grpc_` - -| Name | Description | Type | -|--------------------------------|-------------------------------------------------------------------------------------|------------------------| -| requests_total | Total number of received requests | Counter(method) | -| requests_failed_total | Total number of failed requests.   | Counter(method,error) | -| active_streams | Total number of active streams.   | Gauge(method) | -| unary_requests_duration_seconds | Bucketed handling duration of the requests. | Histogram(method) | - - -Example Prometheus queries that may be useful from these metrics (across all etcd members): - - * `sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[1m]) by (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"})[1m]) by (grpc_method)` - - Shows the fraction of events that failed by gRPC method across all members, across a time window of `1m`. - - * `sum(rate(etcd_grpc_requests_total{job="etcd",grpc_method="PUT"})[1m]) by (grpc_method)` - - Shows the rate of PUT requests across all members, across a time window of `1m`. - - * `histogram_quantile(0.9, sum(rate(etcd_grpc_unary_requests_duration_seconds{job="etcd",grpc_method="PUT"}[5m]) ) by (le))` - - Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`. +These metrics are exposed via [go-grpc-prometheus][go-grpc-prometheus]. 
## etcd_debugging namespace metrics @@ -137,3 +113,4 @@ Heavy file descriptor (`process_open_fds`) usage (i.e., near the process's file [prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/ [prometheus-naming]: http://prometheus.io/docs/practices/naming/ [v2-http-metrics]: v2/metrics.md#http-requests +[go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus \ No newline at end of file diff --git a/Documentation/op-guide/grafana.json b/Documentation/op-guide/grafana.json index c90a8a473..f6d6b521a 100644 --- a/Documentation/op-guide/grafana.json +++ b/Documentation/op-guide/grafana.json @@ -115,20 +115,20 @@ "stack": false, "steppedLine": false, "targets": [{ - "expr": "sum(rate(etcd_grpc_requests_total [1m]))", + "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"} [1m]))", "intervalFactor": 2, "legendFormat": "{{instance}} RPC Rate", - "metric": "etcd_grpc_requests_total", + "metric": "grpc_server_started_total", "refId": "A", - "step": 4 + "step": 2 }, { - "expr": "sum(rate(etcd_grpc_requests_failed_total [1m]))", + "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"} [1m]))", "intervalFactor": 2, "legendFormat": "{{instance}} RPC Failed Rate", - "metric": "etcd_grpc_requests_failed_total", + "metric": "grpc_server_handled_total", "refId": "B", - "step": 4 + "step": 2 } ], "thresholds": [], @@ -197,18 +197,18 @@ "stack": true, "steppedLine": false, "targets": [{ - "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Watch\"})", + "expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", "intervalFactor": 2, "legendFormat": "Watch Streams", - "metric": "etcd_grpc_active_streams", + "metric": "grpc_server_handled_total", "refId": "A", "step": 4 
}, { - "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Lease\"})", + "expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", "intervalFactor": 2, "legendFormat": "Lease Streams", - "metric": "etcd_grpc_active_streams", + "metric": "grpc_server_handled_total", "refId": "B", "step": 4 }