From 5e810e30ccb42a36b24e933c09177b575e6494e6 Mon Sep 17 00:00:00 2001 From: Gyu-Ho Lee Date: Mon, 14 Nov 2016 13:52:25 -0800 Subject: [PATCH 1/2] v3rpc: replace grpc metrics w/ go-grpc-prometheus And disable histogram --- etcdserver/api/v3rpc/interceptor.go | 35 ++++---------------------- etcdserver/api/v3rpc/metrics.go | 38 ----------------------------- 2 files changed, 5 insertions(+), 68 deletions(-) diff --git a/etcdserver/api/v3rpc/interceptor.go b/etcdserver/api/v3rpc/interceptor.go index ae49d9f5b..7b693bf61 100644 --- a/etcdserver/api/v3rpc/interceptor.go +++ b/etcdserver/api/v3rpc/interceptor.go @@ -25,6 +25,7 @@ import ( "github.com/coreos/etcd/pkg/types" "github.com/coreos/etcd/raft" + prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" "golang.org/x/net/context" "google.golang.org/grpc" "google.golang.org/grpc/metadata" @@ -53,7 +54,8 @@ func newUnaryInterceptor(s *etcdserver.EtcdServer) grpc.UnaryServerInterceptor { } } } - return metricsUnaryInterceptor(ctx, req, info, handler) + + return prometheus.UnaryServerInterceptor(ctx, req, info, handler) } } @@ -88,38 +90,11 @@ func newStreamInterceptor(s *etcdserver.EtcdServer) grpc.StreamServerInterceptor } } - return metricsStreamInterceptor(srv, ss, info, handler) + + return prometheus.StreamServerInterceptor(srv, ss, info, handler) } } -func metricsUnaryInterceptor(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) { - service, method := splitMethodName(info.FullMethod) - receivedCounter.WithLabelValues(service, method).Inc() - - start := time.Now() - resp, err = handler(ctx, req) - if err != nil { - failedCounter.WithLabelValues(service, method, grpc.Code(err).String()).Inc() - } - handlingDuration.WithLabelValues(service, method).Observe(time.Since(start).Seconds()) - - return resp, err -} - -func metricsStreamInterceptor(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error { 
- service, method := splitMethodName(info.FullMethod) - receivedCounter.WithLabelValues(service, method).Inc() - - streamsGauage.WithLabelValues(service, method).Inc() - err := handler(srv, ss) - streamsGauage.WithLabelValues(service, method).Dec() - if err != nil { - failedCounter.WithLabelValues(service, method, grpc.Code(err).String()).Inc() - } - - return err -} - func splitMethodName(fullMethodName string) (string, string) { fullMethodName = strings.TrimPrefix(fullMethodName, "/") // remove leading slash if i := strings.Index(fullMethodName, "/"); i >= 0 { diff --git a/etcdserver/api/v3rpc/metrics.go b/etcdserver/api/v3rpc/metrics.go index c159cd83a..6cb41a61e 100644 --- a/etcdserver/api/v3rpc/metrics.go +++ b/etcdserver/api/v3rpc/metrics.go @@ -17,39 +17,6 @@ package v3rpc import "github.com/prometheus/client_golang/prometheus" var ( - receivedCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: "etcd", - Subsystem: "grpc", - Name: "requests_total", - Help: "Counter of received requests.", - }, []string{"grpc_service", "grpc_method"}) - - failedCounter = prometheus.NewCounterVec( - prometheus.CounterOpts{ - Namespace: "etcd", - Subsystem: "grpc", - Name: "requests_failed_total", - Help: "Counter of failed requests.", - }, []string{"grpc_service", "grpc_method", "grpc_code"}) - - streamsGauage = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: "etcd", - Subsystem: "grpc", - Name: "active_streams", - Help: "Number of active streams.", - }, []string{"grpc_service", "grpc_method"}) - - handlingDuration = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Namespace: "etcd", - Subsystem: "grpc", - Name: "unary_requests_duration_seconds", - Help: "Bucketed histogram of processing time (s) of handled unary (non-stream) requests.", - Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), - }, []string{"grpc_service", "grpc_method"}) - sentBytes = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: "etcd", Subsystem: 
"network", @@ -66,11 +33,6 @@ var ( ) func init() { - prometheus.MustRegister(receivedCounter) - prometheus.MustRegister(failedCounter) - prometheus.MustRegister(streamsGauage) - prometheus.MustRegister(handlingDuration) - prometheus.MustRegister(sentBytes) prometheus.MustRegister(receivedBytes) } From 7cac755df2a16bc67a08ebe426c93fbb2f0c9b7a Mon Sep 17 00:00:00 2001 From: Gyu-Ho Lee Date: Mon, 14 Nov 2016 14:33:40 -0800 Subject: [PATCH 2/2] op-guide: update gRPC requests metrics --- Documentation/metrics.md | 27 ++------------------------- Documentation/op-guide/grafana.json | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/Documentation/metrics.md b/Documentation/metrics.md index 4109dd166..58e526394 100644 --- a/Documentation/metrics.md +++ b/Documentation/metrics.md @@ -82,31 +82,7 @@ All these metrics are prefixed with `etcd_network_` ### gRPC requests -These metrics describe the requests served by a specific etcd member: total received requests, total failed requests, and processing latency. They are useful for tracking user-generated traffic hitting the etcd cluster. - -All these metrics are prefixed with `etcd_grpc_` - -| Name | Description | Type | -|--------------------------------|-------------------------------------------------------------------------------------|------------------------| -| requests_total | Total number of received requests | Counter(method) | -| requests_failed_total | Total number of failed requests.   | Counter(method,error) | -| active_streams | Total number of active streams.   | Gauge(method) | -| unary_requests_duration_seconds | Bucketed handling duration of the requests. 
| Histogram(method) | - - -Example Prometheus queries that may be useful from these metrics (across all etcd members): - - * `sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[1m]) by (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"})[1m]) by (grpc_method)` - - Shows the fraction of events that failed by gRPC method across all members, across a time window of `1m`. - - * `sum(rate(etcd_grpc_requests_total{job="etcd",grpc_method="PUT"})[1m]) by (grpc_method)` - - Shows the rate of PUT requests across all members, across a time window of `1m`. - - * `histogram_quantile(0.9, sum(rate(etcd_grpc_unary_requests_duration_seconds{job="etcd",grpc_method="PUT"}[5m]) ) by (le))` - - Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`. +These metrics are exposed via [go-grpc-prometheus][go-grpc-prometheus]. ## etcd_debugging namespace metrics @@ -137,3 +113,4 @@ Heavy file descriptor (`process_open_fds`) usage (i.e., near the process's file [prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/ [prometheus-naming]: http://prometheus.io/docs/practices/naming/ [v2-http-metrics]: v2/metrics.md#http-requests +[go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus \ No newline at end of file diff --git a/Documentation/op-guide/grafana.json b/Documentation/op-guide/grafana.json index c90a8a473..f6d6b521a 100644 --- a/Documentation/op-guide/grafana.json +++ b/Documentation/op-guide/grafana.json @@ -115,20 +115,20 @@ "stack": false, "steppedLine": false, "targets": [{ - "expr": "sum(rate(etcd_grpc_requests_total [1m]))", + "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"} [1m]))", "intervalFactor": 2, "legendFormat": "{{instance}} RPC Rate", - "metric": "etcd_grpc_requests_total", + "metric": "grpc_server_started_total", "refId": "A", - "step": 4 + "step": 2 }, { - "expr": "sum(rate(etcd_grpc_requests_failed_total [1m]))", + "expr": 
"sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"} [1m]))", "intervalFactor": 2, "legendFormat": "{{instance}} RPC Failed Rate", - "metric": "etcd_grpc_requests_failed_total", + "metric": "grpc_server_handled_total", "refId": "B", - "step": 4 + "step": 2 } ], "thresholds": [], @@ -197,18 +197,18 @@ "stack": true, "steppedLine": false, "targets": [{ - "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Watch\"})", + "expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", "intervalFactor": 2, "legendFormat": "Watch Streams", - "metric": "etcd_grpc_active_streams", + "metric": "grpc_server_handled_total", "refId": "A", "step": 4 }, { - "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Lease\"})", + "expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", "intervalFactor": 2, "legendFormat": "Lease Streams", - "metric": "etcd_grpc_active_streams", + "metric": "grpc_server_handled_total", "refId": "B", "step": 4 }