Merge pull request #6851 from gyuho/metrics

v3rpc: replace grpc metrics w/ go-grpc-prometheus
This commit is contained in:
Gyu-Ho Lee 2016-11-14 16:09:17 -08:00 committed by GitHub
commit 677606da7d
4 changed files with 17 additions and 103 deletions

View File

@ -82,31 +82,7 @@ All these metrics are prefixed with `etcd_network_`
### gRPC requests
These metrics describe the requests served by a specific etcd member: total received requests, total failed requests, and processing latency. They are useful for tracking user-generated traffic hitting the etcd cluster.
All these metrics are prefixed with `etcd_grpc_`
| Name | Description | Type |
|--------------------------------|-------------------------------------------------------------------------------------|------------------------|
| requests_total | Total number of received requests | Counter(method) |
| requests_failed_total | Total number of failed requests.   | Counter(method,error) |
| active_streams | Total number of active streams.   | Gauge(method) |
| unary_requests_duration_seconds | Bucketed handling duration of the requests. | Histogram(method) |
Example Prometheus queries that may be useful from these metrics (across all etcd members):
* `sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[1m]) by (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"})[1m]) by (grpc_method)`
Shows the fraction of events that failed by gRPC method across all members, across a time window of `1m`.
* `sum(rate(etcd_grpc_requests_total{job="etcd",grpc_method="PUT"})[1m]) by (grpc_method)`
Shows the rate of PUT requests across all members, across a time window of `1m`.
* `histogram_quantile(0.9, sum(rate(etcd_grpc_unary_requests_duration_seconds{job="etcd",grpc_method="PUT"}[5m]) ) by (le))`
Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`.
These metrics are exposed via [go-grpc-prometheus][go-grpc-prometheus].
## etcd_debugging namespace metrics
@ -137,3 +113,4 @@ Heavy file descriptor (`process_open_fds`) usage (i.e., near the process's file
[prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/
[prometheus-naming]: http://prometheus.io/docs/practices/naming/
[v2-http-metrics]: v2/metrics.md#http-requests
[go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus

View File

@ -115,20 +115,20 @@
"stack": false,
"steppedLine": false,
"targets": [{
"expr": "sum(rate(etcd_grpc_requests_total [1m]))",
"expr": "sum(rate({grpc_type=\"unary\",grpc_code!=\"OK\"} [1m]))",
"intervalFactor": 2,
"legendFormat": "{{instance}} RPC Rate",
"metric": "etcd_grpc_requests_total",
"metric": "grpc_server_started_total",
"refId": "A",
"step": 4
"step": 2
},
{
"expr": "sum(rate(etcd_grpc_requests_failed_total [1m]))",
"expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\",grpc_code!=\"OK\"} [1m])) - sum(rate(grpc_server_handled_total{grpc_type=\"unary\"} [1m]))",
"intervalFactor": 2,
"legendFormat": "{{instance}} RPC Failed Rate",
"metric": "etcd_grpc_requests_failed_total",
"metric": "grpc_server_handled_total",
"refId": "B",
"step": 4
"step": 2
}
],
"thresholds": [],
@ -197,18 +197,18 @@
"stack": true,
"steppedLine": false,
"targets": [{
"expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Watch\"})",
"expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\",grpc_code!=\"OK\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})",
"intervalFactor": 2,
"legendFormat": "Watch Streams",
"metric": "etcd_grpc_active_streams",
"metric": "grpc_server_handled_total",
"refId": "A",
"step": 4
},
{
"expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Lease\"})",
"expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})",
"intervalFactor": 2,
"legendFormat": "Lease Streams",
"metric": "etcd_grpc_active_streams",
"metric": "grpc_server_handled_total",
"refId": "B",
"step": 4
}

View File

@ -25,6 +25,7 @@ import (
"github.com/coreos/etcd/pkg/types"
"github.com/coreos/etcd/raft"
prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
"golang.org/x/net/context"
"google.golang.org/grpc"
"google.golang.org/grpc/metadata"
@ -53,7 +54,8 @@ func newUnaryInterceptor(s *etcdserver.EtcdServer) grpc.UnaryServerInterceptor {
}
}
}
return metricsUnaryInterceptor(ctx, req, info, handler)
return prometheus.UnaryServerInterceptor(ctx, req, info, handler)
}
}
@ -88,38 +90,11 @@ func newStreamInterceptor(s *etcdserver.EtcdServer) grpc.StreamServerInterceptor
}
}
return metricsStreamInterceptor(srv, ss, info, handler)
return prometheus.StreamServerInterceptor(srv, ss, info, handler)
}
}
func metricsUnaryInterceptor(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
service, method := splitMethodName(info.FullMethod)
receivedCounter.WithLabelValues(service, method).Inc()
start := time.Now()
resp, err = handler(ctx, req)
if err != nil {
failedCounter.WithLabelValues(service, method, grpc.Code(err).String()).Inc()
}
handlingDuration.WithLabelValues(service, method).Observe(time.Since(start).Seconds())
return resp, err
}
func metricsStreamInterceptor(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
service, method := splitMethodName(info.FullMethod)
receivedCounter.WithLabelValues(service, method).Inc()
streamsGauage.WithLabelValues(service, method).Inc()
err := handler(srv, ss)
streamsGauage.WithLabelValues(service, method).Dec()
if err != nil {
failedCounter.WithLabelValues(service, method, grpc.Code(err).String()).Inc()
}
return err
}
func splitMethodName(fullMethodName string) (string, string) {
fullMethodName = strings.TrimPrefix(fullMethodName, "/") // remove leading slash
if i := strings.Index(fullMethodName, "/"); i >= 0 {

View File

@ -17,39 +17,6 @@ package v3rpc
import "github.com/prometheus/client_golang/prometheus"
var (
receivedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "grpc",
Name: "requests_total",
Help: "Counter of received requests.",
}, []string{"grpc_service", "grpc_method"})
failedCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "grpc",
Name: "requests_failed_total",
Help: "Counter of failed requests.",
}, []string{"grpc_service", "grpc_method", "grpc_code"})
streamsGauage = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "grpc",
Name: "active_streams",
Help: "Number of active streams.",
}, []string{"grpc_service", "grpc_method"})
handlingDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "grpc",
Name: "unary_requests_duration_seconds",
Help: "Bucketed histogram of processing time (s) of handled unary (non-stream) requests.",
Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13),
}, []string{"grpc_service", "grpc_method"})
sentBytes = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "network",
@ -66,11 +33,6 @@ var (
)
func init() {
prometheus.MustRegister(receivedCounter)
prometheus.MustRegister(failedCounter)
prometheus.MustRegister(streamsGauage)
prometheus.MustRegister(handlingDuration)
prometheus.MustRegister(sentBytes)
prometheus.MustRegister(receivedBytes)
}