From 53fdcdc5a22544610bc1369aa28edb49e3c6e9a6 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 12 Aug 2020 10:22:56 -0700 Subject: [PATCH 1/3] pkg/runtime: optimize FDUsage by removing sort No need to sort when we just want the count. Signed-off-by: Gyuho Lee --- pkg/runtime/fds_linux.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/pkg/runtime/fds_linux.go b/pkg/runtime/fds_linux.go index 8e9359db2..4906d678f 100644 --- a/pkg/runtime/fds_linux.go +++ b/pkg/runtime/fds_linux.go @@ -16,7 +16,7 @@ package runtime import ( - "io/ioutil" + "os" "syscall" ) @@ -29,9 +29,20 @@ func FDLimit() (uint64, error) { } func FDUsage() (uint64, error) { - fds, err := ioutil.ReadDir("/proc/self/fd") + return countFiles("/proc/self/fd") +} + +// countFiles reads the directory named by dirname and returns the count. +// This is same as stdlib "io/ioutil.ReadDir" but without sorting. +func countFiles(dirname string) (uint64, error) { + f, err := os.Open(dirname) if err != nil { return 0, err } - return uint64(len(fds)), nil + list, err := f.Readdir(-1) + f.Close() + if err != nil { + return 0, err + } + return uint64(len(list)), nil } From 421df2ecbb56e94eac29c4832f9a41476fede6a5 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 12 Aug 2020 10:23:22 -0700 Subject: [PATCH 2/3] etcdserver: add OS level FD metrics Similar counts are exposed via Prometheus. This adds the ones that are perceived by the etcd server, e.g. os_fd_limit 120000 os_fd_used 14 process_cpu_seconds_total 0.31 process_max_fds 120000 process_open_fds 17 Signed-off-by: Gyuho Lee --- etcdserver/metrics.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/etcdserver/metrics.go b/etcdserver/metrics.go index 0c0ce912d..417e05f21 100644 --- a/etcdserver/metrics.go +++ b/etcdserver/metrics.go @@ -151,6 +151,19 @@ var ( Help: "Server or member ID in hexadecimal format. 
1 for 'server_id' label with current ID.", }, []string{"server_id"}) + + fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "os", + Subsystem: "fd", + Name: "used", + Help: "The number of used file descriptors.", + }) + fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "os", + Subsystem: "fd", + Name: "limit", + Help: "The file descriptor limit.", + }) ) func init() { @@ -174,6 +187,8 @@ func init() { prometheus.MustRegister(isLearner) prometheus.MustRegister(learnerPromoteSucceed) prometheus.MustRegister(learnerPromoteFailed) + prometheus.MustRegister(fdUsed) + prometheus.MustRegister(fdLimit) currentVersion.With(prometheus.Labels{ "server_version": version.Version, @@ -184,7 +199,6 @@ func init() { } func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) { - // This ticker will check File Descriptor Requirements ,and count all fds in used. // And recorded some logs when in used >= limit/5*4. Just recorded message. // If fds was more than 10K,It's low performance due to FDUsage() works. @@ -198,11 +212,13 @@ func monitorFileDescriptor(lg *zap.Logger, done <-chan struct{}) { lg.Warn("failed to get file descriptor usage", zap.Error(err)) return } + fdUsed.Set(float64(used)) limit, err := runtime.FDLimit() if err != nil { lg.Warn("failed to get file descriptor limit", zap.Error(err)) return } + fdLimit.Set(float64(limit)) if used >= limit/5*4 { lg.Warn("80% of file descriptors are used", zap.Uint64("used", used), zap.Uint64("limit", limit)) } From 5678779665751683e57fde78067ae99e7a085470 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 12 Aug 2020 10:29:41 -0700 Subject: [PATCH 3/3] CHANGELOG: update Signed-off-by: Gyuho Lee --- CHANGELOG-3.5.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG-3.5.md b/CHANGELOG-3.5.md index 0800b0827..3dea56d2e 100644 --- a/CHANGELOG-3.5.md +++ b/CHANGELOG-3.5.md @@ -91,6 +91,7 @@ Note that any `etcd_debugging_*` metrics are experimental and subject to change. 
- Add [`etcd_server_client_requests_total` with `"type"` and `"client_api_version"` labels](https://github.com/etcd-io/etcd/pull/11687). - Add [`etcd_wal_write_bytes_total`](https://github.com/etcd-io/etcd/pull/11738). - Add [`etcd_debugging_auth_revision`](https://github.com/etcd-io/etcd/commit/f14d2a087f7b0fd6f7980b95b5e0b945109c95f3). +- Add [`os_fd_used` and `os_fd_limit` to monitor current OS file descriptors](https://github.com/etcd-io/etcd/pull/12214). ### etcd server @@ -130,12 +131,16 @@ Note that any `etcd_debugging_*` metrics are experimental and subject to change. - Add [`--unsafe-no-fsync`](https://github.com/etcd-io/etcd/pull/11946) flag. - Setting the flag disables all uses of fsync, which is unsafe and will cause data loss. This flag makes it possible to run an etcd node for testing and development without placing lots of load on the file system. - Add [etcd --auth-token-ttl](https://github.com/etcd-io/etcd/pull/11980) flag to customize `simpleTokenTTL` settings. -- Improve [runtime.FDUsage objects malloc of Memory Usage and CPU Usage](https://github.com/etcd-io/etcd/pull/11986). +- Improve [`runtime.FDUsage` call pattern to reduce object allocations, memory usage, and CPU usage](https://github.com/etcd-io/etcd/pull/11986). - Improve [mvcc.watchResponse channel Memory Usage](https://github.com/etcd-io/etcd/pull/11987). - Log [expensive request info in UnaryInterceptor](https://github.com/etcd-io/etcd/pull/12086). - [Fix invalid Go type in etcdserverpb](https://github.com/etcd-io/etcd/pull/12000). - [Improve healthcheck by using v3 range request and its corresponding timeout](https://github.com/etcd-io/etcd/pull/12195). +### Package `runtime` + +- Optimize [`runtime.FDUsage` by removing unnecessary sorting](https://github.com/etcd-io/etcd/pull/12214). + ### Package `embed` - Remove [`embed.Config.Debug`](https://github.com/etcd-io/etcd/pull/10947).