etcdmain: close client conns when it exceeds limit

This solves the problem that etcd may fatal because its critical path
cannot get file descriptor resource when the number of clients is too
big. The PR lets the client listener close client connections
immediately after they are accepted when
the file descriptor usage in the process reaches some pre-set limit, so
it ensures that the internal critical path could always get file
descriptor when it needs.

When there are tons to clients connecting to the server, the original
behavior is like this:

```
2015/08/4 16:42:08 etcdserver: cannot monitor file descriptor usage
(open /proc/self/fd: too many open files)
2015/08/4 16:42:33 etcdserver: failed to purge snap file open
default2.etcd/member/snap: too many open files
[halted]
```

Current behavior is like this:

```
2015/08/6 19:05:25 transport: accept error: closing connection,
exceed file descriptor usage limitation (fd limit=874)
2015/08/6 19:05:25 transport: accept error: closing connection,
exceed file descriptor usage limitation (fd limit=874)
2015/08/6 19:05:26 transport: accept error: closing connection,
exceed file descriptor usage limitation (fd limit=874)
2015/08/6 19:05:27 transport: accept error: closing connection,
exceed file descriptor usage limitation (fd limit=874)
2015/08/6 19:05:28 transport: accept error: closing connection,
exceed file descriptor usage limitation (fd limit=874)
2015/08/6 19:05:28 etcdserver: 80% of the file descriptor limit is
used [used = 873, limit = 1024]
```

It is available at linux system today because pkg/runtime only has linux
support.
This commit is contained in:
Yicheng Qin 2015-08-04 11:36:24 -07:00
parent 219ed1695b
commit 97923ca3fc
3 changed files with 153 additions and 0 deletions

View File

@ -33,6 +33,7 @@ import (
"github.com/coreos/etcd/pkg/cors"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/coreos/etcd/pkg/osutil"
runtimeutil "github.com/coreos/etcd/pkg/runtime"
"github.com/coreos/etcd/pkg/transport"
"github.com/coreos/etcd/pkg/types"
"github.com/coreos/etcd/proxy"
@ -49,6 +50,18 @@ var plog = capnslog.NewPackageLogger("github.com/coreos/etcd", "etcdmain")
const (
// the owner can make/remove files inside the directory
privateDirMode = 0700
// internal fd usage includes disk usage and transport usage.
// To read/write snapshot, snap pkg needs 1. In normal case, wal pkg needs
// at most 2 to read/lock/write WALs. One case that it needs to 2 is to
// read all logs after some snapshot index, which locates at the end of
// the second last and the head of the last. For purging, it needs to read
// directory, so it needs 1. For fd monitor, it needs 1.
// For transport, rafthttp builds two long-polling connections and at most
// four temporary connections with each member. There are at most 9 members
// in a cluster, so it should reserve 96.
// For the safety, we set the total reserved number to 150.
reservedInternalFDNum = 150
)
var (
@ -188,6 +201,12 @@ func startEtcd(cfg *config) (<-chan struct{}, error) {
if err != nil {
return nil, err
}
if fdLimit, err := runtimeutil.FDLimit(); err == nil {
if fdLimit <= reservedInternalFDNum {
plog.Fatalf("file descriptor limit[%d] of etcd process is too low, and should be set higher than %d to ensure internal usage", fdLimit, reservedInternalFDNum)
}
l = &transport.LimitedConnListener{Listener: l, RuntimeFDLimit: fdLimit - reservedInternalFDNum}
}
urlStr := u.String()
plog.Info("listening for client requests on ", urlStr)

View File

@ -0,0 +1,55 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package transport
import (
"errors"
"net"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/pkg/capnslog"
"github.com/coreos/etcd/pkg/runtime"
)
var plog = capnslog.NewPackageLogger("github.com/coreos/etcd/pkg", "transport")
type LimitedConnListener struct {
net.Listener
RuntimeFDLimit uint64
}
func (l *LimitedConnListener) Accept() (net.Conn, error) {
conn, err := l.Listener.Accept()
if err != nil {
return nil, err
}
n, err := runtime.FDUsage()
// Check whether fd number in use exceeds the set limit.
if err == nil && n >= l.RuntimeFDLimit {
conn.Close()
plog.Errorf("accept error: closing connection, exceed file descriptor usage limitation (fd limit=%d)", l.RuntimeFDLimit)
return nil, &acceptError{error: errors.New("exceed file descriptor usage limitation"), temporary: true}
}
return conn, nil
}
type acceptError struct {
error
temporary bool
}
func (e *acceptError) Timeout() bool { return false }
func (e *acceptError) Temporary() bool { return e.temporary }

View File

@ -0,0 +1,79 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package transport
import (
"net"
"net/http"
"net/http/httptest"
"testing"
"github.com/coreos/etcd/pkg/runtime"
)
func TestLimitedConnListenerAccept(t *testing.T) {
if _, err := runtime.FDUsage(); err != nil {
t.Skip("skip test due to unsupported runtime.FDUsage")
}
ln, err := net.Listen("tcp", ":0")
if err != nil {
t.Fatal(err)
}
fdNum, err := runtime.FDUsage()
if err != nil {
t.Fatal(err)
}
srv := &httptest.Server{
Listener: &LimitedConnListener{
Listener: ln,
RuntimeFDLimit: fdNum + 100,
},
Config: &http.Server{},
}
srv.Start()
defer srv.Close()
resp, err := http.Get(srv.URL)
defer resp.Body.Close()
if err != nil {
t.Fatalf("Get error = %v, want nil", err)
}
}
func TestLimitedConnListenerLimit(t *testing.T) {
if _, err := runtime.FDUsage(); err != nil {
t.Skip("skip test due to unsupported runtime.FDUsage")
}
ln, err := net.Listen("tcp", ":0")
if err != nil {
t.Fatal(err)
}
srv := &httptest.Server{
Listener: &LimitedConnListener{
Listener: ln,
RuntimeFDLimit: 0,
},
Config: &http.Server{},
}
srv.Start()
defer srv.Close()
_, err = http.Get(srv.URL)
if err == nil {
t.Fatalf("unexpected nil Get error")
}
}