Merge pull request #9548 from gyuho/functional-tester

functional-tester: clean up, handle Operation_SIGQUIT_ETCD_AND_REMOVE_DATA
This commit is contained in:
Gyuho Lee 2018-04-09 10:20:00 -07:00 committed by GitHub
commit 10a51a3003
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
66 changed files with 4489 additions and 3578 deletions

View File

@ -30,7 +30,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.0...v3.4.0) and [
- Furthermore, when `--auto-compaction-mode=periodic --auto-compaction-retention=30m` and writes per minute are about 1000, `v3.3.0`, `v3.3.1`, and `v3.3.2` compact revision 30000, 33000, and 36000, every 3 minutes, while `v3.3.3` *or later* compacts revision 30000, 60000, and 90000, every 30 minutes.
- Improve [lease expire/revoke operation performance](https://github.com/coreos/etcd/pull/9418), address [lease scalability issue](https://github.com/coreos/etcd/issues/9496).
- Make [Lease `Lookup` non-blocking with concurrent `Grant`/`Revoke`](https://github.com/coreos/etcd/pull/9229).
- Improve [functional tester](https://github.com/coreos/etcd/tree/master/tools/functional-tester) coverage: use [proxy layer to run network fault tests in CI](https://github.com/coreos/etcd/pull/9081), enable [TLS both for server and client](https://github.com/coreos/etcd/pull/9534), add [liveness mode](https://github.com/coreos/etcd/issues/9230), and [shuffle test sequence](https://github.com/coreos/etcd/issues/9381).
- Improve [functional tester](https://github.com/coreos/etcd/tree/master/functional) coverage: use [proxy layer to run network fault tests in CI](https://github.com/coreos/etcd/pull/9081), enable [TLS both for server and client](https://github.com/coreos/etcd/pull/9534), add [liveness mode](https://github.com/coreos/etcd/issues/9230), and [shuffle test sequence](https://github.com/coreos/etcd/issues/9381).
### Breaking Changes

View File

@ -22,7 +22,7 @@ RUN rm -rf ${GOROOT} \
RUN mkdir -p ${GOPATH}/src/github.com/coreos/etcd
ADD . ${GOPATH}/src/github.com/coreos/etcd
ADD ./tools/functional-tester/tester/local-test.yaml /local-test.yaml
ADD ./functional.yaml /functional.yaml
RUN go get -v github.com/coreos/gofail \
&& pushd ${GOPATH}/src/github.com/coreos/etcd \
@ -32,11 +32,11 @@ RUN go get -v github.com/coreos/gofail \
&& cp ./bin/etcdctl /bin/etcdctl \
&& GO_BUILD_FLAGS="-v" FAILPOINTS=1 ./build \
&& cp ./bin/etcd /bin/etcd-failpoints \
&& ./tools/functional-tester/build \
&& ./functional/build \
&& cp ./bin/etcd-agent /bin/etcd-agent \
&& cp ./bin/etcd-tester /bin/etcd-tester \
&& cp ./bin/etcd-proxy /bin/etcd-proxy \
&& cp ./bin/etcd-runner /bin/etcd-runner \
&& cp ./bin/etcd-tester /bin/etcd-tester \
&& go build -v -o /bin/benchmark ./tools/benchmark \
&& go build -v -o /bin/etcd-test-proxy ./tools/etcd-test-proxy \
&& popd \
&& rm -rf ${GOPATH}/src/github.com/coreos/etcd

View File

@ -469,46 +469,48 @@ docker-dns-srv-test-certs-wildcard-run:
# Example:
# make build-etcd-test-proxy
# make build-functional
# make build-docker-functional
# make push-docker-functional
# make pull-docker-functional
build-etcd-test-proxy:
go build -v -o ./bin/etcd-test-proxy ./tools/etcd-test-proxy
# Example:
# make build-docker-functional-tester
# make push-docker-functional-tester
# make pull-docker-functional-tester
build-docker-functional-tester:
build-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
@sed -i.bak 's|REPLACE_ME_GO_VERSION|$(GO_VERSION)|g' ./Dockerfile-functional-tester
./functional/build
./bin/etcd-agent -help || true && \
./bin/etcd-proxy -help || true && \
./bin/etcd-runner --help || true && \
./bin/etcd-tester -help || true
build-docker-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
@sed -i.bak 's|REPLACE_ME_GO_VERSION|$(GO_VERSION)|g' ./Dockerfile-functional
docker build \
--tag gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION) \
--file ./Dockerfile-functional-tester \
--tag gcr.io/etcd-development/etcd-functional:go$(GO_VERSION) \
--file ./Dockerfile-functional \
.
@mv ./Dockerfile-functional-tester.bak ./Dockerfile-functional-tester
@mv ./Dockerfile-functional.bak ./Dockerfile-functional
docker run \
--rm \
gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION) \
gcr.io/etcd-development/etcd-functional:go$(GO_VERSION) \
/bin/bash -c "./bin/etcd --version && \
./bin/etcd-failpoints --version && \
ETCDCTL_API=3 ./bin/etcdctl version && \
./bin/etcd-agent -help || true && \
./bin/etcd-tester -help || true && \
./bin/etcd-proxy -help || true && \
./bin/etcd-runner --help || true && \
./bin/benchmark --help || true && \
./bin/etcd-test-proxy -help || true"
./bin/etcd-tester -help || true && \
./bin/benchmark --help || true"
push-docker-functional-tester:
push-docker-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
gcloud docker -- push gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION)
gcloud docker -- push gcr.io/etcd-development/etcd-functional:go$(GO_VERSION)
pull-docker-functional-tester:
pull-docker-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
docker pull gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION)
docker pull gcr.io/etcd-development/etcd-functional:go$(GO_VERSION)

View File

@ -106,9 +106,9 @@ agent-configs:
initial-corrupt-check: true
tester-config:
tester-data-dir: /tmp/etcd-tester-data
tester-network: tcp
tester-addr: 127.0.0.1:9028
data-dir: /tmp/etcd-tester-data
network: tcp
addr: 127.0.0.1:9028
# slow enough to trigger election
delay-latency-ms: 5000
@ -119,13 +119,15 @@ tester-config:
consistency-check: true
enable-pprof: true
failure-delay-ms: 7000
failure-shuffle: true
failure-cases:
- KILL_ONE_FOLLOWER
- KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- KILL_LEADER
- KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT
- KILL_QUORUM
- KILL_ALL
- SIGTERM_ONE_FOLLOWER
- SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_LEADER
- SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_QUORUM
- SIGTERM_ALL
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_LEADER
@ -147,14 +149,11 @@ tester-config:
- NO_FAIL_WITH_STRESS
- NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS
failure-delay-ms: 7000
failure-shuffle: true
failpoint-commands:
- panic("etcd-tester")
# failpoint-commands:
# - panic("etcd-tester"),1*sleep(1000)
runner-exec-path: /etcd-runner
runner-exec-path: ./bin/etcd-runner
external-exec-path: ""
stress-types:

14
functional/Procfile-proxy Normal file
View File

@ -0,0 +1,14 @@
s1: bin/etcd --name s1 --data-dir /tmp/etcd-proxy-data.s1 --listen-client-urls http://127.0.0.1:1379 --advertise-client-urls http://127.0.0.1:13790 --listen-peer-urls http://127.0.0.1:1380 --initial-advertise-peer-urls http://127.0.0.1:13800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s1-client-proxy: bin/etcd-proxy --from localhost:13790 --to localhost:1379 --http-port 1378
s1-peer-proxy: bin/etcd-proxy --from localhost:13800 --to localhost:1380 --http-port 1381
s2: bin/etcd --name s2 --data-dir /tmp/etcd-proxy-data.s2 --listen-client-urls http://127.0.0.1:2379 --advertise-client-urls http://127.0.0.1:23790 --listen-peer-urls http://127.0.0.1:2380 --initial-advertise-peer-urls http://127.0.0.1:23800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s2-client-proxy: bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378
s2-peer-proxy: bin/etcd-proxy --from localhost:23800 --to localhost:2380 --http-port 2381
s3: bin/etcd --name s3 --data-dir /tmp/etcd-proxy-data.s3 --listen-client-urls http://127.0.0.1:3379 --advertise-client-urls http://127.0.0.1:33790 --listen-peer-urls http://127.0.0.1:3380 --initial-advertise-peer-urls http://127.0.0.1:33800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s3-client-proxy: bin/etcd-proxy --from localhost:33790 --to localhost:3379 --http-port 3378
s3-peer-proxy: bin/etcd-proxy --from localhost:33800 --to localhost:3380 --http-port 3381

View File

@ -1,4 +1,36 @@
#### etcd-test-proxy
## etcd Functional Testing
`functional` verifies the correct behavior of etcd under various system and network malfunctions. It sets up an etcd cluster under high pressure loads and continuously injects failures into the cluster. Then it expects the etcd cluster to recover within a few seconds. This has been extremely helpful to find critical bugs.
See [functional.yaml](../functional.yaml) for an example configuration.
### Run locally
```bash
PASSES=functional ./test
```
### Run with Docker
```bash
pushd ..
make build-docker-functional
popd
```
And run [example scripts](./scripts).
```bash
# run 3 agents for 3-node local etcd cluster
./scripts/docker-local-agent.sh 1
./scripts/docker-local-agent.sh 2
./scripts/docker-local-agent.sh 3
# to run only 1 tester round
./scripts/docker-local-tester.sh
```
## etcd Proxy
Proxy layer that simulates various network conditions.
@ -8,10 +40,10 @@ Test locally
$ ./build
$ ./bin/etcd
$ make build-etcd-test-proxy
$ make build-functional
$ ./bin/etcd-test-proxy --help
$ ./bin/etcd-test-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ./bin/etcd-proxy --help
$ ./bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:2379 put foo bar
$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:23790 put foo bar
@ -168,10 +200,10 @@ Trigger leader election
```bash
$ ./build
$ make build-etcd-test-proxy
$ make build-functional
$ rm -rf /tmp/etcd-test-proxy-data.s*
$ goreman -f ./tools/etcd-test-proxy/Procfile start
$ rm -rf /tmp/etcd-proxy-data.s*
$ goreman -f ./functional/Procfile-proxy start
$ ETCDCTL_API=3 ./bin/etcdctl \
--endpoints localhost:13790,localhost:23790,localhost:33790 \

View File

@ -25,9 +25,9 @@ import (
"syscall"
"time"
"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/coreos/etcd/pkg/proxy"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"go.uber.org/zap"
)
@ -36,32 +36,40 @@ import (
// return status error in response for wrong configuration/operation (e.g. start etcd twice)
func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response, err error) {
defer func() {
if err == nil {
if err == nil && req != nil {
srv.last = req.Operation
srv.lg.Info("handler success", zap.String("operation", req.Operation.String()))
}
}()
if req != nil {
srv.Member = req.Member
srv.Tester = req.Tester
}
switch req.Operation {
case rpcpb.Operation_InitialStartEtcd:
return srv.handleInitialStartEtcd(req)
case rpcpb.Operation_RestartEtcd:
return srv.handleRestartEtcd()
case rpcpb.Operation_KillEtcd:
return srv.handleKillEtcd()
case rpcpb.Operation_FailArchive:
return srv.handleFailArchive()
case rpcpb.Operation_DestroyEtcdAgent:
return srv.handleDestroyEtcdAgent()
case rpcpb.Operation_INITIAL_START_ETCD:
return srv.handle_INITIAL_START_ETCD(req)
case rpcpb.Operation_RESTART_ETCD:
return srv.handle_RESTART_ETCD()
case rpcpb.Operation_BlackholePeerPortTxRx:
return srv.handleBlackholePeerPortTxRx()
case rpcpb.Operation_UnblackholePeerPortTxRx:
return srv.handleUnblackholePeerPortTxRx()
case rpcpb.Operation_DelayPeerPortTxRx:
return srv.handleDelayPeerPortTxRx()
case rpcpb.Operation_UndelayPeerPortTxRx:
return srv.handleUndelayPeerPortTxRx()
case rpcpb.Operation_SIGTERM_ETCD:
return srv.handle_SIGTERM_ETCD()
case rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_REMOVE_DATA()
case rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA()
case rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT:
return srv.handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()
case rpcpb.Operation_BLACKHOLE_PEER_PORT_TX_RX:
return srv.handle_BLACKHOLE_PEER_PORT_TX_RX()
case rpcpb.Operation_UNBLACKHOLE_PEER_PORT_TX_RX:
return srv.handle_UNBLACKHOLE_PEER_PORT_TX_RX()
case rpcpb.Operation_DELAY_PEER_PORT_TX_RX:
return srv.handle_DELAY_PEER_PORT_TX_RX()
case rpcpb.Operation_UNDELAY_PEER_PORT_TX_RX:
return srv.handle_UNDELAY_PEER_PORT_TX_RX()
default:
msg := fmt.Sprintf("operation not found (%v)", req.Operation)
@ -69,18 +77,15 @@ func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response
}
}
func (srv *Server) handleInitialStartEtcd(req *rpcpb.Request) (*rpcpb.Response, error) {
if srv.last != rpcpb.Operation_NotStarted {
func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Response, error) {
if srv.last != rpcpb.Operation_NOT_STARTED {
return &rpcpb.Response{
Success: false,
Status: fmt.Sprintf("%q is not valid; last server operation was %q", rpcpb.Operation_InitialStartEtcd.String(), srv.last.String()),
Status: fmt.Sprintf("%q is not valid; last server operation was %q", rpcpb.Operation_INITIAL_START_ETCD.String(), srv.last.String()),
Member: req.Member,
}, nil
}
srv.Member = req.Member
srv.Tester = req.Tester
err := fileutil.TouchDirAll(srv.Member.BaseDir)
if err != nil {
return nil, err
@ -118,7 +123,6 @@ func (srv *Server) handleInitialStartEtcd(req *rpcpb.Request) (*rpcpb.Response,
}, nil
}
// TODO: support TLS
func (srv *Server) startProxy() error {
if srv.Member.EtcdClientProxy {
advertiseClientURL, advertiseClientURLPort, err := getURLAndPort(srv.Member.Etcd.AdvertiseClientURLs[0])
@ -236,45 +240,124 @@ func (srv *Server) creatEtcdCmd() {
srv.etcdCmd.Stderr = srv.etcdLogFile
}
// if started with manual TLS, stores TLS assets
// from tester/client to disk before starting etcd process
func (srv *Server) saveTLSAssets() error {
// if started with manual TLS, stores TLS assets
// from tester/client to disk before starting etcd process
// TODO: not implemented yet
if !srv.Member.Etcd.ClientAutoTLS {
if srv.Member.Etcd.ClientCertAuth {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientCertAuth is %v", srv.Member.Etcd.ClientCertAuth)
if srv.Member.PeerCertPath != "" {
if srv.Member.PeerCertData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.PeerCertPath)
}
if srv.Member.Etcd.ClientCertFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientCertFile is %q", srv.Member.Etcd.ClientCertFile)
}
if srv.Member.Etcd.ClientKeyFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientKeyFile is %q", srv.Member.Etcd.ClientKeyFile)
}
if srv.Member.Etcd.ClientTrustedCAFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientTrustedCAFile is %q", srv.Member.Etcd.ClientTrustedCAFile)
if err := ioutil.WriteFile(srv.Member.PeerCertPath, []byte(srv.Member.PeerCertData), 0644); err != nil {
return err
}
}
if !srv.Member.Etcd.PeerAutoTLS {
if srv.Member.Etcd.PeerClientCertAuth {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerClientCertAuth is %v", srv.Member.Etcd.PeerClientCertAuth)
if srv.Member.PeerKeyPath != "" {
if srv.Member.PeerKeyData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.PeerKeyPath)
}
if srv.Member.Etcd.PeerCertFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerCertFile is %q", srv.Member.Etcd.PeerCertFile)
if err := ioutil.WriteFile(srv.Member.PeerKeyPath, []byte(srv.Member.PeerKeyData), 0644); err != nil {
return err
}
if srv.Member.Etcd.PeerKeyFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerKeyFile is %q", srv.Member.Etcd.PeerKeyFile)
}
if srv.Member.PeerTrustedCAPath != "" {
if srv.Member.PeerTrustedCAData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.PeerTrustedCAPath)
}
if srv.Member.Etcd.PeerTrustedCAFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerTrustedCAFile is %q", srv.Member.Etcd.PeerTrustedCAFile)
if err := ioutil.WriteFile(srv.Member.PeerTrustedCAPath, []byte(srv.Member.PeerTrustedCAData), 0644); err != nil {
return err
}
}
if srv.Member.PeerCertPath != "" &&
srv.Member.PeerKeyPath != "" &&
srv.Member.PeerTrustedCAPath != "" {
srv.lg.Info(
"wrote",
zap.String("peer-cert", srv.Member.PeerCertPath),
zap.String("peer-key", srv.Member.PeerKeyPath),
zap.String("peer-trusted-ca", srv.Member.PeerTrustedCAPath),
)
}
if srv.Member.ClientCertPath != "" {
if srv.Member.ClientCertData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.ClientCertPath)
}
if err := ioutil.WriteFile(srv.Member.ClientCertPath, []byte(srv.Member.ClientCertData), 0644); err != nil {
return err
}
}
if srv.Member.ClientKeyPath != "" {
if srv.Member.ClientKeyData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.ClientKeyPath)
}
if err := ioutil.WriteFile(srv.Member.ClientKeyPath, []byte(srv.Member.ClientKeyData), 0644); err != nil {
return err
}
}
if srv.Member.ClientTrustedCAPath != "" {
if srv.Member.ClientTrustedCAData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.ClientTrustedCAPath)
}
if err := ioutil.WriteFile(srv.Member.ClientTrustedCAPath, []byte(srv.Member.ClientTrustedCAData), 0644); err != nil {
return err
}
}
if srv.Member.ClientCertPath != "" &&
srv.Member.ClientKeyPath != "" &&
srv.Member.ClientTrustedCAPath != "" {
srv.lg.Info(
"wrote",
zap.String("client-cert", srv.Member.ClientCertPath),
zap.String("client-key", srv.Member.ClientKeyPath),
zap.String("client-trusted-ca", srv.Member.ClientTrustedCAPath),
)
}
// TODO
return nil
}
func (srv *Server) loadAutoTLSAssets() error {
// if started with auto TLS, sends back TLS assets to tester/client
if srv.Member.Etcd.PeerAutoTLS {
// in case of slow disk
time.Sleep(time.Second)
fdir := filepath.Join(srv.Member.Etcd.DataDir, "fixtures", "peer")
srv.lg.Info(
"loading client auto TLS assets",
zap.String("dir", fdir),
zap.String("endpoint", srv.EtcdClientEndpoint),
)
certPath := filepath.Join(fdir, "cert.pem")
if !fileutil.Exist(certPath) {
return fmt.Errorf("cannot find %q", certPath)
}
certData, err := ioutil.ReadFile(certPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", certPath, err)
}
srv.Member.PeerCertData = string(certData)
keyPath := filepath.Join(fdir, "key.pem")
if !fileutil.Exist(keyPath) {
return fmt.Errorf("cannot find %q", keyPath)
}
keyData, err := ioutil.ReadFile(keyPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", keyPath, err)
}
srv.Member.PeerKeyData = string(keyData)
srv.lg.Info(
"loaded peer auto TLS assets",
zap.String("peer-cert-path", certPath),
zap.Int("peer-cert-length", len(certData)),
zap.String("peer-key-path", keyPath),
zap.Int("peer-key-length", len(keyData)),
)
}
if srv.Member.Etcd.ClientAutoTLS {
// in case of slow disk
time.Sleep(time.Second)
@ -315,46 +398,7 @@ func (srv *Server) loadAutoTLSAssets() error {
zap.Int("peer-key-length", len(keyData)),
)
}
if srv.Member.Etcd.ClientAutoTLS {
// in case of slow disk
time.Sleep(time.Second)
fdir := filepath.Join(srv.Member.Etcd.DataDir, "fixtures", "peer")
srv.lg.Info(
"loading client TLS assets",
zap.String("dir", fdir),
zap.String("endpoint", srv.EtcdClientEndpoint),
)
certPath := filepath.Join(fdir, "cert.pem")
if !fileutil.Exist(certPath) {
return fmt.Errorf("cannot find %q", certPath)
}
certData, err := ioutil.ReadFile(certPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", certPath, err)
}
srv.Member.PeerCertData = string(certData)
keyPath := filepath.Join(fdir, "key.pem")
if !fileutil.Exist(keyPath) {
return fmt.Errorf("cannot find %q", keyPath)
}
keyData, err := ioutil.ReadFile(keyPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", keyPath, err)
}
srv.Member.PeerKeyData = string(keyData)
srv.lg.Info(
"loaded peer TLS assets",
zap.String("peer-cert-path", certPath),
zap.Int("peer-cert-length", len(certData)),
zap.String("peer-key-path", keyPath),
zap.Int("peer-key-length", len(keyData)),
)
}
return nil
}
@ -363,10 +407,17 @@ func (srv *Server) startEtcdCmd() error {
return srv.etcdCmd.Start()
}
func (srv *Server) handleRestartEtcd() (*rpcpb.Response, error) {
func (srv *Server) handle_RESTART_ETCD() (*rpcpb.Response, error) {
var err error
if !fileutil.Exist(srv.Member.BaseDir) {
err = fileutil.TouchDirAll(srv.Member.BaseDir)
if err != nil {
return nil, err
}
}
srv.creatEtcdCmd()
var err error
if err = srv.saveTLSAssets(); err != nil {
return nil, err
}
@ -394,7 +445,7 @@ func (srv *Server) handleRestartEtcd() (*rpcpb.Response, error) {
}, nil
}
func (srv *Server) handleKillEtcd() (*rpcpb.Response, error) {
func (srv *Server) handle_SIGTERM_ETCD() (*rpcpb.Response, error) {
srv.stopProxy()
err := stopWithSig(srv.etcdCmd, syscall.SIGTERM)
@ -405,11 +456,32 @@ func (srv *Server) handleKillEtcd() (*rpcpb.Response, error) {
return &rpcpb.Response{
Success: true,
Status: "successfully killed etcd!",
Status: "killed etcd",
}, nil
}
func (srv *Server) handleFailArchive() (*rpcpb.Response, error) {
func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error) {
srv.stopProxy()
err := stopWithSig(srv.etcdCmd, syscall.SIGQUIT)
if err != nil {
return nil, err
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))
err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
return nil, err
}
srv.lg.Info("removed base directory", zap.String("dir", srv.Member.BaseDir))
return &rpcpb.Response{
Success: true,
Status: "killed etcd and removed base directory",
}, nil
}
func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, error) {
srv.stopProxy()
// exit with stacktrace
@ -444,17 +516,19 @@ func (srv *Server) handleFailArchive() (*rpcpb.Response, error) {
return &rpcpb.Response{
Success: true,
Status: "successfully cleaned up etcd!",
Status: "cleaned up etcd",
}, nil
}
// stop proxy, etcd, delete data directory
func (srv *Server) handleDestroyEtcdAgent() (*rpcpb.Response, error) {
err := stopWithSig(srv.etcdCmd, syscall.SIGTERM)
func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() (*rpcpb.Response, error) {
srv.stopProxy()
err := stopWithSig(srv.etcdCmd, syscall.SIGQUIT)
if err != nil {
return nil, err
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGTERM.String()))
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))
err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
@ -465,22 +539,13 @@ func (srv *Server) handleDestroyEtcdAgent() (*rpcpb.Response, error) {
// stop agent server
srv.Stop()
for port, px := range srv.advertiseClientPortToProxy {
err := px.Close()
srv.lg.Info("closed proxy", zap.Int("client-port", port), zap.Error(err))
}
for port, px := range srv.advertisePeerPortToProxy {
err := px.Close()
srv.lg.Info("closed proxy", zap.Int("peer-port", port), zap.Error(err))
}
return &rpcpb.Response{
Success: true,
Status: "successfully destroyed etcd and agent!",
Status: "destroyed etcd and agent",
}, nil
}
func (srv *Server) handleBlackholePeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_BLACKHOLE_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("blackholing", zap.Int("peer-port", port))
px.BlackholeTx()
@ -489,11 +554,11 @@ func (srv *Server) handleBlackholePeerPortTxRx() (*rpcpb.Response, error) {
}
return &rpcpb.Response{
Success: true,
Status: "successfully blackholed peer port tx/rx!",
Status: "blackholed peer port tx/rx",
}, nil
}
func (srv *Server) handleUnblackholePeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_UNBLACKHOLE_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("unblackholing", zap.Int("peer-port", port))
px.UnblackholeTx()
@ -502,11 +567,11 @@ func (srv *Server) handleUnblackholePeerPortTxRx() (*rpcpb.Response, error) {
}
return &rpcpb.Response{
Success: true,
Status: "successfully unblackholed peer port tx/rx!",
Status: "unblackholed peer port tx/rx",
}, nil
}
func (srv *Server) handleDelayPeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_DELAY_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
lat := time.Duration(srv.Tester.UpdatedDelayLatencyMs) * time.Millisecond
rv := time.Duration(srv.Tester.DelayLatencyMsRv) * time.Millisecond
@ -527,11 +592,11 @@ func (srv *Server) handleDelayPeerPortTxRx() (*rpcpb.Response, error) {
return &rpcpb.Response{
Success: true,
Status: "successfully delay peer port tx/rx!",
Status: "delayed peer port tx/rx",
}, nil
}
func (srv *Server) handleUndelayPeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_UNDELAY_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("undelaying", zap.Int("peer-port", port))
px.UndelayTx()
@ -540,6 +605,6 @@ func (srv *Server) handleUndelayPeerPortTxRx() (*rpcpb.Response, error) {
}
return &rpcpb.Response{
Success: true,
Status: "successfully undelay peer port tx/rx!",
Status: "undelayed peer port tx/rx",
}, nil
}

View File

@ -21,8 +21,8 @@ import (
"os/exec"
"strings"
"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/proxy"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"go.uber.org/zap"
"google.golang.org/grpc"
@ -64,7 +64,7 @@ func NewServer(
lg: lg,
network: network,
address: address,
last: rpcpb.Operation_NotStarted,
last: rpcpb.Operation_NOT_STARTED,
advertiseClientPortToProxy: make(map[int]proxy.Server),
advertisePeerPortToProxy: make(map[int]proxy.Server),
}

11
functional/build Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash
if ! [[ "$0" =~ "functional/build" ]]; then
echo "must be run from repository root"
exit 255
fi
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-agent ./functional/cmd/etcd-agent
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-proxy ./functional/cmd/etcd-proxy
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-runner ./functional/cmd/etcd-runner
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-tester ./functional/cmd/etcd-tester

View File

@ -18,7 +18,7 @@ package main
import (
"flag"
"github.com/coreos/etcd/tools/functional-tester/agent"
"github.com/coreos/etcd/functional/agent"
"go.uber.org/zap"
)

View File

@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// etcd-test-proxy is a proxy layer that simulates various network conditions.
// etcd-proxy is a proxy layer that simulates various network conditions.
package main
import (
@ -40,13 +40,13 @@ func main() {
// TODO: support TLS
flag.StringVar(&from, "from", "localhost:23790", "Address URL to proxy from.")
flag.StringVar(&to, "to", "localhost:2379", "Address URL to forward.")
flag.IntVar(&httpPort, "http-port", 2378, "Port to serve etcd-test-proxy API.")
flag.IntVar(&httpPort, "http-port", 2378, "Port to serve etcd-proxy API.")
flag.BoolVar(&verbose, "verbose", false, "'true' to run proxy in verbose mode.")
flag.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage of %q:\n", os.Args[0])
fmt.Fprintln(os.Stderr, `
etcd-test-proxy simulates various network conditions for etcd testing purposes.
etcd-proxy simulates various network conditions for etcd testing purposes.
See README.md for more examples.
Example:
@ -55,12 +55,12 @@ Example:
$ ./build
$ ./bin/etcd
# build etcd-test-proxy
$ make build-etcd-test-proxy
# build etcd-proxy
$ make build-etcd-proxy
# to test etcd with proxy layer
$ ./bin/etcd-test-proxy --help
$ ./bin/etcd-test-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ./bin/etcd-proxy --help
$ ./bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:2379 put foo bar
$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:23790 put foo bar`)

View File

@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// etcd-runner is a program for testing etcd clientv3 features against a fault injected cluster.
// etcd-runner is a program for testing etcd clientv3 features
// against a fault injected cluster.
package main
import "github.com/coreos/etcd/tools/functional-tester/runner"
import "github.com/coreos/etcd/functional/runner"
func main() {
runner.Start()

View File

@ -18,7 +18,7 @@ package main
import (
"flag"
"github.com/coreos/etcd/tools/functional-tester/tester"
"github.com/coreos/etcd/functional/tester"
"go.uber.org/zap"
)
@ -44,11 +44,11 @@ func main() {
logger.Fatal("failed to create a cluster", zap.Error(err))
}
err = clus.Bootstrap()
err = clus.Send_INITIAL_START_ETCD()
if err != nil {
logger.Fatal("Bootstrap failed", zap.Error(err))
}
defer clus.DestroyEtcdAgents()
defer clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()
logger.Info("wait health after bootstrap")
err = clus.WaitHealth()
@ -56,5 +56,5 @@ func main() {
logger.Fatal("WaitHealth failed", zap.Error(err))
}
clus.StartTester()
clus.Run()
}

480
functional/rpcpb/rpc.proto Normal file
View File

@ -0,0 +1,480 @@
syntax = "proto3";
package rpcpb;
import "github.com/gogo/protobuf/gogoproto/gogo.proto";
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;
message Request {
Operation Operation = 1;
// Member contains the same Member object from tester configuration.
Member Member = 2;
// Tester contains tester configuration.
Tester Tester = 3;
}
message Response {
bool Success = 1;
string Status = 2;
// Member contains the same Member object from tester request.
Member Member = 3;
}
service Transport {
rpc Transport(stream Request) returns (stream Response) {}
}
// Member defines one etcd member process managed by an agent,
// plus the client/peer TLS material the tester uses to reach it.
message Member {
// EtcdExecPath is the executable etcd binary path in agent server.
string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];
// TODO: support embedded etcd
// AgentAddr is the agent HTTP server address.
string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
// FailpointHTTPAddr is the agent's failpoints HTTP server address.
string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];
// BaseDir is the base directory where all logs and etcd data are stored.
string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
// EtcdLogPath is the log file to store current etcd server logs.
string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];
// EtcdClientProxy is true when client traffic needs to be proxied.
// If true, listen client URL port must be different than advertise client URL port.
bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
// EtcdPeerProxy is true when peer traffic needs to be proxied.
// If true, listen peer URL port must be different than advertise peer URL port.
bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];
// EtcdClientEndpoint is the etcd client endpoint.
string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
// Etcd defines etcd binary configuration flags.
Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
// ClientCertData contains cert file contents from this member's etcd server.
string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
// ClientCertPath is the client cert file path (presumably where
// ClientCertData is materialized on disk — TODO confirm).
string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
// ClientKeyData contains key file contents from this member's etcd server.
string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
// ClientKeyPath is the client key file path (presumably where
// ClientKeyData is materialized on disk — TODO confirm).
string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
// ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
// ClientTrustedCAPath is the client trusted CA file path (presumably where
// ClientTrustedCAData is materialized on disk — TODO confirm).
string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];
// PeerCertData contains cert file contents from this member's etcd server.
string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
// PeerCertPath is the peer cert file path (presumably where
// PeerCertData is materialized on disk — TODO confirm).
string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
// PeerKeyData contains key file contents from this member's etcd server.
string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
// PeerKeyPath is the peer key file path (presumably where
// PeerKeyData is materialized on disk — TODO confirm).
string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
// PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
// PeerTrustedCAPath is the peer trusted CA file path (presumably where
// PeerTrustedCAData is materialized on disk — TODO confirm).
string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
}
// Tester defines the tester configuration: its own server address,
// network-fault latency knobs, failure scheduling, and stresser settings.
message Tester {
// DataDir is the tester data directory.
string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
// Network is the network type for the tester server (e.g. "tcp" —
// TODO confirm against the listener setup).
string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
// Addr is the tester HTTP server address.
string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];
// DelayLatencyMs is the delay latency in milliseconds,
// to inject to simulated slow network.
uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
// DelayLatencyMsRv is the delay latency random variable in milliseconds.
uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
// UpdatedDelayLatencyMs is the update delay latency in milliseconds,
// to inject to simulated slow network. It's the final latency to apply,
// in case the latency numbers are randomly generated from given delay latency field.
uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];
// RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
// ExitOnFailure is true to exit the tester on first failure.
bool ExitOnFailure = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
// ConsistencyCheck is true to check consistency (revision, hash).
bool ConsistencyCheck = 23 [(gogoproto.moretags) = "yaml:\"consistency-check\""];
// EnablePprof is true to enable profiler.
bool EnablePprof = 24 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];
// FailureDelayMs is the delay duration after failure is injected.
// Useful when triggering snapshot or no-op failure cases.
uint32 FailureDelayMs = 31 [(gogoproto.moretags) = "yaml:\"failure-delay-ms\""];
// FailureShuffle is true to randomize failure injecting order.
bool FailureShuffle = 32 [(gogoproto.moretags) = "yaml:\"failure-shuffle\""];
// FailureCases is the selected test cases to schedule.
// If empty, run all failure cases.
repeated string FailureCases = 33 [(gogoproto.moretags) = "yaml:\"failure-cases\""];
// FailpointCommands is the list of "gofail" commands
// (e.g. panic("etcd-tester"), 1*sleep(1000)).
repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];
// RunnerExecPath is a path of etcd-runner binary.
string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
// ExternalExecPath is a path of script for enabling/disabling an external fault injector.
string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];
// StressTypes is the list of stresser names:
// keys, lease, nop, election-runner, watch-runner, lock-racer-runner, lease-runner.
repeated string StressTypes = 101 [(gogoproto.moretags) = "yaml:\"stress-types\""];
// StressKeySize is the size of each small key written into etcd.
int32 StressKeySize = 102 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
// StressKeySizeLarge is the size of each large key written into etcd.
int32 StressKeySizeLarge = 103 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
// StressKeySuffixRange is the count of key range written into etcd.
// Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)".
int32 StressKeySuffixRange = 104 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
// StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
// Stress keys are created with "fmt.Sprintf("/k%03d", i)".
int32 StressKeySuffixRangeTxn = 105 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
// StressKeyTxnOps is the number of operations per a transaction (max 64).
int32 StressKeyTxnOps = 106 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];
// StressClients is the number of concurrent stressing clients
// with "one" shared TCP connection.
int32 StressClients = 201 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
// StressQPS is the maximum number of stresser requests per second.
int32 StressQPS = 202 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}
// Etcd defines etcd server configuration flags for one member.
// Field comments carry the corresponding etcd command-line flag via
// the yaml tag (e.g. HeartbeatIntervalMs maps to "heartbeat-interval").
message Etcd {
// Name is the human-readable member name ("--name").
string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
// DataDir is the etcd data directory ("--data-dir").
string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
// WALDir is the dedicated WAL directory ("--wal-dir").
string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];
// HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
// Default value is 100, which is 100ms.
int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
// ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
// Default value is 1000, which is 1s.
int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];
// Client-side URL and TLS settings.
repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];
// Peer-side URL and TLS settings.
repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];
// Cluster bootstrap settings.
string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];
int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];
bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
}
// Operation defines the operations the tester requests an agent to run
// against its local etcd process (sent over the Transport stream).
enum Operation {
// NOT_STARTED is the agent status before etcd first start.
// NOTE(review): described as a status rather than an operation; kept as
// the required zero-value default of the enum.
NOT_STARTED = 0;
// INITIAL_START_ETCD is only called to start etcd, the very first time.
INITIAL_START_ETCD = 10;
// RESTART_ETCD is sent to restart killed etcd.
RESTART_ETCD = 11;
// SIGTERM_ETCD pauses etcd process while keeping data directories
// and previous etcd configurations.
SIGTERM_ETCD = 20;
// SIGQUIT_ETCD_AND_REMOVE_DATA kills etcd process and removes all data
// directories to simulate destroying the whole machine.
SIGQUIT_ETCD_AND_REMOVE_DATA = 21;
// SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when consistency check failed,
// thus need to archive etcd data directories.
SIGQUIT_ETCD_AND_ARCHIVE_DATA = 30;
// SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys etcd process,
// etcd data, and agent server.
SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 31;
// BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
// the peer port on target member's peer port.
BLACKHOLE_PEER_PORT_TX_RX = 100;
// UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
UNBLACKHOLE_PEER_PORT_TX_RX = 101;
// DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
// the peer port on target member's peer port.
DELAY_PEER_PORT_TX_RX = 200;
// UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
UNDELAY_PEER_PORT_TX_RX = 201;
}
// FailureCase defines various system faults in distributed systems,
// in order to verify correct behavior of etcd servers and clients.
// Cases are grouped by value range: 0-5 process signals, 100s packet
// blackholes, 200s packet delays, 300s no-op cases, 400s failpoints,
// 500s external injectors.
enum FailureCase {
// SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
// but does not delete its data directories on disk for next restart.
// It waits "failure-delay-ms" before recovering this failure.
// The expected behavior is that the follower comes back online
// and rejoins the cluster, and then each member continues to process
// client requests ('Put' request that requires Raft consensus).
SIGTERM_ONE_FOLLOWER = 0;
// SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
// follower but does not delete its data directories on disk for next
// restart. And waits until most up-to-date node (leader) applies the
// snapshot count of entries since the stop operation.
// The expected behavior is that the follower comes back online and
// rejoins the cluster, and then active leader sends snapshot
// to the follower to force it to follow the leader's log.
// As always, after recovery, each member must be able to process
// client requests.
SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;
// SIGTERM_LEADER stops the active leader node but does not delete its
// data directories on disk for next restart. Then it waits
// "failure-delay-ms" before recovering this failure, in order to
// trigger election timeouts.
// The expected behavior is that a new leader gets elected, and the
// old leader comes back online and rejoins the cluster as a follower.
// As always, after recovery, each member must be able to process
// client requests.
SIGTERM_LEADER = 2;
// SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
// but does not delete its data directories on disk for next restart.
// And waits until most up-to-date node ("new" leader) applies the
// snapshot count of entries since the stop operation.
// The expected behavior is that cluster elects a new leader, and the
// old leader comes back online and rejoins the cluster as a follower.
// And it receives the snapshot from the new leader to overwrite its
// store. As always, after recovery, each member must be able to
// process client requests.
SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;
// SIGTERM_QUORUM stops majority number of nodes to make the whole cluster
// inoperable but does not delete data directories on stopped nodes
// for next restart. And it waits "failure-delay-ms" before recovering
// this failure.
// The expected behavior is that nodes come back online, thus cluster
// comes back operative as well. As always, after recovery, each member
// must be able to process client requests.
SIGTERM_QUORUM = 4;
// SIGTERM_ALL stops the whole cluster but does not delete data directories
// on disk for next restart. And it waits "failure-delay-ms" before
// recovering this failure.
// The expected behavior is that nodes come back online, thus cluster
// comes back operative as well. As always, after recovery, each member
// must be able to process client requests.
SIGTERM_ALL = 5;
// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader), and waits for "failure-delay-ms" until recovery.
// The expected behavior is that once dropping operation is undone,
// each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;
// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
// all outgoing/incoming packets from/to the peer port on a randomly
// chosen follower (non-leader), and waits for most up-to-date node
// (leader) applies the snapshot count of entries since the blackhole
// operation.
// The expected behavior is that once packet drop operation is undone,
// the slow follower tries to catch up, possibly receiving the snapshot
// from the active leader. As always, after recovery, each member must
// be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;
// BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
// from/to the peer port on the active leader (isolated), and waits for
// "failure-delay-ms" until recovery, in order to trigger election timeout.
// The expected behavior is that after election timeout, a new leader gets
// elected, and once dropping operation is undone, the old leader comes
// back and rejoins the cluster as a follower. As always, after recovery,
// each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;
// BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
// outgoing/incoming packets from/to the peer port on the active leader,
// and waits for most up-to-date node (leader) applies the snapshot
// count of entries since the blackhole operation.
// The expected behavior is that cluster elects a new leader, and once
// dropping operation is undone, the old leader comes back and rejoins
// the cluster as a follower. The slow follower tries to catch up, likely
// receiving the snapshot from the new active leader. As always, after
// recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;
// BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
// from/to the peer ports on majority nodes of cluster, thus losing its
// leader and cluster being inoperable. And it waits for "failure-delay-ms"
// until recovery.
// The expected behavior is that once packet drop operation is undone,
// nodes come back online, thus cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;
// BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
// from/to the peer ports on all nodes, thus making cluster totally
// inoperable. It waits for "failure-delay-ms" until recovery.
// The expected behavior is that once packet drop operation is undone,
// nodes come back online, thus cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;
// DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
// from/to the peer port on a randomly chosen follower (non-leader).
// It waits for "failure-delay-ms" until recovery.
// The expected behavior is that once packet delay operation is undone,
// the follower comes back and tries to catch up with latest changes from
// cluster. And as always, after recovery, each member must be able to
// process client requests.
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;
// RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader) with a randomized time duration (thus isolated). It waits
// for "failure-delay-ms" until recovery.
// The expected behavior is that once packet delay operation is undone,
// each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;
// DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on a randomly chosen
// follower (non-leader), and waits for most up-to-date node (leader)
// applies the snapshot count of entries since the delay operation.
// The expected behavior is that the delayed follower gets isolated
// and behind the current active leader, and once delay operation is undone,
// the slow follower comes back and catches up possibly receiving snapshot
// from the active leader. As always, after recovery, each member must be
// able to process client requests.
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;
// RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on a randomly chosen
// follower (non-leader) with a randomized time duration, and waits for
// most up-to-date node (leader) applies the snapshot count of entries
// since the delay operation.
// The expected behavior is that the delayed follower gets isolated
// and behind the current active leader, and once delay operation is undone,
// the slow follower comes back and catches up, possibly receiving a
// snapshot from the active leader. As always, after recovery, each member
// must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;
// DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
// the peer port on the active leader. And waits for "failure-delay-ms"
// until recovery.
// The expected behavior is that cluster may elect a new leader, and
// once packet delay operation is undone, the (old) leader comes back
// and tries to catch up with latest changes from cluster. As always,
// after recovery, each member must be able to process client requests.
DELAY_PEER_PORT_TX_RX_LEADER = 204;
// RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
// from/to the peer port on the active leader with a randomized time
// duration. And waits for "failure-delay-ms" until recovery.
// The expected behavior is that cluster may elect a new leader, and
// once packet delay operation is undone, the (old) leader comes back
// and tries to catch up with latest changes from cluster. As always,
// after recovery, each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;
// DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on the active leader,
// and waits for most up-to-date node (current or new leader) applies the
// snapshot count of entries since the delay operation.
// The expected behavior is that cluster may elect a new leader, and
// the old leader gets isolated and behind the current active leader,
// and once delay operation is undone, the slow follower comes back
// and catches up, likely receiving a snapshot from the active leader.
// As always, after recovery, each member must be able to process client
// requests.
DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;
// RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on the active leader,
// with a randomized time duration. And it waits for most up-to-date node
// (current or new leader) applies the snapshot count of entries since the
// delay operation.
// The expected behavior is that cluster may elect a new leader, and
// the old leader gets isolated and behind the current active leader,
// and once delay operation is undone, the slow follower comes back
// and catches up, likely receiving a snapshot from the active leader.
// As always, after recovery, each member must be able to process client
// requests.
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;
// DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
// the peer ports on majority nodes of cluster. And it waits for
// "failure-delay-ms" until recovery, likely to trigger election timeouts.
// The expected behavior is that cluster may elect a new leader, while
// quorum of nodes struggle with slow networks, and once delay operation
// is undone, nodes come back and cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
DELAY_PEER_PORT_TX_RX_QUORUM = 208;
// RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
// from/to the peer ports on majority nodes of cluster, with randomized
// time durations. And it waits for "failure-delay-ms" until recovery,
// likely to trigger election timeouts.
// The expected behavior is that cluster may elect a new leader, while
// quorum of nodes struggle with slow networks, and once delay operation
// is undone, nodes come back and cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;
// DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
// peer ports on all nodes. And it waits for "failure-delay-ms" until
// recovery, likely to trigger election timeouts.
// The expected behavior is that cluster may become totally inoperable,
// struggling with slow networks across the whole cluster. Once delay
// operation is undone, nodes come back and cluster comes back operative.
// As always, after recovery, each member must be able to process client
// requests.
DELAY_PEER_PORT_TX_RX_ALL = 210;
// RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
// from/to the peer ports on all nodes, with randomized time durations.
// And it waits for "failure-delay-ms" until recovery, likely to trigger
// election timeouts.
// The expected behavior is that cluster may become totally inoperable,
// struggling with slow networks across the whole cluster. Once delay
// operation is undone, nodes come back and cluster comes back operative.
// As always, after recovery, each member must be able to process client
// requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;
// NO_FAIL_WITH_STRESS runs no-op failure injection that does not do
// anything against cluster for "failure-delay-ms" duration, while
// stressers are still sending requests.
NO_FAIL_WITH_STRESS = 300;
// NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS runs no-op failure injection
// that does not do anything against cluster for "failure-delay-ms"
// duration, while all stressers are stopped.
NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;
// FAILPOINTS injects failpoints to etcd server runtime, triggering panics
// in critical code paths. (Presumably requires etcd binaries built with
// "gofail" — TODO confirm; see Tester.FailpointCommands.)
FAILPOINTS = 400;
// EXTERNAL runs external failure injection scripts.
EXTERNAL = 500;
}
// StressType defines the stresser workloads the tester can run; the
// names correspond to the strings accepted by Tester.StressTypes
// (presumably: keys, lease, and the etcd-runner based stressers —
// TODO confirm the exact string mapping).
enum StressType {
// KV stresses the key-value store.
KV = 0;
// LEASE stresses lease grant/revoke operations.
LEASE = 1;
// ELECTION_RUNNER runs the election stresser via etcd-runner.
ELECTION_RUNNER = 2;
// WATCH_RUNNER runs the watch stresser via etcd-runner.
WATCH_RUNNER = 3;
// LOCK_RACER_RUNNER runs the lock racer stresser via etcd-runner.
LOCK_RACER_RUNNER = 4;
// LEASE_RUNNER runs the lease stresser via etcd-runner.
LEASE_RUNNER = 5;
}

View File

@ -8,7 +8,7 @@
COMMENT
if ! [[ "${0}" =~ "scripts/docker-local-agent.sh" ]]; then
echo "must be run from tools/functional-tester"
echo "must be run from functional"
exit 255
fi

View File

@ -1,7 +1,7 @@
#!/usr/bin/env bash
if ! [[ "${0}" =~ "scripts/docker-local-tester.sh" ]]; then
echo "must be run from tools/functional-tester"
echo "must be run from functional"
exit 255
fi
@ -15,4 +15,4 @@ docker run \
--net=host \
--name tester \
gcr.io/etcd-development/etcd-functional-tester:go${GO_VERSION} \
/bin/bash -c "./bin/etcd-tester --config ./local-test.yaml"
/bin/bash -c "./bin/etcd-tester --config ./functional.yaml"

View File

@ -21,7 +21,7 @@ import (
"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
"google.golang.org/grpc"

View File

@ -18,23 +18,24 @@ import (
"context"
"errors"
"fmt"
"io"
"io/ioutil"
"math/rand"
"net/http"
"net/url"
"path/filepath"
"strings"
"sync"
"time"
"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/debugutil"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
"golang.org/x/time/rate"
"google.golang.org/grpc"
yaml "gopkg.in/yaml.v2"
)
// Cluster defines tester cluster.
@ -62,221 +63,6 @@ type Cluster struct {
cs int
}
// newCluster reads and validates a tester cluster configuration from the
// YAML file at fpath. It fills in derived defaults (per-member WALDir,
// UpdatedDelayLatencyMs) and returns an error on the first invalid field.
//
// Fixes over the previous version:
//   - "has valid URL" error messages now correctly read "has invalid URL"
//     (the branch fires when isValidURL returns false);
//   - the ClientCertAuth error now reports ClientCertFile instead of
//     PeerCertFile;
//   - the duplicate empty-DataDir check was removed;
//   - member-scoped errors include the actual member index.
func newCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
	bts, err := ioutil.ReadFile(fpath)
	if err != nil {
		return nil, err
	}
	lg.Info("opened configuration file", zap.String("path", fpath))

	clus := &Cluster{lg: lg}
	if err = yaml.Unmarshal(bts, clus); err != nil {
		return nil, err
	}

	for i, mem := range clus.Members {
		if mem.BaseDir == "" {
			return nil, fmt.Errorf("Members[%d].BaseDir cannot be empty (got %q)", i, mem.BaseDir)
		}
		if mem.EtcdLogPath == "" {
			return nil, fmt.Errorf("Members[%d].EtcdLogPath cannot be empty (got %q)", i, mem.EtcdLogPath)
		}
		if mem.Etcd.Name == "" {
			return nil, fmt.Errorf("'--name' cannot be empty (got %+v)", mem)
		}
		if mem.Etcd.DataDir == "" {
			return nil, fmt.Errorf("'--data-dir' cannot be empty (got %+v)", mem)
		}
		if mem.Etcd.SnapshotCount == 0 {
			return nil, fmt.Errorf("'--snapshot-count' cannot be 0 (got %+v)", mem.Etcd.SnapshotCount)
		}
		if mem.Etcd.WALDir == "" {
			// default WAL directory lives inside the data directory
			clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
		}
		if mem.Etcd.HeartbeatIntervalMs == 0 {
			return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
		}
		if mem.Etcd.ElectionTimeoutMs == 0 {
			return nil, fmt.Errorf("'--election-timeout' cannot be 0 (got %+v)", mem.Etcd)
		}
		// injected delay must exceed the election timeout, otherwise the
		// delay case cannot trigger elections
		if int64(clus.Tester.DelayLatencyMs) <= mem.Etcd.ElectionTimeoutMs {
			return nil, fmt.Errorf("delay latency %d ms must be greater than election timeout %d ms", clus.Tester.DelayLatencyMs, mem.Etcd.ElectionTimeoutMs)
		}

		port := ""
		listenClientPorts := make([]string, len(clus.Members))
		for i, u := range mem.Etcd.ListenClientURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--listen-client-urls' has invalid URL %q", u)
			}
			listenClientPorts[i], err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--listen-client-urls' has no port %q", u)
			}
		}
		for i, u := range mem.Etcd.AdvertiseClientURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--advertise-client-urls' has invalid URL %q", u)
			}
			port, err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--advertise-client-urls' has no port %q", u)
			}
			// when proxied, advertise and listen client ports must differ
			if mem.EtcdClientProxy && listenClientPorts[i] == port {
				return nil, fmt.Errorf("clus.Members[%d] requires client port proxy, but advertise port %q conflicts with listener port %q", i, port, listenClientPorts[i])
			}
		}
		listenPeerPorts := make([]string, len(clus.Members))
		for i, u := range mem.Etcd.ListenPeerURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--listen-peer-urls' has invalid URL %q", u)
			}
			listenPeerPorts[i], err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--listen-peer-urls' has no port %q", u)
			}
		}
		for j, u := range mem.Etcd.AdvertisePeerURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--initial-advertise-peer-urls' has invalid URL %q", u)
			}
			port, err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--initial-advertise-peer-urls' has no port %q", u)
			}
			// when proxied, advertise and listen peer ports must differ
			if mem.EtcdPeerProxy && listenPeerPorts[j] == port {
				return nil, fmt.Errorf("clus.Members[%d] requires peer port proxy, but advertise port %q conflicts with listener port %q", i, port, listenPeerPorts[j])
			}
		}

		// all per-member paths must live under BaseDir so failure-archive
		// can move them as a unit
		if !strings.HasPrefix(mem.EtcdLogPath, mem.BaseDir) {
			return nil, fmt.Errorf("EtcdLogPath must be prefixed with BaseDir (got %q)", mem.EtcdLogPath)
		}
		if !strings.HasPrefix(mem.Etcd.DataDir, mem.BaseDir) {
			return nil, fmt.Errorf("Etcd.DataDir must be prefixed with BaseDir (got %q)", mem.Etcd.DataDir)
		}
		// TODO: support separate WALDir that can be handled via failure-archive
		if !strings.HasPrefix(mem.Etcd.WALDir, mem.BaseDir) {
			return nil, fmt.Errorf("Etcd.WALDir must be prefixed with BaseDir (got %q)", mem.Etcd.WALDir)
		}

		// TODO: only support generated certs with TLS generator
		// deprecate auto TLS
		if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientCertAuth {
			return nil, fmt.Errorf("Etcd.ClientAutoTLS and Etcd.ClientCertAuth are both 'true'")
		}
		if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientCertFile != "" {
			return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
		}
		if mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile == "" {
			return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
		}
		if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientKeyFile != "" {
			return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
		}
		if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientTrustedCAFile != "" {
			return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
		}
		if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerClientCertAuth {
			return nil, fmt.Errorf("Etcd.PeerAutoTLS and Etcd.PeerClientCertAuth are both 'true'")
		}
		if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerCertFile != "" {
			return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
		}
		if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile == "" {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
		}
		if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerKeyFile != "" {
			return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
		}
		if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerTrustedCAFile != "" {
			return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
		}

		// when client TLS is enabled, every client URL must be "https"
		if mem.Etcd.ClientAutoTLS || mem.Etcd.ClientCertFile != "" {
			for _, cu := range mem.Etcd.ListenClientURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
				}
			}
			for _, cu := range mem.Etcd.AdvertiseClientURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
				}
			}
		}
		// when peer TLS is enabled, every peer URL must be "https"
		if mem.Etcd.PeerAutoTLS || mem.Etcd.PeerCertFile != "" {
			for _, cu := range mem.Etcd.ListenPeerURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
				}
			}
			for _, cu := range mem.Etcd.AdvertisePeerURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
				}
			}
		}
	}

	if len(clus.Tester.FailureCases) == 0 {
		return nil, errors.New("FailureCases not found")
	}
	if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
		return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
	}
	if clus.Tester.UpdatedDelayLatencyMs == 0 {
		clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
	}

	for _, v := range clus.Tester.FailureCases {
		if _, ok := rpcpb.FailureCase_value[v]; !ok {
			return nil, fmt.Errorf("%q is not defined in 'rpcpb.FailureCase_value'", v)
		}
	}
	for _, v := range clus.Tester.StressTypes {
		if _, ok := rpcpb.StressType_value[v]; !ok {
			return nil, fmt.Errorf("StressType is unknown; got %q", v)
		}
	}
	if clus.Tester.StressKeySuffixRangeTxn > 100 {
		return nil, fmt.Errorf("StressKeySuffixRangeTxn maximum value is 100, got %v", clus.Tester.StressKeySuffixRangeTxn)
	}
	if clus.Tester.StressKeyTxnOps > 64 {
		return nil, fmt.Errorf("StressKeyTxnOps maximum value is 64, got %v", clus.Tester.StressKeyTxnOps)
	}

	return clus, err
}
var dialOpts = []grpc.DialOption{
grpc.WithInsecure(),
grpc.WithTimeout(5 * time.Second),
@ -285,7 +71,7 @@ var dialOpts = []grpc.DialOption{
// NewCluster creates a client from a tester configuration.
func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
clus, err := newCluster(lg, fpath)
clus, err := read(lg, fpath)
if err != nil {
return nil, err
}
@ -320,7 +106,7 @@ func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
}
}
clus.testerHTTPServer = &http.Server{
Addr: clus.Tester.TesterAddr,
Addr: clus.Tester.Addr,
Handler: mux,
}
go clus.serveTesterServer()
@ -340,12 +126,12 @@ func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
func (clus *Cluster) serveTesterServer() {
clus.lg.Info(
"started tester HTTP server",
zap.String("tester-address", clus.Tester.TesterAddr),
zap.String("tester-address", clus.Tester.Addr),
)
err := clus.testerHTTPServer.ListenAndServe()
clus.lg.Info(
"tester HTTP server returned",
zap.String("tester-address", clus.Tester.TesterAddr),
zap.String("tester-address", clus.Tester.Addr),
zap.Error(err),
)
if err != nil && err != http.ErrServerClosed {
@ -356,70 +142,98 @@ func (clus *Cluster) serveTesterServer() {
func (clus *Cluster) updateFailures() {
for _, cs := range clus.Tester.FailureCases {
switch cs {
case "KILL_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureKillOneFollower())
case "KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureKillOneFollowerUntilTriggerSnapshot())
case "KILL_LEADER":
clus.failures = append(clus.failures, newFailureKillLeader())
case "KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureKillLeaderUntilTriggerSnapshot())
case "KILL_QUORUM":
clus.failures = append(clus.failures, newFailureKillQuorum())
case "KILL_ALL":
clus.failures = append(clus.failures, newFailureKillAll())
case "SIGTERM_ONE_FOLLOWER":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ONE_FOLLOWER(clus))
case "SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGTERM_LEADER":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_LEADER(clus))
case "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGTERM_QUORUM":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_QUORUM(clus))
case "SIGTERM_ALL":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ALL(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollower(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot())
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT())
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeader(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot())
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT())
case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxQuorum(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxAll(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus))
case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, true))
case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
case "DELAY_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM(clus, true))
case "DELAY_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ALL(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ALL(clus, true))
case "NO_FAIL_WITH_STRESS":
clus.failures = append(clus.failures, newFailureNoFailWithStress(clus))
clus.failures = append(clus.failures,
new_FailureCase_NO_FAIL_WITH_STRESS(clus))
case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS":
clus.failures = append(clus.failures, newFailureNoFailWithNoStressForLiveness(clus))
clus.failures = append(clus.failures,
new_FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus))
case "EXTERNAL":
clus.failures = append(clus.failures, newFailureExternal(clus.Tester.ExternalExecPath))
clus.failures = append(clus.failures,
new_FailureCase_EXTERNAL(clus.Tester.ExternalExecPath))
case "FAILPOINTS":
fpFailures, fperr := failpointFailures(clus)
if len(fpFailures) == 0 {
clus.lg.Info("no failpoints found!", zap.Error(fperr))
}
clus.failures = append(clus.failures, fpFailures...)
clus.failures = append(clus.failures,
fpFailures...)
}
}
}
@ -444,48 +258,6 @@ func (clus *Cluster) UpdateDelayLatencyMs() {
}
}
// shuffleFailures permutes clus.failures in pseudo-random order.
// Indices are visited with a stride that is coprime to the slice
// length, plus a random offset, so every element appears exactly once
// and consecutive picks are guaranteed to be distinct.
func (clus *Cluster) shuffleFailures() {
	rand.Seed(time.Now().UnixNano())
	offset := rand.Intn(1000)

	n := len(clus.failures)
	step := coprime(n)
	shuffled := make([]Failure, 0, n)
	for idx := 0; idx < n; idx++ {
		shuffled = append(shuffled, clus.failures[(step*idx+offset)%n])
	}
	clus.failures = shuffled

	clus.lg.Info("shuffled test failure cases", zap.Int("total", n))
}
/*
x and y of GCD 1 are coprime to each other
x1 = ( coprime of n * idx1 + offset ) % n
x2 = ( coprime of n * idx2 + offset ) % n
(x2 - x1) = coprime of n * (idx2 - idx1) % n
= (idx2 - idx1) = 1
Consecutive x's are guaranteed to be distinct
*/
// coprime returns the smallest integer in [n/2, n) that is coprime
// with n (their GCD is 1). If no such integer exists, it returns 1.
//
// A stride coprime to n makes (stride*i + offset) % n a permutation of
// [0, n), which is what shuffleFailures relies on.
func coprime(n int) int {
	for cand := n / 2; cand < n; cand++ {
		// iterative Euclid's algorithm, inlined
		a, b := cand, n
		for b != 0 {
			a, b = b, a%b
		}
		if a == 1 {
			return cand
		}
	}
	return 1
}
// gcd returns the greatest common divisor of x and y using the
// iterative form of Euclid's algorithm.
func gcd(x, y int) int {
	for y != 0 {
		x, y = y, x%y
	}
	return x
}
func (clus *Cluster) updateStresserChecker() {
cs := &compositeStresser{}
for _, m := range clus.Members {
@ -502,11 +274,7 @@ func (clus *Cluster) updateStresserChecker() {
clus.checker = newNoChecker()
}
clus.lg.Info(
"updated stressers",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
)
clus.lg.Info("updated stressers")
}
func (clus *Cluster) checkConsistency() (err error) {
@ -542,45 +310,74 @@ func (clus *Cluster) checkConsistency() (err error) {
return err
}
// Bootstrap bootstraps etcd cluster the very first time.
// Send_INITIAL_START_ETCD bootstraps etcd cluster the very first time.
// After this, just continue to call kill/restart.
func (clus *Cluster) Bootstrap() error {
func (clus *Cluster) Send_INITIAL_START_ETCD() error {
// this is the only time that creates request from scratch
return clus.broadcastOperation(rpcpb.Operation_InitialStartEtcd)
return clus.broadcast(rpcpb.Operation_INITIAL_START_ETCD)
}
// FailArchive sends "FailArchive" operation.
func (clus *Cluster) FailArchive() error {
return clus.broadcastOperation(rpcpb.Operation_FailArchive)
// send_SIGQUIT_ETCD_AND_ARCHIVE_DATA sends "send_SIGQUIT_ETCD_AND_ARCHIVE_DATA" operation.
func (clus *Cluster) send_SIGQUIT_ETCD_AND_ARCHIVE_DATA() error {
return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA)
}
// Restart sends "Restart" operation.
func (clus *Cluster) Restart() error {
return clus.broadcastOperation(rpcpb.Operation_RestartEtcd)
// send_RESTART_ETCD sends restart operation.
func (clus *Cluster) send_RESTART_ETCD() error {
return clus.broadcast(rpcpb.Operation_RESTART_ETCD)
}
func (clus *Cluster) broadcastOperation(op rpcpb.Operation) error {
func (clus *Cluster) broadcast(op rpcpb.Operation) error {
var wg sync.WaitGroup
wg.Add(len(clus.agentStreams))
errc := make(chan error, len(clus.agentStreams))
for i := range clus.agentStreams {
err := clus.sendOperation(i, op)
go func(idx int, o rpcpb.Operation) {
defer wg.Done()
errc <- clus.sendOp(idx, o)
}(i, op)
}
wg.Wait()
close(errc)
errs := []string{}
for err := range errc {
if err == nil {
continue
}
if err != nil {
if op == rpcpb.Operation_DestroyEtcdAgent &&
strings.Contains(err.Error(), "rpc error: code = Unavailable desc = transport is closing") {
// agent server has already closed;
// so this error is expected
clus.lg.Info(
"successfully destroyed",
zap.String("member", clus.Members[i].EtcdClientEndpoint),
)
continue
destroyed := false
if op == rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT {
if err == io.EOF {
destroyed = true
}
if strings.Contains(err.Error(),
"rpc error: code = Unavailable desc = transport is closing") {
// agent server has already closed;
// so this error is expected
destroyed = true
}
if strings.Contains(err.Error(),
"desc = os: process already finished") {
destroyed = true
}
}
if !destroyed {
errs = append(errs, err.Error())
}
return err
}
}
return nil
if len(errs) == 0 {
return nil
}
return errors.New(strings.Join(errs, ", "))
}
func (clus *Cluster) sendOperation(idx int, op rpcpb.Operation) error {
if op == rpcpb.Operation_InitialStartEtcd {
func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
if op == rpcpb.Operation_INITIAL_START_ETCD {
clus.agentRequests[idx] = &rpcpb.Request{
Operation: op,
Member: clus.Members[idx],
@ -639,9 +436,9 @@ func (clus *Cluster) sendOperation(idx int, op rpcpb.Operation) error {
}
// store TLS assets from agents/servers onto disk
if secure && (op == rpcpb.Operation_InitialStartEtcd || op == rpcpb.Operation_RestartEtcd) {
if secure && (op == rpcpb.Operation_INITIAL_START_ETCD || op == rpcpb.Operation_RESTART_ETCD) {
dirClient := filepath.Join(
clus.Tester.TesterDataDir,
clus.Tester.DataDir,
clus.Members[idx].Etcd.Name,
"fixtures",
"client",
@ -699,9 +496,9 @@ func (clus *Cluster) sendOperation(idx int, op rpcpb.Operation) error {
return nil
}
// DestroyEtcdAgents terminates all tester connections to agents and etcd servers.
func (clus *Cluster) DestroyEtcdAgents() {
err := clus.broadcastOperation(rpcpb.Operation_DestroyEtcdAgent)
// Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT terminates all tester connections to agents and etcd servers.
func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() {
err := clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT)
if err != nil {
clus.lg.Warn("destroying etcd/agents FAIL", zap.Error(err))
} else {
@ -717,7 +514,7 @@ func (clus *Cluster) DestroyEtcdAgents() {
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
err := clus.testerHTTPServer.Shutdown(ctx)
cancel()
clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.TesterAddr), zap.Error(err))
clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.Addr), zap.Error(err))
}
}
@ -886,6 +683,7 @@ func (clus *Cluster) defrag() error {
"defrag ALL PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
return nil
}

View File

@ -0,0 +1,346 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"errors"
"fmt"
"io/ioutil"
"net/url"
"path/filepath"
"strings"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
yaml "gopkg.in/yaml.v2"
)
// read loads a tester configuration from the YAML file at fpath and
// validates it, returning the resulting Cluster.
//
// Validation covers: minimum cluster size (3), required etcd flags,
// listener/advertise port conflicts when a proxy is configured,
// directory layout (log, data, and WAL paths must live under BaseDir),
// TLS option consistency, and tester-level failure/stress settings.
// When TLS cert/key/CA files are configured, their contents are loaded
// into the corresponding *Data fields so they can be shipped to agents
// over RPC.
func read(lg *zap.Logger, fpath string) (*Cluster, error) {
	bts, err := ioutil.ReadFile(fpath)
	if err != nil {
		return nil, err
	}
	lg.Info("opened configuration file", zap.String("path", fpath))

	clus := &Cluster{lg: lg}
	if err = yaml.Unmarshal(bts, clus); err != nil {
		return nil, err
	}

	// quorum-based failure cases assume at least 3 members
	if len(clus.Members) < 3 {
		return nil, fmt.Errorf("len(clus.Members) expects at least 3, got %d", len(clus.Members))
	}

	for i, mem := range clus.Members {
		if mem.BaseDir == "" {
			return nil, fmt.Errorf("BaseDir cannot be empty (got %q)", mem.BaseDir)
		}
		if mem.EtcdLogPath == "" {
			return nil, fmt.Errorf("EtcdLogPath cannot be empty (got %q)", mem.EtcdLogPath)
		}
		if mem.Etcd.Name == "" {
			return nil, fmt.Errorf("'--name' cannot be empty (got %+v)", mem)
		}
		if mem.Etcd.DataDir == "" {
			return nil, fmt.Errorf("'--data-dir' cannot be empty (got %+v)", mem)
		}
		if mem.Etcd.SnapshotCount == 0 {
			return nil, fmt.Errorf("'--snapshot-count' cannot be 0 (got %+v)", mem.Etcd.SnapshotCount)
		}
		if mem.Etcd.WALDir == "" {
			// default WAL directory mirrors etcd's own default layout
			clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
		}

		if mem.Etcd.HeartbeatIntervalMs == 0 {
			return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
		}
		if mem.Etcd.ElectionTimeoutMs == 0 {
			return nil, fmt.Errorf("'--election-timeout' cannot be 0 (got %+v)", mem.Etcd)
		}
		// injected network delay must be long enough to trigger leader election
		if int64(clus.Tester.DelayLatencyMs) <= mem.Etcd.ElectionTimeoutMs {
			return nil, fmt.Errorf("delay latency %d ms must be greater than election timeout %d ms", clus.Tester.DelayLatencyMs, mem.Etcd.ElectionTimeoutMs)
		}

		port := ""
		listenClientPorts := make([]string, len(clus.Members))
		for j, u := range mem.Etcd.ListenClientURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--listen-client-urls' has invalid URL %q", u)
			}
			listenClientPorts[j], err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--listen-client-urls' has no port %q", u)
			}
		}
		for j, u := range mem.Etcd.AdvertiseClientURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--advertise-client-urls' has invalid URL %q", u)
			}
			port, err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--advertise-client-urls' has no port %q", u)
			}
			// when proxied, the advertised (proxy) port must differ from the
			// real listener port, otherwise the proxy cannot bind
			if mem.EtcdClientProxy && listenClientPorts[j] == port {
				return nil, fmt.Errorf("clus.Members[%d] requires client port proxy, but advertise port %q conflicts with listener port %q", i, port, listenClientPorts[j])
			}
		}
		listenPeerPorts := make([]string, len(clus.Members))
		for j, u := range mem.Etcd.ListenPeerURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--listen-peer-urls' has invalid URL %q", u)
			}
			listenPeerPorts[j], err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--listen-peer-urls' has no port %q", u)
			}
		}
		for j, u := range mem.Etcd.AdvertisePeerURLs {
			if !isValidURL(u) {
				return nil, fmt.Errorf("'--initial-advertise-peer-urls' has invalid URL %q", u)
			}
			port, err = getPort(u)
			if err != nil {
				return nil, fmt.Errorf("'--initial-advertise-peer-urls' has no port %q", u)
			}
			if mem.EtcdPeerProxy && listenPeerPorts[j] == port {
				return nil, fmt.Errorf("clus.Members[%d] requires peer port proxy, but advertise port %q conflicts with listener port %q", i, port, listenPeerPorts[j])
			}
		}

		// everything the agent archives/removes must live under BaseDir
		if !strings.HasPrefix(mem.EtcdLogPath, mem.BaseDir) {
			return nil, fmt.Errorf("EtcdLogPath must be prefixed with BaseDir (got %q)", mem.EtcdLogPath)
		}
		if !strings.HasPrefix(mem.Etcd.DataDir, mem.BaseDir) {
			return nil, fmt.Errorf("Etcd.DataDir must be prefixed with BaseDir (got %q)", mem.Etcd.DataDir)
		}
		// TODO: support separate WALDir that can be handled via failure-archive
		if !strings.HasPrefix(mem.Etcd.WALDir, mem.BaseDir) {
			return nil, fmt.Errorf("Etcd.WALDir must be prefixed with BaseDir (got %q)", mem.Etcd.WALDir)
		}

		// TODO: only support generated certs with TLS generator
		// deprecate auto TLS
		if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerCertFile != "" {
			return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
		}
		if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerKeyFile != "" {
			return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
		}
		if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerTrustedCAFile != "" {
			return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
		}
		if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientCertFile != "" {
			return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
		}
		if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientKeyFile != "" {
			return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
		}
		if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientTrustedCAFile != "" {
			return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
		}

		if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile == "" {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
		}
		if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerKeyFile == "" {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
		}
		// only support self-signed certs
		if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerTrustedCAFile == "" {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
		}
		if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile != "" {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
		}
		if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerKeyFile != "" {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
		}
		if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerTrustedCAFile != "" {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
		}
		if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerAutoTLS {
			return nil, fmt.Errorf("Etcd.PeerClientCertAuth and Etcd.PeerAutoTLS cannot be both 'true'")
		}
		if (mem.Etcd.PeerCertFile == "") != (mem.Etcd.PeerKeyFile == "") {
			return nil, fmt.Errorf("Both Etcd.PeerCertFile %q and Etcd.PeerKeyFile %q must be either empty or non-empty", mem.Etcd.PeerCertFile, mem.Etcd.PeerKeyFile)
		}

		if mem.Etcd.ClientCertAuth && mem.Etcd.ClientAutoTLS {
			return nil, fmt.Errorf("Etcd.ClientCertAuth and Etcd.ClientAutoTLS cannot be both 'true'")
		}
		if mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile == "" {
			return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
		}
		if mem.Etcd.ClientCertAuth && mem.Etcd.ClientKeyFile == "" {
			return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
		}
		if mem.Etcd.ClientCertAuth && mem.Etcd.ClientTrustedCAFile == "" {
			return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
		}
		if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile != "" {
			return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
		}
		if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientKeyFile != "" {
			return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
		}
		if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientTrustedCAFile != "" {
			return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
		}
		if (mem.Etcd.ClientCertFile == "") != (mem.Etcd.ClientKeyFile == "") {
			return nil, fmt.Errorf("Both Etcd.ClientCertFile %q and Etcd.ClientKeyFile %q must be either empty or non-empty", mem.Etcd.ClientCertFile, mem.Etcd.ClientKeyFile)
		}

		peerTLS := mem.Etcd.PeerAutoTLS ||
			(mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile != "" && mem.Etcd.PeerKeyFile != "" && mem.Etcd.PeerTrustedCAFile != "")
		if peerTLS {
			for _, cu := range mem.Etcd.ListenPeerURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
				}
			}
			for _, cu := range mem.Etcd.AdvertisePeerURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
				}
			}

			// load cert/key/CA contents so they can be shipped to agents;
			// each file's bytes go into its own *Data field
			clus.Members[i].PeerCertPath = mem.Etcd.PeerCertFile
			if mem.Etcd.PeerCertFile != "" {
				var data []byte
				data, err = ioutil.ReadFile(mem.Etcd.PeerCertFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerCertFile, err)
				}
				clus.Members[i].PeerCertData = string(data)
			}
			clus.Members[i].PeerKeyPath = mem.Etcd.PeerKeyFile
			if mem.Etcd.PeerKeyFile != "" {
				var data []byte
				data, err = ioutil.ReadFile(mem.Etcd.PeerKeyFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerKeyFile, err)
				}
				clus.Members[i].PeerKeyData = string(data)
			}
			clus.Members[i].PeerTrustedCAPath = mem.Etcd.PeerTrustedCAFile
			if mem.Etcd.PeerTrustedCAFile != "" {
				var data []byte
				data, err = ioutil.ReadFile(mem.Etcd.PeerTrustedCAFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerTrustedCAFile, err)
				}
				clus.Members[i].PeerTrustedCAData = string(data)
			}
		}

		clientTLS := mem.Etcd.ClientAutoTLS ||
			(mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile != "" && mem.Etcd.ClientKeyFile != "" && mem.Etcd.ClientTrustedCAFile != "")
		if clientTLS {
			for _, cu := range mem.Etcd.ListenClientURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
				}
			}
			for _, cu := range mem.Etcd.AdvertiseClientURLs {
				var u *url.URL
				u, err = url.Parse(cu)
				if err != nil {
					return nil, err
				}
				if u.Scheme != "https" { // TODO: support unix
					return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
				}
			}

			clus.Members[i].ClientCertPath = mem.Etcd.ClientCertFile
			if mem.Etcd.ClientCertFile != "" {
				var data []byte
				data, err = ioutil.ReadFile(mem.Etcd.ClientCertFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientCertFile, err)
				}
				clus.Members[i].ClientCertData = string(data)
			}
			clus.Members[i].ClientKeyPath = mem.Etcd.ClientKeyFile
			if mem.Etcd.ClientKeyFile != "" {
				var data []byte
				data, err = ioutil.ReadFile(mem.Etcd.ClientKeyFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientKeyFile, err)
				}
				clus.Members[i].ClientKeyData = string(data)
			}
			clus.Members[i].ClientTrustedCAPath = mem.Etcd.ClientTrustedCAFile
			if mem.Etcd.ClientTrustedCAFile != "" {
				var data []byte
				data, err = ioutil.ReadFile(mem.Etcd.ClientTrustedCAFile)
				if err != nil {
					return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientTrustedCAFile, err)
				}
				clus.Members[i].ClientTrustedCAData = string(data)
			}
		}
	}

	if len(clus.Tester.FailureCases) == 0 {
		return nil, errors.New("FailureCases not found")
	}
	if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
		return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
	}
	if clus.Tester.UpdatedDelayLatencyMs == 0 {
		// default the updated delay to the initial delay
		clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
	}

	for _, v := range clus.Tester.FailureCases {
		if _, ok := rpcpb.FailureCase_value[v]; !ok {
			return nil, fmt.Errorf("%q is not defined in 'rpcpb.FailureCase_value'", v)
		}
	}
	for _, v := range clus.Tester.StressTypes {
		if _, ok := rpcpb.StressType_value[v]; !ok {
			return nil, fmt.Errorf("StressType is unknown; got %q", v)
		}
	}
	if clus.Tester.StressKeySuffixRangeTxn > 100 {
		return nil, fmt.Errorf("StressKeySuffixRangeTxn maximum value is 100, got %v", clus.Tester.StressKeySuffixRangeTxn)
	}
	if clus.Tester.StressKeyTxnOps > 64 {
		return nil, fmt.Errorf("StressKeyTxnOps maximum value is 64, got %v", clus.Tester.StressKeyTxnOps)
	}

	// all error paths above return early, so err is nil here
	return clus, nil
}

View File

@ -19,8 +19,8 @@ import (
"os"
"time"
"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"go.uber.org/zap"
)
@ -29,12 +29,14 @@ import (
// Previous tests showed etcd can compact about 60,000 entries per second.
const compactQPS = 50000
// StartTester starts tester.
func (clus *Cluster) StartTester() {
if err := fileutil.TouchDirAll(clus.Tester.TesterDataDir); err != nil {
// Run starts tester.
func (clus *Cluster) Run() {
defer printReport()
if err := fileutil.TouchDirAll(clus.Tester.DataDir); err != nil {
clus.lg.Panic(
"failed to create test data directory",
zap.String("dir", clus.Tester.TesterDataDir),
zap.String("dir", clus.Tester.DataDir),
zap.Error(err),
)
}
@ -49,6 +51,7 @@ func (clus *Cluster) StartTester() {
"round FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
if clus.cleanup() != nil {
@ -72,6 +75,7 @@ func (clus *Cluster) StartTester() {
"compact START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Duration("timeout", timeout),
)
if err := clus.compact(revToCompact, timeout); err != nil {
@ -79,6 +83,7 @@ func (clus *Cluster) StartTester() {
"compact FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
if err = clus.cleanup(); err != nil {
@ -86,6 +91,7 @@ func (clus *Cluster) StartTester() {
"cleanup FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
return
@ -105,6 +111,7 @@ func (clus *Cluster) StartTester() {
"functional-tester PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
}
@ -117,12 +124,14 @@ func (clus *Cluster) doRound() error {
clus.lg.Info(
"round START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Strings("failures", clus.failureStrings()),
zap.Int("total-failures", len(clus.failures)),
)
for i, fa := range clus.failures {
clus.cs = i
caseTotal[fa.Desc()]++
caseTotalCounter.WithLabelValues(fa.Desc()).Inc()
caseNow := time.Now()
@ -130,8 +139,8 @@ func (clus *Cluster) doRound() error {
"case START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
zap.Int("total-failures", len(clus.failures)),
)
clus.lg.Info("wait health before injecting failures")
@ -143,9 +152,10 @@ func (clus *Cluster) doRound() error {
fcase := fa.FailureCase()
if fcase != rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
clus.lg.Info(
"stresser START",
"stress START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := clus.stresser.Stress(); err != nil {
@ -158,6 +168,7 @@ func (clus *Cluster) doRound() error {
"inject START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := fa.Inject(clus); err != nil {
@ -171,6 +182,7 @@ func (clus *Cluster) doRound() error {
"recover START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := fa.Recover(clus); err != nil {
@ -178,7 +190,13 @@ func (clus *Cluster) doRound() error {
}
if stressStarted {
clus.lg.Info("stresser PAUSE")
clus.lg.Info(
"stress PAUSE",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
ems := clus.stresser.Pause()
if fcase == rpcpb.FailureCase_NO_FAIL_WITH_STRESS && len(ems) > 0 {
ess := make([]string, 0, len(ems))
@ -201,12 +219,24 @@ func (clus *Cluster) doRound() error {
}
}
clus.lg.Info("health check START")
clus.lg.Info(
"health check START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := clus.WaitHealth(); err != nil {
return fmt.Errorf("wait full health error: %v", err)
}
clus.lg.Info("consistency check START")
clus.lg.Info(
"consistency check START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := clus.checkConsistency(); err != nil {
return fmt.Errorf("consistency check error (%v)", err)
}
@ -215,8 +245,8 @@ func (clus *Cluster) doRound() error {
"case PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
zap.Int("total-failures", len(clus.failures)),
zap.Duration("took", time.Since(caseNow)),
)
}
@ -225,7 +255,7 @@ func (clus *Cluster) doRound() error {
"round ALL PASS",
zap.Int("round", clus.rd),
zap.Strings("failures", clus.failureStrings()),
zap.Int("total-failures", len(clus.failures)),
zap.Int("case-total", len(clus.failures)),
zap.Duration("took", time.Since(roundNow)),
)
return nil
@ -280,21 +310,21 @@ func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
}
func (clus *Cluster) failed() {
if !clus.Tester.ExitOnFailure {
return
}
clus.lg.Info(
"functional-tester FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
clus.DestroyEtcdAgents()
clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()
os.Exit(2)
}
func (clus *Cluster) cleanup() error {
defer clus.failed()
if clus.Tester.ExitOnFailure {
defer clus.failed()
}
roundFailedTotalCounter.Inc()
desc := "compact/defrag"
@ -307,23 +337,26 @@ func (clus *Cluster) cleanup() error {
"closing stressers before archiving failure data",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
clus.stresser.Close()
if err := clus.FailArchive(); err != nil {
if err := clus.send_SIGQUIT_ETCD_AND_ARCHIVE_DATA(); err != nil {
clus.lg.Warn(
"cleanup FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
return err
}
if err := clus.Restart(); err != nil {
if err := clus.send_RESTART_ETCD(); err != nil {
clus.lg.Warn(
"restart FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
return err

View File

@ -0,0 +1,64 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"math/rand"
"time"
"go.uber.org/zap"
)
// shuffleFailures reorders clus.failures into a pseudo-random permutation.
// It maps index i to (c*i + offset) % n where c is coprime to n, which
// guarantees every case appears exactly once in the shuffled sequence.
func (clus *Cluster) shuffleFailures() {
	rand.Seed(time.Now().UnixNano())
	shift := rand.Intn(1000)

	total := len(clus.failures)
	mult := coprime(total)

	shuffled := make([]Failure, total)
	for pos := range shuffled {
		shuffled[pos] = clus.failures[(mult*pos+shift)%total]
	}
	clus.failures = shuffled

	clus.lg.Info("shuffled test failure cases", zap.Int("total", total))
}
/*
Integers x and y with GCD(x, y) == 1 are coprime to each other.

With c a coprime of n:
	x1 = (c*idx1 + offset) % n
	x2 = (c*idx2 + offset) % n
	(x2 - x1) % n = c*(idx2 - idx1) % n

Since c shares no factor with n, the right-hand side is non-zero for any
0 < |idx2 - idx1| < n, so distinct indexes map to distinct positions;
the mapping is a permutation of [0, n).
*/
// coprime returns an integer in [n/2, n) whose GCD with n is 1, or 1
// when no such value exists in that range (e.g. n <= 2). Note that for
// n == 1 the degenerate value 0 is returned, matching gcd(0, 1) == 1.
func coprime(n int) int {
	for c := n / 2; c < n; c++ {
		if gcd(c, n) == 1 {
			return c
		}
	}
	return 1
}

// gcd computes the greatest common divisor of x and y using the
// iterative form of the Euclidean algorithm.
func gcd(x, y int) int {
	for y != 0 {
		x, y = y, x%y
	}
	return x
}

View File

@ -19,12 +19,12 @@ import (
"sort"
"testing"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
)
func Test_newCluster(t *testing.T) {
func Test_read(t *testing.T) {
exp := &Cluster{
Members: []*rpcpb.Member{
{
@ -143,9 +143,9 @@ func Test_newCluster(t *testing.T) {
},
},
Tester: &rpcpb.Tester{
TesterDataDir: "/tmp/etcd-tester-data",
TesterNetwork: "tcp",
TesterAddr: "127.0.0.1:9028",
DataDir: "/tmp/etcd-tester-data",
Network: "tcp",
Addr: "127.0.0.1:9028",
DelayLatencyMs: 5000,
DelayLatencyMsRv: 500,
UpdatedDelayLatencyMs: 5000,
@ -153,13 +153,15 @@ func Test_newCluster(t *testing.T) {
ExitOnFailure: true,
ConsistencyCheck: true,
EnablePprof: true,
FailureDelayMs: 7000,
FailureShuffle: true,
FailureCases: []string{
"KILL_ONE_FOLLOWER",
"KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"KILL_LEADER",
"KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"KILL_QUORUM",
"KILL_ALL",
"SIGTERM_ONE_FOLLOWER",
"SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"SIGTERM_LEADER",
"SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT",
"SIGTERM_QUORUM",
"SIGTERM_ALL",
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER",
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
"BLACKHOLE_PEER_PORT_TX_RX_LEADER",
@ -181,10 +183,8 @@ func Test_newCluster(t *testing.T) {
"NO_FAIL_WITH_STRESS",
"NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS",
},
FailureDelayMs: 7000,
FailureShuffle: true,
FailpointCommands: []string{`panic("etcd-tester")`},
RunnerExecPath: "/etcd-runner",
RunnerExecPath: "./bin/etcd-runner",
ExternalExecPath: "",
StressTypes: []string{"KV", "LEASE"},
StressKeySize: 100,
@ -203,7 +203,7 @@ func Test_newCluster(t *testing.T) {
}
defer logger.Sync()
cfg, err := newCluster(logger, "./local-test.yaml")
cfg, err := read(logger, "../../functional.yaml")
if err != nil {
t.Fatal(err)
}
@ -235,6 +235,7 @@ func Test_newCluster(t *testing.T) {
sort.Strings(fs1)
sort.Strings(fs2)
sort.Strings(fs3)
if !reflect.DeepEqual(fs1, fs2) {
t.Fatalf("expected %q, got %q", fs1, fs2)
}

View File

@ -19,7 +19,7 @@ import (
"math/rand"
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
)
@ -242,17 +242,12 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
if err := f.Failure.Inject(clus); err != nil {
return err
}
if len(clus.Members) < 3 {
return nil
}
snapshotCount := clus.Members[0].Etcd.SnapshotCount
now := time.Now()
clus.lg.Info(
"trigger snapshot START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.String("desc", f.Desc()),
zap.Int64("etcd-snapshot-count", snapshotCount),
)
@ -283,8 +278,6 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
if diff > snapshotCount {
clus.lg.Info(
"trigger snapshot PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("retries", i),
zap.String("desc", f.Desc()),
zap.Int64("committed-entries", diff),

View File

@ -31,9 +31,9 @@ func (f *failureDelay) Inject(clus *Cluster) error {
}
if f.delayDuration > 0 {
clus.lg.Info(
"sleeping in failureDelay",
"wait after inject",
zap.Duration("delay", f.delayDuration),
zap.String("case", f.Failure.Desc()),
zap.String("desc", f.Failure.Desc()),
)
time.Sleep(f.delayDuration)
}

View File

@ -18,7 +18,7 @@ import (
"fmt"
"os/exec"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
)
type failureExternal struct {
@ -46,7 +46,7 @@ func (f *failureExternal) FailureCase() rpcpb.FailureCase {
return f.failureCase
}
func newFailureExternal(scriptPath string) Failure {
func new_FailureCase_EXTERNAL(scriptPath string) Failure {
return &failureExternal{
desc: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
failureCase: rpcpb.FailureCase_EXTERNAL,

View File

@ -21,7 +21,7 @@ import (
"strings"
"sync"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
)
type failpointStats struct {
@ -145,7 +145,7 @@ func makeRecoverFailpoint(fp string) recoverMemberFunc {
fpStats.mu.Lock()
fpStats.crashes[fp]++
fpStats.mu.Unlock()
return recoverKill(clus, idx)
return recover_SIGTERM_ETCD(clus, idx)
}
}

View File

@ -14,21 +14,21 @@
package tester
import "github.com/coreos/etcd/tools/functional-tester/rpcpb"
import "github.com/coreos/etcd/functional/rpcpb"
func injectBlackholePeerPortTxRx(clus *Cluster, idx int) error {
return clus.sendOperation(idx, rpcpb.Operation_BlackholePeerPortTxRx)
func inject_BLACKHOLE_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
return clus.sendOp(idx, rpcpb.Operation_BLACKHOLE_PEER_PORT_TX_RX)
}
func recoverBlackholePeerPortTxRx(clus *Cluster, idx int) error {
return clus.sendOperation(idx, rpcpb.Operation_UnblackholePeerPortTxRx)
func recover_BLACKHOLE_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
return clus.sendOp(idx, rpcpb.Operation_UNBLACKHOLE_PEER_PORT_TX_RX)
}
func newFailureBlackholePeerPortTxRxOneFollower(clus *Cluster) Failure {
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus *Cluster) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
f := &failureFollower{ff, -1, -1}
return &failureDelay{
@ -37,11 +37,11 @@ func newFailureBlackholePeerPortTxRxOneFollower(clus *Cluster) Failure {
}
}
func newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot() Failure {
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT() Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
f := &failureFollower{ff, -1, -1}
return &failureUntilSnapshot{
@ -50,11 +50,11 @@ func newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot() Failure {
}
}
func newFailureBlackholePeerPortTxRxLeader(clus *Cluster) Failure {
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus *Cluster) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
f := &failureLeader{ff, -1, -1}
return &failureDelay{
@ -63,11 +63,11 @@ func newFailureBlackholePeerPortTxRxLeader(clus *Cluster) Failure {
}
}
func newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot() Failure {
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT() Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
f := &failureLeader{ff, -1, -1}
return &failureUntilSnapshot{
@ -76,11 +76,11 @@ func newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot() Failure {
}
}
func newFailureBlackholePeerPortTxRxQuorum(clus *Cluster) Failure {
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus *Cluster) Failure {
f := &failureQuorum{
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
return &failureDelay{
Failure: f,
@ -88,11 +88,11 @@ func newFailureBlackholePeerPortTxRxQuorum(clus *Cluster) Failure {
}
}
func newFailureBlackholePeerPortTxRxAll(clus *Cluster) Failure {
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus *Cluster) Failure {
f := &failureAll{
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL,
injectMember: injectBlackholePeerPortTxRx,
recoverMember: recoverBlackholePeerPortTxRx,
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
}
return &failureDelay{
Failure: f,

View File

@ -17,7 +17,7 @@ package tester
import (
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
)
@ -30,35 +30,33 @@ const (
waitRecover = 5 * time.Second
)
func injectDelayPeerPortTxRx(clus *Cluster, idx int) error {
func inject_DELAY_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
clus.lg.Info(
"injecting delay latency",
zap.Duration("latency", time.Duration(clus.Tester.UpdatedDelayLatencyMs)*time.Millisecond),
zap.Duration("latency-rv", time.Duration(clus.Tester.DelayLatencyMsRv)*time.Millisecond),
zap.String("endpoint", clus.Members[idx].EtcdClientEndpoint),
)
return clus.sendOperation(idx, rpcpb.Operation_DelayPeerPortTxRx)
return clus.sendOp(idx, rpcpb.Operation_DELAY_PEER_PORT_TX_RX)
}
func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error {
err := clus.sendOperation(idx, rpcpb.Operation_UndelayPeerPortTxRx)
func recover_DELAY_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
err := clus.sendOp(idx, rpcpb.Operation_UNDELAY_PEER_PORT_TX_RX)
time.Sleep(waitRecover)
return err
}
func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure {
func new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus *Cluster, random bool) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
}
f := &failureFollower{ff, -1, -1}
return &failureDelay{
Failure: f,
@ -66,19 +64,17 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure
}
}
func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, random bool) Failure {
func new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster, random bool) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
}
f := &failureFollower{ff, -1, -1}
return &failureUntilSnapshot{
failureCase: ff.failureCase,
@ -86,19 +82,17 @@ func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, r
}
}
func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure {
func new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER(clus *Cluster, random bool) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
}
f := &failureLeader{ff, -1, -1}
return &failureDelay{
Failure: f,
@ -106,19 +100,17 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure {
}
}
func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random bool) Failure {
func new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster, random bool) Failure {
ff := failureByFunc{
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
}
f := &failureLeader{ff, -1, -1}
return &failureUntilSnapshot{
failureCase: ff.failureCase,
@ -126,38 +118,34 @@ func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random
}
}
func newFailureDelayPeerPortTxRxQuorum(clus *Cluster, random bool) Failure {
func new_FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM(clus *Cluster, random bool) Failure {
f := &failureQuorum{
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
}
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),
}
}
func newFailureDelayPeerPortTxRxAll(clus *Cluster, random bool) Failure {
func new_FailureCase_DELAY_PEER_PORT_TX_RX_ALL(clus *Cluster, random bool) Failure {
f := &failureAll{
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL,
injectMember: injectDelayPeerPortTxRx,
recoverMember: recoverDelayPeerPortTxRx,
injectMember: inject_DELAY_PEER_PORT_TX_RX,
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
}
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
if random {
clus.UpdateDelayLatencyMs()
f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL
}
return &failureDelay{
Failure: f,
delayDuration: clus.GetFailureDelayDuration(),

View File

@ -17,7 +17,7 @@ package tester
import (
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
)
@ -43,7 +43,7 @@ func (f *failureNoFailWithStress) FailureCase() rpcpb.FailureCase {
return f.failureCase
}
func newFailureNoFailWithStress(clus *Cluster) Failure {
func new_FailureCase_NO_FAIL_WITH_STRESS(clus *Cluster) Failure {
f := &failureNoFailWithStress{
failureCase: rpcpb.FailureCase_NO_FAIL_WITH_STRESS,
}
@ -88,7 +88,7 @@ func (f *failureNoFailWithNoStressForLiveness) FailureCase() rpcpb.FailureCase {
return f.failureCase
}
func newFailureNoFailWithNoStressForLiveness(clus *Cluster) Failure {
func new_FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus *Cluster) Failure {
f := &failureNoFailWithNoStressForLiveness{
failureCase: rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS,
}

View File

@ -0,0 +1,89 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "github.com/coreos/etcd/functional/rpcpb"
// inject_SIGTERM_ETCD asks the agent of member "idx" to run
// Operation_SIGTERM_ETCD (stop the member's etcd process with SIGTERM).
func inject_SIGTERM_ETCD(clus *Cluster, idx int) error {
	return clus.sendOp(idx, rpcpb.Operation_SIGTERM_ETCD)
}
// recover_SIGTERM_ETCD asks the agent of member "idx" to run
// Operation_RESTART_ETCD, bringing the previously stopped etcd back up.
func recover_SIGTERM_ETCD(clus *Cluster, idx int) error {
	return clus.sendOp(idx, rpcpb.Operation_RESTART_ETCD)
}
// new_FailureCase_SIGTERM_ONE_FOLLOWER builds a failure case that
// SIGTERMs a single follower and restarts it on recovery, wrapped in a
// failureDelay that sleeps the cluster's configured delay after inject.
func new_FailureCase_SIGTERM_ONE_FOLLOWER(clus *Cluster) Failure {
	return &failureDelay{
		Failure: &failureFollower{
			failureByFunc{
				failureCase:   rpcpb.FailureCase_SIGTERM_ONE_FOLLOWER,
				injectMember:  inject_SIGTERM_ETCD,
				recoverMember: recover_SIGTERM_ETCD,
			},
			-1, -1,
		},
		delayDuration: clus.GetFailureDelayDuration(),
	}
}
// new_FailureCase_SIGTERM_LEADER builds a failure case that SIGTERMs
// the current leader and restarts it on recovery, wrapped in a
// failureDelay that sleeps the cluster's configured delay after inject.
func new_FailureCase_SIGTERM_LEADER(clus *Cluster) Failure {
	return &failureDelay{
		Failure: &failureLeader{
			failureByFunc{
				failureCase:   rpcpb.FailureCase_SIGTERM_LEADER,
				injectMember:  inject_SIGTERM_ETCD,
				recoverMember: recover_SIGTERM_ETCD,
			},
			-1, -1,
		},
		delayDuration: clus.GetFailureDelayDuration(),
	}
}
// new_FailureCase_SIGTERM_QUORUM builds a failure case that SIGTERMs a
// quorum of members and restarts them on recovery, wrapped in a
// failureDelay that sleeps the cluster's configured delay after inject.
func new_FailureCase_SIGTERM_QUORUM(clus *Cluster) Failure {
	return &failureDelay{
		Failure: &failureQuorum{
			failureCase:   rpcpb.FailureCase_SIGTERM_QUORUM,
			injectMember:  inject_SIGTERM_ETCD,
			recoverMember: recover_SIGTERM_ETCD,
		},
		delayDuration: clus.GetFailureDelayDuration(),
	}
}
// new_FailureCase_SIGTERM_ALL builds a failure case that SIGTERMs every
// member and restarts them on recovery, wrapped in a failureDelay that
// sleeps the cluster's configured delay after inject.
func new_FailureCase_SIGTERM_ALL(clus *Cluster) Failure {
	return &failureDelay{
		Failure: &failureAll{
			failureCase:   rpcpb.FailureCase_SIGTERM_ALL,
			injectMember:  inject_SIGTERM_ETCD,
			recoverMember: recover_SIGTERM_ETCD,
		},
		delayDuration: clus.GetFailureDelayDuration(),
	}
}
// new_FailureCase_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT wraps the
// one-follower SIGTERM case so it keeps stressing until the follower
// falls far enough behind to trigger a snapshot.
func new_FailureCase_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Failure {
	inner := new_FailureCase_SIGTERM_ONE_FOLLOWER(clus)
	return &failureUntilSnapshot{
		failureCase: rpcpb.FailureCase_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
		Failure:     inner,
	}
}
// new_FailureCase_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT wraps the
// leader SIGTERM case so it keeps stressing until the restarted leader
// falls far enough behind to trigger a snapshot.
func new_FailureCase_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Failure {
	inner := new_FailureCase_SIGTERM_LEADER(clus)
	return &failureUntilSnapshot{
		failureCase: rpcpb.FailureCase_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT,
		Failure:     inner,
	}
}

View File

@ -14,9 +14,16 @@
package tester
import "github.com/prometheus/client_golang/prometheus"
import (
"fmt"
"sort"
"github.com/prometheus/client_golang/prometheus"
)
var (
caseTotal = make(map[string]int)
caseTotalCounter = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "etcd",
@ -60,3 +67,17 @@ func init() {
prometheus.MustRegister(roundTotalCounter)
prometheus.MustRegister(roundFailedTotalCounter)
}
// printReport writes the per-case execution counts accumulated in the
// package-level "caseTotal" map to stdout: one "NAME: COUNT" line per
// case, sorted by case name, surrounded by blank separator lines.
func printReport() {
	rows := make([]string, 0, len(caseTotal))
	for k, v := range caseTotal {
		rows = append(rows, fmt.Sprintf("%s: %d", k, v))
	}
	// map iteration order is random; sort for deterministic output
	sort.Strings(rows)

	// Use fmt.Println for the separators too: the builtin println writes
	// to stderr, which previously split the report across two streams.
	fmt.Println()
	for _, row := range rows {
		fmt.Println(row)
	}
	fmt.Println()
}

View File

@ -18,7 +18,7 @@ import (
"fmt"
"time"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
)
@ -52,6 +52,7 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
// TODO: Too intensive stressing clients can panic etcd member with
// 'out of memory' error. Put rate limits in server side.
stressers[i] = &keyStresser{
stype: rpcpb.StressType_KV,
lg: clus.lg,
m: m,
keySize: int(clus.Tester.StressKeySize),
@ -65,6 +66,7 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
case "LEASE":
stressers[i] = &leaseStresser{
stype: rpcpb.StressType_LEASE,
lg: clus.lg,
m: m,
numLeases: 10, // TODO: configurable
@ -84,6 +86,8 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
"--req-rate", fmt.Sprintf("%v", reqRate),
}
stressers[i] = newRunnerStresser(
rpcpb.StressType_ELECTION_RUNNER,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
@ -102,7 +106,14 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
"--rounds=0", // runs forever
"--req-rate", fmt.Sprintf("%v", reqRate),
}
stressers[i] = newRunnerStresser(clus.Tester.RunnerExecPath, args, clus.rateLimiter, reqRate)
stressers[i] = newRunnerStresser(
rpcpb.StressType_WATCH_RUNNER,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
reqRate,
)
case "LOCK_RACER_RUNNER":
reqRate := 100
@ -114,7 +125,14 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
"--rounds=0", // runs forever
"--req-rate", fmt.Sprintf("%v", reqRate),
}
stressers[i] = newRunnerStresser(clus.Tester.RunnerExecPath, args, clus.rateLimiter, reqRate)
stressers[i] = newRunnerStresser(
rpcpb.StressType_LOCK_RACER_RUNNER,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
reqRate,
)
case "LEASE_RUNNER":
args := []string{
@ -122,7 +140,14 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
"--ttl=30",
"--endpoints", m.EtcdClientEndpoint,
}
stressers[i] = newRunnerStresser(clus.Tester.RunnerExecPath, args, clus.rateLimiter, 0)
stressers[i] = newRunnerStresser(
rpcpb.StressType_LEASE_RUNNER,
clus.lg,
clus.Tester.RunnerExecPath,
args,
clus.rateLimiter,
0,
)
}
}
return &compositeStresser{stressers}

View File

@ -18,6 +18,7 @@ import (
"context"
"fmt"
"math/rand"
"reflect"
"sync"
"sync/atomic"
"time"
@ -25,7 +26,7 @@ import (
"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/etcdserver"
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
"golang.org/x/time/rate"
@ -34,7 +35,8 @@ import (
)
type keyStresser struct {
lg *zap.Logger
stype rpcpb.StressType
lg *zap.Logger
m *rpcpb.Member
@ -102,7 +104,8 @@ func (s *keyStresser) Stress() error {
}
s.lg.Info(
"key stresser START",
"stress START",
zap.String("stress-type", s.stype.String()),
zap.String("endpoint", s.m.EtcdClientEndpoint),
)
return nil
@ -156,8 +159,10 @@ func (s *keyStresser) run() {
return
default:
s.lg.Warn(
"key stresser exited with error",
"stress run exiting",
zap.String("stress-type", s.stype.String()),
zap.String("endpoint", s.m.EtcdClientEndpoint),
zap.String("error-type", reflect.TypeOf(err).String()),
zap.Error(err),
)
return
@ -188,7 +193,8 @@ func (s *keyStresser) Close() map[string]int {
s.emu.Unlock()
s.lg.Info(
"key stresser STOP",
"stress STOP",
zap.String("stress-type", s.stype.String()),
zap.String("endpoint", s.m.EtcdClientEndpoint),
)
return ess

View File

@ -24,7 +24,7 @@ import (
"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
"golang.org/x/time/rate"
@ -38,7 +38,8 @@ const (
)
type leaseStresser struct {
lg *zap.Logger
stype rpcpb.StressType
lg *zap.Logger
m *rpcpb.Member
cli *clientv3.Client
@ -121,7 +122,8 @@ func (ls *leaseStresser) setupOnce() error {
func (ls *leaseStresser) Stress() error {
ls.lg.Info(
"lease stresser START",
"stress START",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
@ -159,22 +161,26 @@ func (ls *leaseStresser) run() {
}
ls.lg.Debug(
"lease stresser is creating leases",
"stress creating leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
ls.createLeases()
ls.lg.Debug(
"lease stresser created leases",
"stress created leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
ls.lg.Debug(
"lease stresser is dropped leases",
"stress dropped leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
ls.randomlyDropLeases()
ls.lg.Debug(
"lease stresser dropped leases",
"stress dropped leases",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
}
@ -243,6 +249,7 @@ func (ls *leaseStresser) createLeaseWithKeys(ttl int64) (int64, error) {
if err != nil {
ls.lg.Debug(
"createLease failed",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.Error(err),
)
@ -251,6 +258,7 @@ func (ls *leaseStresser) createLeaseWithKeys(ttl int64) (int64, error) {
ls.lg.Debug(
"createLease created lease",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
@ -284,6 +292,7 @@ func (ls *leaseStresser) randomlyDropLeases() {
}
ls.lg.Debug(
"randomlyDropLease dropped a lease",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
@ -313,6 +322,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
case <-ls.ctx.Done():
ls.lg.Debug(
"keepLeaseAlive context canceled",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(ls.ctx.Err()),
@ -327,6 +337,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
ls.aliveLeases.remove(leaseID)
ls.lg.Debug(
"keepLeaseAlive lease has not been renewed, dropped it",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
@ -337,6 +348,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
if err != nil {
ls.lg.Debug(
"keepLeaseAlive lease creates stream error",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(err),
@ -350,6 +362,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
if err != nil {
ls.lg.Debug(
"keepLeaseAlive failed to receive lease keepalive response",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(err),
@ -359,6 +372,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
ls.lg.Debug(
"keepLeaseAlive waiting on lease stream",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
@ -367,6 +381,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
if respRC == nil {
ls.lg.Debug(
"keepLeaseAlive received nil lease keepalive response",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
@ -378,6 +393,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
if respRC.TTL <= 0 {
ls.lg.Debug(
"keepLeaseAlive stream received lease keepalive response TTL <= 0",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Int64("ttl", respRC.TTL),
@ -388,6 +404,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
// renew lease timestamp only if lease is present
ls.lg.Debug(
"keepLeaseAlive renewed a lease",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
)
@ -440,6 +457,7 @@ func (ls *leaseStresser) randomlyDropLease(leaseID int64) (bool, error) {
ls.lg.Debug(
"randomlyDropLease error",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
zap.Error(ls.ctx.Err()),
@ -457,7 +475,8 @@ func (ls *leaseStresser) Close() map[string]int {
ls.aliveWg.Wait()
ls.cli.Close()
ls.lg.Info(
"lease stresser STOP",
"stress STOP",
zap.String("stress-type", ls.stype.String()),
zap.String("endpoint", ls.m.EtcdClientEndpoint),
)
return nil

View File

@ -20,10 +20,16 @@ import (
"os/exec"
"syscall"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
"golang.org/x/time/rate"
)
type runnerStresser struct {
stype rpcpb.StressType
lg *zap.Logger
cmd *exec.Cmd
cmdStr string
args []string
@ -34,9 +40,17 @@ type runnerStresser struct {
donec chan struct{}
}
func newRunnerStresser(cmdStr string, args []string, rl *rate.Limiter, reqRate int) *runnerStresser {
func newRunnerStresser(
stype rpcpb.StressType,
lg *zap.Logger,
cmdStr string,
args []string,
rl *rate.Limiter,
reqRate int,
) *runnerStresser {
rl.SetLimit(rl.Limit() - rate.Limit(reqRate))
return &runnerStresser{
stype: stype,
cmdStr: cmdStr,
args: args,
rl: rl,
@ -71,6 +85,10 @@ func (rs *runnerStresser) setupOnce() (err error) {
}
func (rs *runnerStresser) Stress() (err error) {
rs.lg.Info(
"stress START",
zap.String("stress-type", rs.stype.String()),
)
if err = rs.setupOnce(); err != nil {
return err
}
@ -78,6 +96,10 @@ func (rs *runnerStresser) Stress() (err error) {
}
func (rs *runnerStresser) Pause() map[string]int {
rs.lg.Info(
"stress STOP",
zap.String("stress-type", rs.stype.String()),
)
syscall.Kill(rs.cmd.Process.Pid, syscall.SIGSTOP)
return nil
}

16
pkg/fileutil/doc.go Normal file
View File

@ -0,0 +1,16 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package fileutil implements utility functions related to files and paths.
package fileutil

View File

@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Package fileutil implements utility functions related to files and paths.
package fileutil
import (
@ -93,6 +92,7 @@ func CreateDirAll(dir string) error {
return err
}
// Exist returns true if a file or directory exists.
func Exist(name string) bool {
_, err := os.Stat(name)
return err == nil

View File

@ -15,8 +15,10 @@
package fileutil
import (
"fmt"
"io"
"io/ioutil"
"math/rand"
"os"
"os/user"
"path/filepath"
@ -24,6 +26,7 @@ import (
"runtime"
"strings"
"testing"
"time"
)
func TestIsDirWriteable(t *testing.T) {
@ -104,6 +107,16 @@ func TestCreateDirAll(t *testing.T) {
}
func TestExist(t *testing.T) {
fdir := filepath.Join(os.TempDir(), fmt.Sprint(time.Now().UnixNano()+rand.Int63n(1000)))
os.RemoveAll(fdir)
if err := os.Mkdir(fdir, 0666); err != nil {
t.Skip(err)
}
defer os.RemoveAll(fdir)
if !Exist(fdir) {
t.Fatalf("expected Exist true, got %v", Exist(fdir))
}
f, err := ioutil.TempFile(os.TempDir(), "fileutil")
if err != nil {
t.Fatal(err)

16
pkg/stringutil/doc.go Normal file
View File

@ -0,0 +1,16 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package stringutil exports string utility functions.
package stringutil

View File

@ -1,4 +1,4 @@
// Copyright 2016 The etcd Authors
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,41 +12,40 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Package stringutil exports string utility functions.
package stringutil
import "math/rand"
const (
chars = "abcdefghijklmnopqrstuvwxyz0123456789"
import (
"math/rand"
"time"
)
// UniqueStrings returns a slice of randomly generated unique strings.
func UniqueStrings(maxlen uint, n int) []string {
exist := make(map[string]bool)
ss := make([]string, 0)
func UniqueStrings(slen uint, n int) (ss []string) {
exist := make(map[string]struct{})
ss = make([]string, 0, n)
for len(ss) < n {
s := randomString(maxlen)
if !exist[s] {
exist[s] = true
s := randString(slen)
if _, ok := exist[s]; !ok {
ss = append(ss, s)
exist[s] = struct{}{}
}
}
return ss
}
// RandomStrings returns a slice of randomly generated strings.
func RandomStrings(maxlen uint, n int) []string {
ss := make([]string, 0)
func RandomStrings(slen uint, n int) (ss []string) {
ss = make([]string, 0, n)
for i := 0; i < n; i++ {
ss = append(ss, randomString(maxlen))
ss = append(ss, randString(slen))
}
return ss
}
func randomString(l uint) string {
const chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
func randString(l uint) string {
rand.Seed(time.Now().UnixNano())
s := make([]byte, l)
for i := 0; i < int(l); i++ {
s[i] = chars[rand.Intn(len(chars))]

View File

@ -0,0 +1,30 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stringutil
import (
"fmt"
"testing"
)
// TestUniqueStrings verifies that UniqueStrings returns the requested
// number of strings with no duplicates anywhere in the slice. The
// previous adjacent-pair comparison (ss[i-1] == ss[i]) could miss
// non-adjacent duplicates, since the result is not sorted.
func TestUniqueStrings(t *testing.T) {
	ss := UniqueStrings(10, 50)
	if len(ss) != 50 {
		t.Fatalf("expected 50 strings, got %d", len(ss))
	}
	seen := make(map[string]struct{}, len(ss))
	for _, s := range ss {
		if _, ok := seen[s]; ok {
			t.Fatalf("duplicate string %q", s)
		}
		seen[s] = struct{}{}
	}
	// Keep the debug dump, but only under -v so normal runs stay quiet.
	if testing.Verbose() {
		fmt.Println(ss)
	}
}

4
test
View File

@ -37,7 +37,7 @@ source ./build
# build before setting up test GOPATH
if [[ "${PASSES}" == *"functional"* ]]; then
./tools/functional-tester/build
./functional/build
fi
if [ -z "$PASSES" ]; then
@ -196,7 +196,7 @@ function functional_pass {
done
echo "Starting 'etcd-tester'"
./bin/etcd-tester --config ./tools/functional-tester/tester/local-test.yaml && echo "'etcd-tester' succeeded"
./bin/etcd-tester --config ./functional.yaml && echo "'etcd-tester' succeeded"
ETCD_TESTER_EXIT_CODE=$?
echo "ETCD_TESTER_EXIT_CODE:" ${ETCD_TESTER_EXIT_CODE}

View File

@ -1,14 +0,0 @@
s1: bin/etcd --name s1 --data-dir /tmp/etcd-test-proxy-data.s1 --listen-client-urls http://127.0.0.1:1379 --advertise-client-urls http://127.0.0.1:13790 --listen-peer-urls http://127.0.0.1:1380 --initial-advertise-peer-urls http://127.0.0.1:13800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s1-client-proxy: bin/etcd-test-proxy --from localhost:13790 --to localhost:1379 --http-port 1378
s1-peer-proxy: bin/etcd-test-proxy --from localhost:13800 --to localhost:1380 --http-port 1381
s2: bin/etcd --name s2 --data-dir /tmp/etcd-test-proxy-data.s2 --listen-client-urls http://127.0.0.1:2379 --advertise-client-urls http://127.0.0.1:23790 --listen-peer-urls http://127.0.0.1:2380 --initial-advertise-peer-urls http://127.0.0.1:23800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s2-client-proxy: bin/etcd-test-proxy --from localhost:23790 --to localhost:2379 --http-port 2378
s2-peer-proxy: bin/etcd-test-proxy --from localhost:23800 --to localhost:2380 --http-port 2381
s3: bin/etcd --name s3 --data-dir /tmp/etcd-test-proxy-data.s3 --listen-client-urls http://127.0.0.1:3379 --advertise-client-urls http://127.0.0.1:33790 --listen-peer-urls http://127.0.0.1:3380 --initial-advertise-peer-urls http://127.0.0.1:33800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new
s3-client-proxy: bin/etcd-test-proxy --from localhost:33790 --to localhost:3379 --http-port 3378
s3-peer-proxy: bin/etcd-test-proxy --from localhost:33800 --to localhost:3380 --http-port 3381

View File

@ -1,33 +0,0 @@
# etcd functional test suite
etcd functional test suite tests the functionality of an etcd cluster with a focus on failure resistance under high pressure. It sets up an etcd cluster and injects failures into the cluster by killing a process or isolating the network of a process. It expects the etcd cluster to recover within a short amount of time after the fault is fixed.
etcd functional test suite has two components: etcd-agent and etcd-tester. etcd-agent runs on every test machine, and etcd-tester is a single controller of the test. The tester controls the agents: starting etcd processes, stopping and terminating them, injecting failures, and so on.
### Run locally
```bash
PASSES=functional ./test
```
### Run with Docker
To run locally, first build tester image:
```bash
pushd ../..
make build-docker-functional-tester
popd
```
And run [example scripts](./scripts).
```bash
# run 3 agents for 3-node local etcd cluster
./scripts/docker-local-agent.sh 1
./scripts/docker-local-agent.sh 2
./scripts/docker-local-agent.sh 3
# to run only 1 tester round
./scripts/docker-local-tester.sh
```

View File

@ -1,10 +0,0 @@
#!/usr/bin/env bash

# Must run from the repository root: all paths below are repo-relative.
if ! [[ "$0" =~ "tools/functional-tester/build" ]]; then
	echo "must be run from repository root"
	exit 255
fi

# Build each functional-tester component as a static, stripped binary
# (cgo disabled, -a forces a full rebuild with the cgo-free suffix).
for tool in agent tester runner; do
	CGO_ENABLED=0 go build -a -installsuffix cgo -ldflags "-s" -o "bin/etcd-${tool}" "./tools/functional-tester/cmd/etcd-${tool}"
done

View File

@ -1,249 +0,0 @@
syntax = "proto3";
package rpcpb;
import "github.com/gogo/protobuf/gogoproto/gogo.proto";
option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;
// Transport is a bidirectional stream between the tester (client) and
// an agent (server): the tester streams Requests, the agent streams
// back Responses.
service Transport {
  rpc Transport(stream Request) returns (stream Response) {}
}
// Operation is the action the tester asks an agent to perform on its
// etcd process or network.
enum Operation {
  NotStarted = 0;

  // InitialStartEtcd is only called to start etcd very first time.
  InitialStartEtcd = 1;
  // RestartEtcd is sent to restart killed etcd.
  RestartEtcd = 2;
  // KillEtcd pauses etcd process while keeping data directories
  // and previous etcd configurations.
  KillEtcd = 3;
  // FailArchive is sent when consistency check failed,
  // thus need to archive etcd data directories.
  FailArchive = 4;
  // DestroyEtcdAgent destroys etcd process, etcd data, and agent server.
  DestroyEtcdAgent = 5;

  // Network fault operations on the peer port; presumably applied by
  // the agent's proxy layer (see EtcdPeerProxy in Member) — confirm.
  BlackholePeerPortTxRx = 100;
  UnblackholePeerPortTxRx = 101;
  DelayPeerPortTxRx = 102;
  UndelayPeerPortTxRx = 103;
}
// Etcd defines the etcd server configuration flags for one member;
// each field maps to an etcd YAML flag via the moretags annotation.
message Etcd {
  string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
  string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
  string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];

  // HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
  // Default value is 100, which is 100ms.
  int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
  // ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
  // Default value is 1000, which is 1s.
  int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];

  // Client-side listening/advertising and TLS configuration.
  repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
  repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
  bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
  bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
  string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
  string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
  string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];

  // Peer-side listening/advertising and TLS configuration.
  repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
  repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
  bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
  bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
  string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
  string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
  string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];

  // Cluster bootstrap configuration.
  string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
  string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
  string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];

  int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
  int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];

  bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
  bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
}
// Member holds the per-member configuration the tester sends to each
// agent: where the etcd binary and data live, how to reach the agent,
// proxy settings, and TLS material (both file paths and raw contents).
message Member {
  // EtcdExecPath is the executable etcd binary path in agent server.
  string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];
  // TODO: support embedded etcd

  // AgentAddr is the agent HTTP server address.
  string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
  // FailpointHTTPAddr is the agent's failpoints HTTP server address.
  string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];

  // BaseDir is the base directory where all logs and etcd data are stored.
  string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
  // EtcdLogPath is the log file to store current etcd server logs.
  string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];

  // EtcdClientProxy is true when client traffic needs to be proxied.
  // If true, listen client URL port must be different than advertise client URL port.
  bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
  // EtcdPeerProxy is true when peer traffic needs to be proxied.
  // If true, listen peer URL port must be different than advertise peer URL port.
  bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];

  // EtcdClientEndpoint is the etcd client endpoint.
  string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
  // Etcd defines etcd binary configuration flags.
  Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];

  // ClientCertData contains cert file contents from this member's etcd server.
  string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
  string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
  // ClientKeyData contains key file contents from this member's etcd server.
  string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
  string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
  // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
  string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
  string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];

  // PeerCertData contains cert file contents from this member's etcd server.
  string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
  string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
  // PeerKeyData contains key file contents from this member's etcd server.
  string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
  string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
  // PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
  string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
  string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
}
// FailureCase enumerates the fault-injection scenarios the tester can
// schedule: process kills (0-5), peer-port blackholes (100s),
// peer-port delays (200s), no-op cases (300s), failpoints (400), and
// an external injector (500).
enum FailureCase {
  KILL_ONE_FOLLOWER = 0;
  KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;
  KILL_LEADER = 2;
  KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;
  KILL_QUORUM = 4;
  KILL_ALL = 5;

  // Drop peer-port traffic for the selected member set.
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;
  BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;
  BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;
  BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;
  BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;
  BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;

  // Delay peer-port traffic; RANDOM_ variants randomize the latency.
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;
  DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;
  RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;
  DELAY_PEER_PORT_TX_RX_LEADER = 204;
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;
  DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;
  RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;
  DELAY_PEER_PORT_TX_RX_QUORUM = 208;
  RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;
  DELAY_PEER_PORT_TX_RX_ALL = 210;
  RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;

  // NO_FAIL_WITH_STRESS runs no-op failure injection for specified period
  // while stressers are still sending requests.
  NO_FAIL_WITH_STRESS = 300;
  // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS runs no-op failure injection
  // with all stressers stopped.
  NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;

  FAILPOINTS = 400;
  EXTERNAL = 500;
}
// StressType selects which stresser implementation to run against the
// cluster (see Tester.StressTypes).
enum StressType {
  KV = 0;
  LEASE = 1;
  ELECTION_RUNNER = 2;
  WATCH_RUNNER = 3;
  LOCK_RACER_RUNNER = 4;
  LEASE_RUNNER = 5;
}
// Tester holds the tester-side configuration: network fault latencies,
// round limits, which failure cases to schedule, and stresser tuning.
message Tester {
  string TesterDataDir = 1 [(gogoproto.moretags) = "yaml:\"tester-data-dir\""];
  string TesterNetwork = 2 [(gogoproto.moretags) = "yaml:\"tester-network\""];
  string TesterAddr = 3 [(gogoproto.moretags) = "yaml:\"tester-addr\""];

  // DelayLatencyMs is the delay latency in milliseconds,
  // to inject to simulated slow network.
  uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
  // DelayLatencyMsRv is the delay latency random variable in milliseconds.
  uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
  // UpdatedDelayLatencyMs is the update delay latency in milliseconds,
  // to inject to simulated slow network. It's the final latency to apply,
  // in case the latency numbers are randomly generated from given delay latency field.
  uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

  // RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
  int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
  // ExitOnFailure is true to exit the tester on the first failure.
  bool ExitOnFailure = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
  // ConsistencyCheck is true to check consistency (revision, hash).
  bool ConsistencyCheck = 23 [(gogoproto.moretags) = "yaml:\"consistency-check\""];
  // EnablePprof is true to enable profiler.
  bool EnablePprof = 24 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

  // FailureCases is the selected test cases to schedule.
  // If empty, run all failure cases.
  repeated string FailureCases = 31 [(gogoproto.moretags) = "yaml:\"failure-cases\""];
  // FailureDelayMs is the delay duration after failure is injected.
  // Useful when triggering snapshot or no-op failure cases.
  uint32 FailureDelayMs = 32 [(gogoproto.moretags) = "yaml:\"failure-delay-ms\""];
  // FailureShuffle is true to randomize failure injecting order.
  bool FailureShuffle = 33 [(gogoproto.moretags) = "yaml:\"failure-shuffle\""];
  // FailpointCommands is the list of "gofail" commands (e.g. panic("etcd-tester"),1*sleep(1000)).
  repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

  // RunnerExecPath is a path of etcd-runner binary.
  string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
  // ExternalExecPath is a path of script for enabling/disabling an external fault injector.
  string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

  // StressTypes is the list of stresser names:
  // keys, lease, nop, election-runner, watch-runner, lock-racer-runner, lease-runner.
  repeated string StressTypes = 101 [(gogoproto.moretags) = "yaml:\"stress-types\""];
  // StressKeySize is the size of each small key written into etcd.
  int32 StressKeySize = 102 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
  // StressKeySizeLarge is the size of each large key written into etcd.
  int32 StressKeySizeLarge = 103 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
  // StressKeySuffixRange is the count of key range written into etcd.
  // Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)".
  int32 StressKeySuffixRange = 104 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
  // StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
  // Stress keys are created with "fmt.Sprintf("/k%03d", i)".
  int32 StressKeySuffixRangeTxn = 105 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
  // StressKeyTxnOps is the number of operations per a transaction (max 64).
  int32 StressKeyTxnOps = 106 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

  // StressClients is the number of concurrent stressing clients
  // with "one" shared TCP connection.
  int32 StressClients = 201 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
  // StressQPS is the maximum number of stresser requests per second.
  int32 StressQPS = 202 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}
// Request is a tester-to-agent message carrying the operation to
// perform plus the member and tester configuration it applies to.
message Request {
  Operation Operation = 1;
  // Member contains the same Member object from tester configuration.
  Member Member = 2;
  // Tester contains tester configuration.
  Tester Tester = 3;
}
// Response is an agent-to-tester reply reporting whether the requested
// operation succeeded, with a human-readable status.
message Response {
  bool Success = 1;
  string Status = 2;
  // Member contains the same Member object from tester request.
  Member Member = 3;
}

View File

@ -1,73 +0,0 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import "github.com/coreos/etcd/tools/functional-tester/rpcpb"
// injectKill asks the agent of member idx to kill its etcd process
// (Operation_KillEtcd pauses the process while keeping data
// directories and previous configuration; see rpcpb).
func injectKill(clus *Cluster, idx int) error {
	return clus.sendOperation(idx, rpcpb.Operation_KillEtcd)
}
// recoverKill restarts the previously killed etcd process of member
// idx via Operation_RestartEtcd.
func recoverKill(clus *Cluster, idx int) error {
	return clus.sendOperation(idx, rpcpb.Operation_RestartEtcd)
}
// newFailureKillOneFollower returns a Failure that kills one follower
// and recovers it by restarting etcd. The two trailing -1 fields mark
// the follower selection as not-yet-chosen.
func newFailureKillOneFollower() Failure {
	return &failureFollower{
		failureByFunc{
			failureCase:   rpcpb.FailureCase_KILL_ONE_FOLLOWER,
			injectMember:  injectKill,
			recoverMember: recoverKill,
		},
		-1,
		-1,
	}
}
// newFailureKillLeader returns a Failure that kills the leader and
// recovers it by restarting etcd. The two trailing -1 fields mark the
// leader selection as not-yet-chosen.
func newFailureKillLeader() Failure {
	return &failureLeader{
		failureByFunc{
			failureCase:   rpcpb.FailureCase_KILL_LEADER,
			injectMember:  injectKill,
			recoverMember: recoverKill,
		},
		-1,
		-1,
	}
}
// newFailureKillQuorum returns a Failure that kills a quorum of
// members and recovers them by restarting etcd.
func newFailureKillQuorum() Failure {
	f := failureQuorum{
		failureCase:   rpcpb.FailureCase_KILL_QUORUM,
		injectMember:  injectKill,
		recoverMember: recoverKill,
	}
	return &f
}
// newFailureKillAll returns a Failure that kills every member and
// recovers them all by restarting etcd.
func newFailureKillAll() Failure {
	f := failureAll{
		failureCase:   rpcpb.FailureCase_KILL_ALL,
		injectMember:  injectKill,
		recoverMember: recoverKill,
	}
	return &f
}
// newFailureKillOneFollowerUntilTriggerSnapshot wraps the
// kill-one-follower failure so the fault is held until a snapshot is
// triggered, reporting the matching UNTIL_TRIGGER_SNAPSHOT case.
func newFailureKillOneFollowerUntilTriggerSnapshot() Failure {
	return &failureUntilSnapshot{
		Failure:     newFailureKillOneFollower(),
		failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
	}
}
// newFailureKillLeaderUntilTriggerSnapshot wraps the kill-leader
// failure so the fault is held until a snapshot is triggered,
// reporting the matching UNTIL_TRIGGER_SNAPSHOT case.
func newFailureKillLeaderUntilTriggerSnapshot() Failure {
	return &failureUntilSnapshot{
		Failure:     newFailureKillLeader(),
		failureCase: rpcpb.FailureCase_KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT,
	}
}