Merge pull request #9548 from gyuho/functional-tester
functional-tester: clean up, handle Operation_SIGQUIT_ETCD_AND_REMOVE_DATA
commit 10a51a3003
@@ -30,7 +30,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.0...v3.4.0) and [
- Furthermore, when `--auto-compaction-mode=periodic --auto-compaction-retention=30m` and writes per minute are about 1000, `v3.3.0`, `v3.3.1`, and `v3.3.2` compact revisions 30000, 33000, and 36000 every 3 minutes, while `v3.3.3` *or later* compacts revisions 30000, 60000, and 90000 every 30 minutes.
- Improve [lease expire/revoke operation performance](https://github.com/coreos/etcd/pull/9418), address [lease scalability issue](https://github.com/coreos/etcd/issues/9496).
- Make [Lease `Lookup` non-blocking with concurrent `Grant`/`Revoke`](https://github.com/coreos/etcd/pull/9229).
- Improve [functional tester](https://github.com/coreos/etcd/tree/master/tools/functional-tester) coverage: use [proxy layer to run network fault tests in CI](https://github.com/coreos/etcd/pull/9081), enable [TLS both for server and client](https://github.com/coreos/etcd/pull/9534), add [liveness mode](https://github.com/coreos/etcd/issues/9230), and [shuffle test sequence](https://github.com/coreos/etcd/issues/9381).
- Improve [functional tester](https://github.com/coreos/etcd/tree/master/functional) coverage: use [proxy layer to run network fault tests in CI](https://github.com/coreos/etcd/pull/9081), enable [TLS both for server and client](https://github.com/coreos/etcd/pull/9534), add [liveness mode](https://github.com/coreos/etcd/issues/9230), and [shuffle test sequence](https://github.com/coreos/etcd/issues/9381).

### Breaking Changes
@@ -22,7 +22,7 @@ RUN rm -rf ${GOROOT} \

RUN mkdir -p ${GOPATH}/src/github.com/coreos/etcd
ADD . ${GOPATH}/src/github.com/coreos/etcd
ADD ./tools/functional-tester/tester/local-test.yaml /local-test.yaml
ADD ./functional.yaml /functional.yaml

RUN go get -v github.com/coreos/gofail \
&& pushd ${GOPATH}/src/github.com/coreos/etcd \
@@ -32,11 +32,11 @@ RUN go get -v github.com/coreos/gofail \
&& cp ./bin/etcdctl /bin/etcdctl \
&& GO_BUILD_FLAGS="-v" FAILPOINTS=1 ./build \
&& cp ./bin/etcd /bin/etcd-failpoints \
&& ./tools/functional-tester/build \
&& ./functional/build \
&& cp ./bin/etcd-agent /bin/etcd-agent \
&& cp ./bin/etcd-tester /bin/etcd-tester \
&& cp ./bin/etcd-proxy /bin/etcd-proxy \
&& cp ./bin/etcd-runner /bin/etcd-runner \
&& cp ./bin/etcd-tester /bin/etcd-tester \
&& go build -v -o /bin/benchmark ./tools/benchmark \
&& go build -v -o /bin/etcd-test-proxy ./tools/etcd-test-proxy \
&& popd \
&& rm -rf ${GOPATH}/src/github.com/coreos/etcd
Makefile (50 lines changed)
@@ -469,46 +469,48 @@ docker-dns-srv-test-certs-wildcard-run:


# Example:
# make build-etcd-test-proxy
# make build-functional
# make build-docker-functional
# make push-docker-functional
# make pull-docker-functional

build-etcd-test-proxy:
go build -v -o ./bin/etcd-test-proxy ./tools/etcd-test-proxy



# Example:
# make build-docker-functional-tester
# make push-docker-functional-tester
# make pull-docker-functional-tester

build-docker-functional-tester:
build-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
@sed -i.bak 's|REPLACE_ME_GO_VERSION|$(GO_VERSION)|g' ./Dockerfile-functional-tester
./functional/build
./bin/etcd-agent -help || true && \
./bin/etcd-proxy -help || true && \
./bin/etcd-runner --help || true && \
./bin/etcd-tester -help || true

build-docker-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
@sed -i.bak 's|REPLACE_ME_GO_VERSION|$(GO_VERSION)|g' ./Dockerfile-functional
docker build \
--tag gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION) \
--file ./Dockerfile-functional-tester \
--tag gcr.io/etcd-development/etcd-functional:go$(GO_VERSION) \
--file ./Dockerfile-functional \
.
@mv ./Dockerfile-functional-tester.bak ./Dockerfile-functional-tester
@mv ./Dockerfile-functional.bak ./Dockerfile-functional

docker run \
--rm \
gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION) \
gcr.io/etcd-development/etcd-functional:go$(GO_VERSION) \
/bin/bash -c "./bin/etcd --version && \
./bin/etcd-failpoints --version && \
ETCDCTL_API=3 ./bin/etcdctl version && \
./bin/etcd-agent -help || true && \
./bin/etcd-tester -help || true && \
./bin/etcd-proxy -help || true && \
./bin/etcd-runner --help || true && \
./bin/benchmark --help || true && \
./bin/etcd-test-proxy -help || true"
./bin/etcd-tester -help || true && \
./bin/benchmark --help || true"

push-docker-functional-tester:
push-docker-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
gcloud docker -- push gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION)
gcloud docker -- push gcr.io/etcd-development/etcd-functional:go$(GO_VERSION)

pull-docker-functional-tester:
pull-docker-functional:
$(info GO_VERSION: $(GO_VERSION))
$(info ETCD_VERSION: $(ETCD_VERSION))
docker pull gcr.io/etcd-development/etcd-functional-tester:go$(GO_VERSION)
docker pull gcr.io/etcd-development/etcd-functional:go$(GO_VERSION)
@@ -106,9 +106,9 @@ agent-configs:
initial-corrupt-check: true

tester-config:
tester-data-dir: /tmp/etcd-tester-data
tester-network: tcp
tester-addr: 127.0.0.1:9028
data-dir: /tmp/etcd-tester-data
network: tcp
addr: 127.0.0.1:9028

# slow enough to trigger election
delay-latency-ms: 5000
@@ -119,13 +119,15 @@ tester-config:
consistency-check: true
enable-pprof: true

failure-delay-ms: 7000
failure-shuffle: true
failure-cases:
- KILL_ONE_FOLLOWER
- KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- KILL_LEADER
- KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT
- KILL_QUORUM
- KILL_ALL
- SIGTERM_ONE_FOLLOWER
- SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_LEADER
- SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT
- SIGTERM_QUORUM
- SIGTERM_ALL
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER
- BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
- BLACKHOLE_PEER_PORT_TX_RX_LEADER
@@ -147,14 +149,11 @@ tester-config:
- NO_FAIL_WITH_STRESS
- NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS

failure-delay-ms: 7000
failure-shuffle: true
failpoint-commands:
- panic("etcd-tester")
# failpoint-commands:
# - panic("etcd-tester"),1*sleep(1000)

runner-exec-path: /etcd-runner
runner-exec-path: ./bin/etcd-runner
external-exec-path: ""

stress-types:
functional/Procfile-proxy (new file, 14 lines)
@@ -0,0 +1,14 @@
s1: bin/etcd --name s1 --data-dir /tmp/etcd-proxy-data.s1 --listen-client-urls http://127.0.0.1:1379 --advertise-client-urls http://127.0.0.1:13790 --listen-peer-urls http://127.0.0.1:1380 --initial-advertise-peer-urls http://127.0.0.1:13800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new

s1-client-proxy: bin/etcd-proxy --from localhost:13790 --to localhost:1379 --http-port 1378
s1-peer-proxy: bin/etcd-proxy --from localhost:13800 --to localhost:1380 --http-port 1381

s2: bin/etcd --name s2 --data-dir /tmp/etcd-proxy-data.s2 --listen-client-urls http://127.0.0.1:2379 --advertise-client-urls http://127.0.0.1:23790 --listen-peer-urls http://127.0.0.1:2380 --initial-advertise-peer-urls http://127.0.0.1:23800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new

s2-client-proxy: bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378
s2-peer-proxy: bin/etcd-proxy --from localhost:23800 --to localhost:2380 --http-port 2381

s3: bin/etcd --name s3 --data-dir /tmp/etcd-proxy-data.s3 --listen-client-urls http://127.0.0.1:3379 --advertise-client-urls http://127.0.0.1:33790 --listen-peer-urls http://127.0.0.1:3380 --initial-advertise-peer-urls http://127.0.0.1:33800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new

s3-client-proxy: bin/etcd-proxy --from localhost:33790 --to localhost:3379 --http-port 3378
s3-peer-proxy: bin/etcd-proxy --from localhost:33800 --to localhost:3380 --http-port 3381
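Note that each member's advertise URLs (e.g. `http://127.0.0.1:13790` and `http://127.0.0.1:13800` for `s1`) point at the `etcd-proxy` listeners rather than at etcd itself, so all client and peer traffic flows through the fault-injectable proxy layer. A minimal sketch of a local run, assuming [goreman](https://github.com/mattn/goreman) is installed:

```bash
# from the repository root, after building etcd and the functional tools
./build
make build-functional

rm -rf /tmp/etcd-proxy-data.s*
goreman -f ./functional/Procfile-proxy start

# writes go through s1's client proxy on 13790 instead of directly to 1379
ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:13790 put foo bar
```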
@@ -1,4 +1,36 @@
#### etcd-test-proxy
## etcd Functional Testing

`functional` verifies the correct behavior of etcd under various system and network malfunctions. It sets up an etcd cluster under high pressure loads and continuously injects failures into the cluster. Then it expects the etcd cluster to recover within a few seconds. This has been extremely helpful in finding critical bugs.

See [functional.yaml](../functional.yaml) for an example configuration.

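For orientation, a trimmed `tester-config` section of that file looks roughly like the sketch below; the key names come from the `functional.yaml` diff in this commit, while the exact values are illustrative:

```yaml
tester-config:
  data-dir: /tmp/etcd-tester-data
  network: tcp
  addr: 127.0.0.1:9028

  delay-latency-ms: 5000   # slow enough to trigger election
  consistency-check: true
  enable-pprof: true

  failure-delay-ms: 7000
  failure-shuffle: true
  failure-cases:           # empty means "run all failure cases"
  - SIGTERM_ONE_FOLLOWER
  - BLACKHOLE_PEER_PORT_TX_RX_LEADER

  failpoint-commands:
  - panic("etcd-tester")

  runner-exec-path: ./bin/etcd-runner
  external-exec-path: ""

  stress-types:
  - keys
  - lease
```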
### Run locally

```bash
PASSES=functional ./test
```

### Run with Docker

```bash
pushd ..
make build-docker-functional
popd
```

And run [example scripts](./scripts).

```bash
# run 3 agents for 3-node local etcd cluster
./scripts/docker-local-agent.sh 1
./scripts/docker-local-agent.sh 2
./scripts/docker-local-agent.sh 3

# to run only 1 tester round
./scripts/docker-local-tester.sh
```

## etcd Proxy

Proxy layer that simulates various network conditions.

@@ -8,10 +40,10 @@ Test locally
$ ./build
$ ./bin/etcd

$ make build-etcd-test-proxy
$ make build-functional

$ ./bin/etcd-test-proxy --help
$ ./bin/etcd-test-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ./bin/etcd-proxy --help
$ ./bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose

$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:2379 put foo bar
$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:23790 put foo bar
@@ -168,10 +200,10 @@ Trigger leader election

```bash
$ ./build
$ make build-etcd-test-proxy
$ make build-functional

$ rm -rf /tmp/etcd-test-proxy-data.s*
$ goreman -f ./tools/etcd-test-proxy/Procfile start
$ rm -rf /tmp/etcd-proxy-data.s*
$ goreman -f ./functional/Procfile-proxy start

$ ETCDCTL_API=3 ./bin/etcdctl \
--endpoints localhost:13790,localhost:23790,localhost:33790 \
@@ -25,9 +25,9 @@ import (
"syscall"
"time"

"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/coreos/etcd/pkg/proxy"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"

"go.uber.org/zap"
)
@@ -36,32 +36,40 @@ import (
// return status error in response for wrong configuration/operation (e.g. start etcd twice)
func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response, err error) {
defer func() {
if err == nil {
if err == nil && req != nil {
srv.last = req.Operation
srv.lg.Info("handler success", zap.String("operation", req.Operation.String()))
}
}()
if req != nil {
srv.Member = req.Member
srv.Tester = req.Tester
}

switch req.Operation {
case rpcpb.Operation_InitialStartEtcd:
return srv.handleInitialStartEtcd(req)
case rpcpb.Operation_RestartEtcd:
return srv.handleRestartEtcd()
case rpcpb.Operation_KillEtcd:
return srv.handleKillEtcd()
case rpcpb.Operation_FailArchive:
return srv.handleFailArchive()
case rpcpb.Operation_DestroyEtcdAgent:
return srv.handleDestroyEtcdAgent()
case rpcpb.Operation_INITIAL_START_ETCD:
return srv.handle_INITIAL_START_ETCD(req)
case rpcpb.Operation_RESTART_ETCD:
return srv.handle_RESTART_ETCD()

case rpcpb.Operation_BlackholePeerPortTxRx:
return srv.handleBlackholePeerPortTxRx()
case rpcpb.Operation_UnblackholePeerPortTxRx:
return srv.handleUnblackholePeerPortTxRx()
case rpcpb.Operation_DelayPeerPortTxRx:
return srv.handleDelayPeerPortTxRx()
case rpcpb.Operation_UndelayPeerPortTxRx:
return srv.handleUndelayPeerPortTxRx()
case rpcpb.Operation_SIGTERM_ETCD:
return srv.handle_SIGTERM_ETCD()
case rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_REMOVE_DATA()

case rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA:
return srv.handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA()
case rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT:
return srv.handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()

case rpcpb.Operation_BLACKHOLE_PEER_PORT_TX_RX:
return srv.handle_BLACKHOLE_PEER_PORT_TX_RX()
case rpcpb.Operation_UNBLACKHOLE_PEER_PORT_TX_RX:
return srv.handle_UNBLACKHOLE_PEER_PORT_TX_RX()
case rpcpb.Operation_DELAY_PEER_PORT_TX_RX:
return srv.handle_DELAY_PEER_PORT_TX_RX()
case rpcpb.Operation_UNDELAY_PEER_PORT_TX_RX:
return srv.handle_UNDELAY_PEER_PORT_TX_RX()

default:
msg := fmt.Sprintf("operation not found (%v)", req.Operation)
@@ -69,18 +77,15 @@ func (srv *Server) handleTesterRequest(req *rpcpb.Request) (resp *rpcpb.Response
}
}

func (srv *Server) handleInitialStartEtcd(req *rpcpb.Request) (*rpcpb.Response, error) {
if srv.last != rpcpb.Operation_NotStarted {
func (srv *Server) handle_INITIAL_START_ETCD(req *rpcpb.Request) (*rpcpb.Response, error) {
if srv.last != rpcpb.Operation_NOT_STARTED {
return &rpcpb.Response{
Success: false,
Status: fmt.Sprintf("%q is not valid; last server operation was %q", rpcpb.Operation_InitialStartEtcd.String(), srv.last.String()),
Status: fmt.Sprintf("%q is not valid; last server operation was %q", rpcpb.Operation_INITIAL_START_ETCD.String(), srv.last.String()),
Member: req.Member,
}, nil
}

srv.Member = req.Member
srv.Tester = req.Tester

err := fileutil.TouchDirAll(srv.Member.BaseDir)
if err != nil {
return nil, err
@@ -118,7 +123,6 @@ func (srv *Server) handleInitialStartEtcd(req *rpcpb.Request) (*rpcpb.Response,
}, nil
}

// TODO: support TLS
func (srv *Server) startProxy() error {
if srv.Member.EtcdClientProxy {
advertiseClientURL, advertiseClientURLPort, err := getURLAndPort(srv.Member.Etcd.AdvertiseClientURLs[0])
@@ -236,45 +240,124 @@ func (srv *Server) creatEtcdCmd() {
srv.etcdCmd.Stderr = srv.etcdLogFile
}

// if started with manual TLS, stores TLS assets
// from tester/client to disk before starting etcd process
func (srv *Server) saveTLSAssets() error {
// if started with manual TLS, stores TLS assets
// from tester/client to disk before starting etcd process
// TODO: not implemented yet
if !srv.Member.Etcd.ClientAutoTLS {
if srv.Member.Etcd.ClientCertAuth {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientCertAuth is %v", srv.Member.Etcd.ClientCertAuth)
if srv.Member.PeerCertPath != "" {
if srv.Member.PeerCertData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.PeerCertPath)
}
if srv.Member.Etcd.ClientCertFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientCertFile is %q", srv.Member.Etcd.ClientCertFile)
}
if srv.Member.Etcd.ClientKeyFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientKeyFile is %q", srv.Member.Etcd.ClientKeyFile)
}
if srv.Member.Etcd.ClientTrustedCAFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.ClientTrustedCAFile is %q", srv.Member.Etcd.ClientTrustedCAFile)
if err := ioutil.WriteFile(srv.Member.PeerCertPath, []byte(srv.Member.PeerCertData), 0644); err != nil {
return err
}
}
if !srv.Member.Etcd.PeerAutoTLS {
if srv.Member.Etcd.PeerClientCertAuth {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerClientCertAuth is %v", srv.Member.Etcd.PeerClientCertAuth)
if srv.Member.PeerKeyPath != "" {
if srv.Member.PeerKeyData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.PeerKeyPath)
}
if srv.Member.Etcd.PeerCertFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerCertFile is %q", srv.Member.Etcd.PeerCertFile)
if err := ioutil.WriteFile(srv.Member.PeerKeyPath, []byte(srv.Member.PeerKeyData), 0644); err != nil {
return err
}
if srv.Member.Etcd.PeerKeyFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerKeyFile is %q", srv.Member.Etcd.PeerKeyFile)
}
if srv.Member.PeerTrustedCAPath != "" {
if srv.Member.PeerTrustedCAData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.PeerTrustedCAPath)
}
if srv.Member.Etcd.PeerTrustedCAFile != "" {
return fmt.Errorf("manual TLS setup is not implemented yet, but Member.Etcd.PeerTrustedCAFile is %q", srv.Member.Etcd.PeerTrustedCAFile)
if err := ioutil.WriteFile(srv.Member.PeerTrustedCAPath, []byte(srv.Member.PeerTrustedCAData), 0644); err != nil {
return err
}
}
if srv.Member.PeerCertPath != "" &&
srv.Member.PeerKeyPath != "" &&
srv.Member.PeerTrustedCAPath != "" {
srv.lg.Info(
"wrote",
zap.String("peer-cert", srv.Member.PeerCertPath),
zap.String("peer-key", srv.Member.PeerKeyPath),
zap.String("peer-trusted-ca", srv.Member.PeerTrustedCAPath),
)
}

if srv.Member.ClientCertPath != "" {
if srv.Member.ClientCertData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.ClientCertPath)
}
if err := ioutil.WriteFile(srv.Member.ClientCertPath, []byte(srv.Member.ClientCertData), 0644); err != nil {
return err
}
}
if srv.Member.ClientKeyPath != "" {
if srv.Member.ClientKeyData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.ClientKeyPath)
}
if err := ioutil.WriteFile(srv.Member.ClientKeyPath, []byte(srv.Member.ClientKeyData), 0644); err != nil {
return err
}
}
if srv.Member.ClientTrustedCAPath != "" {
if srv.Member.ClientTrustedCAData == "" {
return fmt.Errorf("got empty data for %q", srv.Member.ClientTrustedCAPath)
}
if err := ioutil.WriteFile(srv.Member.ClientTrustedCAPath, []byte(srv.Member.ClientTrustedCAData), 0644); err != nil {
return err
}
}
if srv.Member.ClientCertPath != "" &&
srv.Member.ClientKeyPath != "" &&
srv.Member.ClientTrustedCAPath != "" {
srv.lg.Info(
"wrote",
zap.String("client-cert", srv.Member.ClientCertPath),
zap.String("client-key", srv.Member.ClientKeyPath),
zap.String("client-trusted-ca", srv.Member.ClientTrustedCAPath),
)
}

// TODO
return nil
}

func (srv *Server) loadAutoTLSAssets() error {
// if started with auto TLS, sends back TLS assets to tester/client
if srv.Member.Etcd.PeerAutoTLS {
// in case of slow disk
time.Sleep(time.Second)

fdir := filepath.Join(srv.Member.Etcd.DataDir, "fixtures", "peer")

srv.lg.Info(
"loading client auto TLS assets",
zap.String("dir", fdir),
zap.String("endpoint", srv.EtcdClientEndpoint),
)

certPath := filepath.Join(fdir, "cert.pem")
if !fileutil.Exist(certPath) {
return fmt.Errorf("cannot find %q", certPath)
}
certData, err := ioutil.ReadFile(certPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", certPath, err)
}
srv.Member.PeerCertData = string(certData)

keyPath := filepath.Join(fdir, "key.pem")
if !fileutil.Exist(keyPath) {
return fmt.Errorf("cannot find %q", keyPath)
}
keyData, err := ioutil.ReadFile(keyPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", keyPath, err)
}
srv.Member.PeerKeyData = string(keyData)

srv.lg.Info(
"loaded peer auto TLS assets",
zap.String("peer-cert-path", certPath),
zap.Int("peer-cert-length", len(certData)),
zap.String("peer-key-path", keyPath),
zap.Int("peer-key-length", len(keyData)),
)
}

if srv.Member.Etcd.ClientAutoTLS {
// in case of slow disk
time.Sleep(time.Second)
@@ -315,46 +398,7 @@ func (srv *Server) loadAutoTLSAssets() error {
zap.Int("peer-key-length", len(keyData)),
)
}
if srv.Member.Etcd.ClientAutoTLS {
// in case of slow disk
time.Sleep(time.Second)

fdir := filepath.Join(srv.Member.Etcd.DataDir, "fixtures", "peer")

srv.lg.Info(
"loading client TLS assets",
zap.String("dir", fdir),
zap.String("endpoint", srv.EtcdClientEndpoint),
)

certPath := filepath.Join(fdir, "cert.pem")
if !fileutil.Exist(certPath) {
return fmt.Errorf("cannot find %q", certPath)
}
certData, err := ioutil.ReadFile(certPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", certPath, err)
}
srv.Member.PeerCertData = string(certData)

keyPath := filepath.Join(fdir, "key.pem")
if !fileutil.Exist(keyPath) {
return fmt.Errorf("cannot find %q", keyPath)
}
keyData, err := ioutil.ReadFile(keyPath)
if err != nil {
return fmt.Errorf("cannot read %q (%v)", keyPath, err)
}
srv.Member.PeerKeyData = string(keyData)

srv.lg.Info(
"loaded peer TLS assets",
zap.String("peer-cert-path", certPath),
zap.Int("peer-cert-length", len(certData)),
zap.String("peer-key-path", keyPath),
zap.Int("peer-key-length", len(keyData)),
)
}
return nil
}

@@ -363,10 +407,17 @@ func (srv *Server) startEtcdCmd() error {
return srv.etcdCmd.Start()
}

func (srv *Server) handleRestartEtcd() (*rpcpb.Response, error) {
func (srv *Server) handle_RESTART_ETCD() (*rpcpb.Response, error) {
var err error
if !fileutil.Exist(srv.Member.BaseDir) {
err = fileutil.TouchDirAll(srv.Member.BaseDir)
if err != nil {
return nil, err
}
}

srv.creatEtcdCmd()

var err error
if err = srv.saveTLSAssets(); err != nil {
return nil, err
}
@@ -394,7 +445,7 @@ func (srv *Server) handleRestartEtcd() (*rpcpb.Response, error) {
}, nil
}

func (srv *Server) handleKillEtcd() (*rpcpb.Response, error) {
func (srv *Server) handle_SIGTERM_ETCD() (*rpcpb.Response, error) {
srv.stopProxy()

err := stopWithSig(srv.etcdCmd, syscall.SIGTERM)
@@ -405,11 +456,32 @@ func (srv *Server) handleKillEtcd() (*rpcpb.Response, error) {

return &rpcpb.Response{
Success: true,
Status: "successfully killed etcd!",
Status: "killed etcd",
}, nil
}

func (srv *Server) handleFailArchive() (*rpcpb.Response, error) {
func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA() (*rpcpb.Response, error) {
srv.stopProxy()

err := stopWithSig(srv.etcdCmd, syscall.SIGQUIT)
if err != nil {
return nil, err
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
return nil, err
}
srv.lg.Info("removed base directory", zap.String("dir", srv.Member.BaseDir))

return &rpcpb.Response{
Success: true,
Status: "killed etcd and removed base directory",
}, nil
}

func (srv *Server) handle_SIGQUIT_ETCD_AND_ARCHIVE_DATA() (*rpcpb.Response, error) {
srv.stopProxy()

// exit with stacktrace
@@ -444,17 +516,19 @@ func (srv *Server) handleFailArchive() (*rpcpb.Response, error) {

return &rpcpb.Response{
Success: true,
Status: "successfully cleaned up etcd!",
Status: "cleaned up etcd",
}, nil
}

// stop proxy, etcd, delete data directory
func (srv *Server) handleDestroyEtcdAgent() (*rpcpb.Response, error) {
err := stopWithSig(srv.etcdCmd, syscall.SIGTERM)
func (srv *Server) handle_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() (*rpcpb.Response, error) {
srv.stopProxy()

err := stopWithSig(srv.etcdCmd, syscall.SIGQUIT)
if err != nil {
return nil, err
}
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGTERM.String()))
srv.lg.Info("killed etcd", zap.String("signal", syscall.SIGQUIT.String()))

err = os.RemoveAll(srv.Member.BaseDir)
if err != nil {
@@ -465,22 +539,13 @@ func (srv *Server) handleDestroyEtcdAgent() (*rpcpb.Response, error) {
// stop agent server
srv.Stop()

for port, px := range srv.advertiseClientPortToProxy {
err := px.Close()
srv.lg.Info("closed proxy", zap.Int("client-port", port), zap.Error(err))
}
for port, px := range srv.advertisePeerPortToProxy {
err := px.Close()
srv.lg.Info("closed proxy", zap.Int("peer-port", port), zap.Error(err))
}

return &rpcpb.Response{
Success: true,
Status: "successfully destroyed etcd and agent!",
Status: "destroyed etcd and agent",
}, nil
}

func (srv *Server) handleBlackholePeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_BLACKHOLE_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("blackholing", zap.Int("peer-port", port))
px.BlackholeTx()
@@ -489,11 +554,11 @@ func (srv *Server) handleBlackholePeerPortTxRx() (*rpcpb.Response, error) {
}
return &rpcpb.Response{
Success: true,
Status: "successfully blackholed peer port tx/rx!",
Status: "blackholed peer port tx/rx",
}, nil
}

func (srv *Server) handleUnblackholePeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_UNBLACKHOLE_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("unblackholing", zap.Int("peer-port", port))
px.UnblackholeTx()
@@ -502,11 +567,11 @@ func (srv *Server) handleUnblackholePeerPortTxRx() (*rpcpb.Response, error) {
}
return &rpcpb.Response{
Success: true,
Status: "successfully unblackholed peer port tx/rx!",
Status: "unblackholed peer port tx/rx",
}, nil
}

func (srv *Server) handleDelayPeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_DELAY_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
lat := time.Duration(srv.Tester.UpdatedDelayLatencyMs) * time.Millisecond
rv := time.Duration(srv.Tester.DelayLatencyMsRv) * time.Millisecond

@@ -527,11 +592,11 @@ func (srv *Server) handleDelayPeerPortTxRx() (*rpcpb.Response, error) {

return &rpcpb.Response{
Success: true,
Status: "successfully delay peer port tx/rx!",
Status: "delayed peer port tx/rx",
}, nil
}

func (srv *Server) handleUndelayPeerPortTxRx() (*rpcpb.Response, error) {
func (srv *Server) handle_UNDELAY_PEER_PORT_TX_RX() (*rpcpb.Response, error) {
for port, px := range srv.advertisePeerPortToProxy {
srv.lg.Info("undelaying", zap.Int("peer-port", port))
px.UndelayTx()
@@ -540,6 +605,6 @@ func (srv *Server) handleUndelayPeerPortTxRx() (*rpcpb.Response, error) {
}
return &rpcpb.Response{
Success: true,
Status: "successfully undelay peer port tx/rx!",
Status: "undelayed peer port tx/rx",
}, nil
}
@@ -21,8 +21,8 @@ import (
"os/exec"
"strings"

"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/proxy"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"

"go.uber.org/zap"
"google.golang.org/grpc"
@@ -64,7 +64,7 @@ func NewServer(
lg: lg,
network: network,
address: address,
last: rpcpb.Operation_NotStarted,
last: rpcpb.Operation_NOT_STARTED,
advertiseClientPortToProxy: make(map[int]proxy.Server),
advertisePeerPortToProxy: make(map[int]proxy.Server),
}
functional/build (new executable file, 11 lines)
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

if ! [[ "$0" =~ "functional/build" ]]; then
echo "must be run from repository root"
exit 255
fi

CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-agent ./functional/cmd/etcd-agent
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-proxy ./functional/cmd/etcd-proxy
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-runner ./functional/cmd/etcd-runner
CGO_ENABLED=0 go build -a -v -installsuffix cgo -ldflags "-s" -o ./bin/etcd-tester ./functional/cmd/etcd-tester
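The script refuses to run unless invoked as `functional/build` from the repository root, and the four binaries land in `./bin`, which is what the `build-functional` Makefile target above exercises. A minimal sanity check after building, mirroring that target:

```bash
# from the repository root
./functional/build
./bin/etcd-agent -help || true
./bin/etcd-proxy -help || true
./bin/etcd-runner --help || true
./bin/etcd-tester -help || true
```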
@@ -18,7 +18,7 @@ package main
import (
"flag"

"github.com/coreos/etcd/tools/functional-tester/agent"
"github.com/coreos/etcd/functional/agent"

"go.uber.org/zap"
)
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// etcd-test-proxy is a proxy layer that simulates various network conditions.
// etcd-proxy is a proxy layer that simulates various network conditions.
package main

import (
@@ -40,13 +40,13 @@ func main() {
// TODO: support TLS
flag.StringVar(&from, "from", "localhost:23790", "Address URL to proxy from.")
flag.StringVar(&to, "to", "localhost:2379", "Address URL to forward.")
flag.IntVar(&httpPort, "http-port", 2378, "Port to serve etcd-test-proxy API.")
flag.IntVar(&httpPort, "http-port", 2378, "Port to serve etcd-proxy API.")
flag.BoolVar(&verbose, "verbose", false, "'true' to run proxy in verbose mode.")

flag.Usage = func() {
fmt.Fprintf(os.Stderr, "Usage of %q:\n", os.Args[0])
fmt.Fprintln(os.Stderr, `
etcd-test-proxy simulates various network conditions for etcd testing purposes.
etcd-proxy simulates various network conditions for etcd testing purposes.
See README.md for more examples.

Example:
@@ -55,12 +55,12 @@ Example:
$ ./build
$ ./bin/etcd

# build etcd-test-proxy
$ make build-etcd-test-proxy
# build etcd-proxy
$ make build-etcd-proxy

# to test etcd with proxy layer
$ ./bin/etcd-test-proxy --help
$ ./bin/etcd-test-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose
$ ./bin/etcd-proxy --help
$ ./bin/etcd-proxy --from localhost:23790 --to localhost:2379 --http-port 2378 --verbose

$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:2379 put foo bar
$ ETCDCTL_API=3 ./bin/etcdctl --endpoints localhost:23790 put foo bar`)
@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// etcd-runner is a program for testing etcd clientv3 features against a fault injected cluster.
// etcd-runner is a program for testing etcd clientv3 features
// against a fault injected cluster.
package main

import "github.com/coreos/etcd/tools/functional-tester/runner"
import "github.com/coreos/etcd/functional/runner"

func main() {
runner.Start()
@@ -18,7 +18,7 @@ package main
import (
"flag"

"github.com/coreos/etcd/tools/functional-tester/tester"
"github.com/coreos/etcd/functional/tester"

"go.uber.org/zap"
)
@@ -44,11 +44,11 @@ func main() {
logger.Fatal("failed to create a cluster", zap.Error(err))
}

err = clus.Bootstrap()
err = clus.Send_INITIAL_START_ETCD()
if err != nil {
logger.Fatal("Bootstrap failed", zap.Error(err))
}
defer clus.DestroyEtcdAgents()
defer clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()

logger.Info("wait health after bootstrap")
err = clus.WaitHealth()
@@ -56,5 +56,5 @@ func main() {
logger.Fatal("WaitHealth failed", zap.Error(err))
}

clus.StartTester()
clus.Run()
}
(One file's diff is suppressed because it is too large.)

functional/rpcpb/rpc.proto (new file, 480 lines)
@@ -0,0 +1,480 @@
syntax = "proto3";
package rpcpb;

import "github.com/gogo/protobuf/gogoproto/gogo.proto";

option (gogoproto.marshaler_all) = true;
option (gogoproto.sizer_all) = true;
option (gogoproto.unmarshaler_all) = true;
option (gogoproto.goproto_getters_all) = false;

message Request {
Operation Operation = 1;
// Member contains the same Member object from tester configuration.
Member Member = 2;
// Tester contains tester configuration.
Tester Tester = 3;
}

message Response {
bool Success = 1;
string Status = 2;
// Member contains the same Member object from tester request.
Member Member = 3;
}

service Transport {
rpc Transport(stream Request) returns (stream Response) {}
}

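For orientation, this is roughly how a tester-side caller drives the bidirectional `Transport` stream. A minimal sketch, assuming the standard grpc-go code generated from this file (the `NewTransportClient` constructor and the stream's `Send`/`Recv` methods follow the usual protoc naming); the real tester also fills in `Member` and `Tester` on each request:

```go
package main

import (
	"context"
	"log"

	"github.com/coreos/etcd/functional/rpcpb"
	"google.golang.org/grpc"
)

func main() {
	// the agent listens on the address from the tester configuration,
	// e.g. network "tcp", addr "127.0.0.1:9028" (see functional.yaml)
	conn, err := grpc.Dial("127.0.0.1:9028", grpc.WithInsecure())
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	// open the bidirectional Request/Response stream
	stream, err := rpcpb.NewTransportClient(conn).Transport(context.Background())
	if err != nil {
		log.Fatal(err)
	}

	// ask the agent to SIGQUIT etcd and wipe its data directories
	if err = stream.Send(&rpcpb.Request{
		Operation: rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA,
	}); err != nil {
		log.Fatal(err)
	}

	resp, err := stream.Recv()
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("success=%v status=%q", resp.Success, resp.Status)
}
```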
message Member {
// EtcdExecPath is the executable etcd binary path in agent server.
string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];

// TODO: support embedded etcd

// AgentAddr is the agent HTTP server address.
string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
// FailpointHTTPAddr is the agent's failpoints HTTP server address.
string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];

// BaseDir is the base directory where all logs and etcd data are stored.
string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
// EtcdLogPath is the log file to store current etcd server logs.
string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];

// EtcdClientProxy is true when client traffic needs to be proxied.
// If true, listen client URL port must be different than advertise client URL port.
bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
// EtcdPeerProxy is true when peer traffic needs to be proxied.
// If true, listen peer URL port must be different than advertise peer URL port.
bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];

// EtcdClientEndpoint is the etcd client endpoint.
string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
// Etcd defines etcd binary configuration flags.
Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];

// ClientCertData contains cert file contents from this member's etcd server.
string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
// ClientKeyData contains key file contents from this member's etcd server.
string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
// ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];

// PeerCertData contains cert file contents from this member's etcd server.
string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
// PeerKeyData contains key file contents from this member's etcd server.
string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
// PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
}

message Tester {
string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];

// DelayLatencyMs is the delay latency in milliseconds,
// to inject to simulated slow network.
uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
// DelayLatencyMsRv is the delay latency random variable in milliseconds.
uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
// UpdatedDelayLatencyMs is the updated delay latency in milliseconds,
// to inject to simulated slow network. It's the final latency to apply,
// in case the latency numbers are randomly generated from given delay latency field.
uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

// RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
// ExitOnFailure, if true, exits the tester on the first failure.
bool ExitOnFailure = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
// ConsistencyCheck is true to check consistency (revision, hash).
bool ConsistencyCheck = 23 [(gogoproto.moretags) = "yaml:\"consistency-check\""];
// EnablePprof is true to enable profiler.
bool EnablePprof = 24 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

// FailureDelayMs is the delay duration after failure is injected.
// Useful when triggering snapshot or no-op failure cases.
uint32 FailureDelayMs = 31 [(gogoproto.moretags) = "yaml:\"failure-delay-ms\""];
// FailureShuffle is true to randomize failure injecting order.
bool FailureShuffle = 32 [(gogoproto.moretags) = "yaml:\"failure-shuffle\""];
// FailureCases is the selected test cases to schedule.
// If empty, run all failure cases.
repeated string FailureCases = 33 [(gogoproto.moretags) = "yaml:\"failure-cases\""];
// FailpointCommands is the list of "gofail" commands (e.g. panic("etcd-tester"),1*sleep(1000)).
repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

// RunnerExecPath is a path of etcd-runner binary.
string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
// ExternalExecPath is a path of script for enabling/disabling an external fault injector.
string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

// StressTypes is the list of stresser names:
// keys, lease, nop, election-runner, watch-runner, lock-racer-runner, lease-runner.
repeated string StressTypes = 101 [(gogoproto.moretags) = "yaml:\"stress-types\""];
// StressKeySize is the size of each small key written into etcd.
int32 StressKeySize = 102 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
// StressKeySizeLarge is the size of each large key written into etcd.
int32 StressKeySizeLarge = 103 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
// StressKeySuffixRange is the count of key range written into etcd.
// Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)".
int32 StressKeySuffixRange = 104 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
// StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
// Stress keys are created with "fmt.Sprintf("/k%03d", i)".
int32 StressKeySuffixRangeTxn = 105 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
// StressKeyTxnOps is the number of operations per transaction (max 64).
int32 StressKeyTxnOps = 106 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

// StressClients is the number of concurrent stressing clients
// with "one" shared TCP connection.
int32 StressClients = 201 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
// StressQPS is the maximum number of stresser requests per second.
int32 StressQPS = 202 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}

message Etcd {
string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];

// HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
// Default value is 100, which is 100ms.
int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
// ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
// Default value is 1000, which is 1s.
int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];

repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];

repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];

string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];

int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];

bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
}

enum Operation {
// NOT_STARTED is the agent status before etcd first start.
NOT_STARTED = 0;

// INITIAL_START_ETCD is only called to start etcd, the very first time.
INITIAL_START_ETCD = 10;
// RESTART_ETCD is sent to restart killed etcd.
RESTART_ETCD = 11;

// SIGTERM_ETCD pauses etcd process while keeping data directories
// and previous etcd configurations.
SIGTERM_ETCD = 20;
// SIGQUIT_ETCD_AND_REMOVE_DATA kills etcd process and removes all data
// directories to simulate destroying the whole machine.
SIGQUIT_ETCD_AND_REMOVE_DATA = 21;

// SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when the consistency check fails,
// so the etcd data directories need to be archived.
SIGQUIT_ETCD_AND_ARCHIVE_DATA = 30;
// SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys etcd process,
// etcd data, and agent server.
SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 31;

// BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets
// on the target member's peer port.
BLACKHOLE_PEER_PORT_TX_RX = 100;
// UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
UNBLACKHOLE_PEER_PORT_TX_RX = 101;

// DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets
// on the target member's peer port.
DELAY_PEER_PORT_TX_RX = 200;
// UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
UNDELAY_PEER_PORT_TX_RX = 201;
}

// FailureCase defines various system faults in distributed systems,
// in order to verify correct behavior of etcd servers and clients.
enum FailureCase {
// SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
// but does not delete its data directories on disk for next restart.
// It waits "failure-delay-ms" before recovering this failure.
// The expected behavior is that the follower comes back online
// and rejoins the cluster, and then each member continues to process
// client requests ('Put' request that requires Raft consensus).
SIGTERM_ONE_FOLLOWER = 0;

// SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
// follower but does not delete its data directories on disk for next
// restart. And waits until most up-to-date node (leader) applies the
// snapshot count of entries since the stop operation.
// The expected behavior is that the follower comes back online and
// rejoins the cluster, and then active leader sends snapshot
// to the follower to force it to follow the leader's log.
// As always, after recovery, each member must be able to process
// client requests.
SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;

// SIGTERM_LEADER stops the active leader node but does not delete its
// data directories on disk for next restart. Then it waits
// "failure-delay-ms" before recovering this failure, in order to
// trigger election timeouts.
// The expected behavior is that a new leader gets elected, and the
// old leader comes back online and rejoins the cluster as a follower.
// As always, after recovery, each member must be able to process
// client requests.
SIGTERM_LEADER = 2;

// SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
// but does not delete its data directories on disk for next restart.
// And waits until most up-to-date node ("new" leader) applies the
// snapshot count of entries since the stop operation.
// The expected behavior is that cluster elects a new leader, and the
// old leader comes back online and rejoins the cluster as a follower.
// And it receives the snapshot from the new leader to overwrite its
// store. As always, after recovery, each member must be able to
// process client requests.
SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;

// SIGTERM_QUORUM stops majority number of nodes to make the whole cluster
// inoperable but does not delete data directories on stopped nodes
// for next restart. And it waits "failure-delay-ms" before recovering
// this failure.
// The expected behavior is that nodes come back online, thus cluster
// comes back operative as well. As always, after recovery, each member
// must be able to process client requests.
SIGTERM_QUORUM = 4;

// SIGTERM_ALL stops the whole cluster but does not delete data directories
// on disk for next restart. And it waits "failure-delay-ms" before
// recovering this failure.
// The expected behavior is that nodes come back online, thus cluster
// comes back operative as well. As always, after recovery, each member
// must be able to process client requests.
SIGTERM_ALL = 5;

// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader), and waits for "failure-delay-ms" until recovery.
// The expected behavior is that once dropping operation is undone,
// each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;

// BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
// all outgoing/incoming packets from/to the peer port on a randomly
// chosen follower (non-leader), and waits for most up-to-date node
// (leader) applies the snapshot count of entries since the blackhole
// operation.
// The expected behavior is that once packet drop operation is undone,
// the slow follower tries to catch up, possibly receiving the snapshot
// from the active leader. As always, after recovery, each member must
// be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;

// BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
// from/to the peer port on the active leader (isolated), and waits for
// "failure-delay-ms" until recovery, in order to trigger election timeout.
// The expected behavior is that after election timeout, a new leader gets
// elected, and once dropping operation is undone, the old leader comes
// back and rejoins the cluster as a follower. As always, after recovery,
// each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;

// BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
// outgoing/incoming packets from/to the peer port on the active leader,
// and waits for most up-to-date node (leader) applies the snapshot
// count of entries since the blackhole operation.
// The expected behavior is that cluster elects a new leader, and once
// dropping operation is undone, the old leader comes back and rejoins
// the cluster as a follower. The slow follower tries to catch up, likely
// receiving the snapshot from the new active leader. As always, after
// recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;

// BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
// from/to the peer ports on majority nodes of cluster, thus losing its
// leader and cluster being inoperable. And it waits for "failure-delay-ms"
// until recovery.
// The expected behavior is that once packet drop operation is undone,
// nodes come back online, thus cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;

// BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
// from/to the peer ports on all nodes, thus making cluster totally
// inoperable. It waits for "failure-delay-ms" until recovery.
// The expected behavior is that once packet drop operation is undone,
// nodes come back online, thus cluster comes back operative. As always,
// after recovery, each member must be able to process client requests.
BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;

// DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
// from/to the peer port on a randomly chosen follower (non-leader).
// It waits for "failure-delay-ms" until recovery.
// The expected behavior is that once packet delay operation is undone,
// the follower comes back and tries to catch up with latest changes from
// cluster. And as always, after recovery, each member must be able to
// process client requests.
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;

// RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
// packets from/to the peer port on a randomly chosen follower
// (non-leader) with a randomized time duration (thus isolated). It waits
// for "failure-delay-ms" until recovery.
// The expected behavior is that once packet delay operation is undone,
// each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;

// DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on a randomly chosen
// follower (non-leader), and waits for most up-to-date node (leader)
// applies the snapshot count of entries since the delay operation.
// The expected behavior is that the delayed follower gets isolated
// and behind the current active leader, and once delay operation is undone,
// the slow follower comes back and catches up possibly receiving snapshot
// from the active leader. As always, after recovery, each member must be
// able to process client requests.
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;

// RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on a randomly chosen
// follower (non-leader) with a randomized time duration, and waits for
// most up-to-date node (leader) applies the snapshot count of entries
// since the delay operation.
// The expected behavior is that the delayed follower gets isolated
// and behind the current active leader, and once delay operation is undone,
// the slow follower comes back and catches up, possibly receiving a
// snapshot from the active leader. As always, after recovery, each member
// must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;

// DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
// the peer port on the active leader. And waits for "failure-delay-ms"
// until recovery.
// The expected behavior is that cluster may elect a new leader, and
// once packet delay operation is undone, the (old) leader comes back
// and tries to catch up with latest changes from cluster. As always,
// after recovery, each member must be able to process client requests.
DELAY_PEER_PORT_TX_RX_LEADER = 204;

// RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
// from/to the peer port on the active leader with a randomized time
// duration. And waits for "failure-delay-ms" until recovery.
// The expected behavior is that cluster may elect a new leader, and
// once packet delay operation is undone, the (old) leader comes back
// and tries to catch up with latest changes from cluster. As always,
// after recovery, each member must be able to process client requests.
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;

// DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on the active leader,
// and waits for most up-to-date node (current or new leader) applies the
// snapshot count of entries since the delay operation.
// The expected behavior is that cluster may elect a new leader, and
// the old leader gets isolated and behind the current active leader,
// and once delay operation is undone, the slow follower comes back
// and catches up, likely receiving a snapshot from the active leader.
// As always, after recovery, each member must be able to process client
// requests.
DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;

// RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
// outgoing/incoming packets from/to the peer port on the active leader,
// with a randomized time duration. And it waits for most up-to-date node
// (current or new leader) applies the snapshot count of entries since the
// delay operation.
// The expected behavior is that cluster may elect a new leader, and
// the old leader gets isolated and behind the current active leader,
// and once delay operation is undone, the slow follower comes back
// and catches up, likely receiving a snapshot from the active leader.
// As always, after recovery, each member must be able to process client
// requests.
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;

// DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
// the peer ports on majority nodes of cluster. And it waits for
// "failure-delay-ms" until recovery, likely to trigger election timeouts.
// The expected behavior is that cluster may elect a new leader, while
// quorum of nodes struggle with slow networks, and once delay operation
|
||||
// is undone, nodes come back and cluster comes back operative. As always,
|
||||
// after recovery, each member must be able to process client requests.
|
||||
DELAY_PEER_PORT_TX_RX_QUORUM = 208;
|
||||
|
||||
// RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
|
||||
// from/to the peer ports on majority nodes of cluster, with randomized
|
||||
// time durations. And it waits for "failure-delay-ms" until recovery,
|
||||
// likely to trigger election timeouts.
|
||||
// The expected behavior is that cluster may elect a new leader, while
|
||||
// quorum of nodes struggle with slow networks, and once delay operation
|
||||
// is undone, nodes come back and cluster comes back operative. As always,
|
||||
// after recovery, each member must be able to process client requests.
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;
|
||||
|
||||
// DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
|
||||
// peer ports on all nodes. And it waits for "failure-delay-ms" until
|
||||
// recovery, likely to trigger election timeouts.
|
||||
// The expected behavior is that cluster may become totally inoperable,
|
||||
// struggling with slow networks across the whole cluster. Once delay
|
||||
// operation is undone, nodes come back and cluster comes back operative.
|
||||
// As always, after recovery, each member must be able to process client
|
||||
// requests.
|
||||
DELAY_PEER_PORT_TX_RX_ALL = 210;
|
||||
|
||||
// RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
|
||||
// from/to the peer ports on all nodes, with randomized time durations.
|
||||
// And it waits for "failure-delay-ms" until recovery, likely to trigger
|
||||
// election timeouts.
|
||||
// The expected behavior is that cluster may become totally inoperable,
|
||||
// struggling with slow networks across the whole cluster. Once delay
|
||||
// operation is undone, nodes come back and cluster comes back operative.
|
||||
// As always, after recovery, each member must be able to process client
|
||||
// requests.
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;
|
||||
|
||||
// NO_FAIL_WITH_STRESS runs no-op failure injection that does not do
|
||||
// anything against cluster for "failure-delay-ms" duration, while
|
||||
// stressers are still sending requests.
|
||||
NO_FAIL_WITH_STRESS = 300;
|
||||
|
||||
// NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS runs no-op failure injection
|
||||
// that does not do anything against cluster for "failure-delay-ms"
|
||||
// duration, while all stressers are stopped.
|
||||
NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;
|
||||
|
||||
// FAILPOINTS injects failpoints to etcd server runtime, triggering panics
|
||||
// in critical code paths.
|
||||
FAILPOINTS = 400;
|
||||
|
||||
// EXTERNAL runs external failure injection scripts.
|
||||
EXTERNAL = 500;
|
||||
}
|
||||
|
||||
enum StressType {
|
||||
KV = 0;
|
||||
LEASE = 1;
|
||||
ELECTION_RUNNER = 2;
|
||||
WATCH_RUNNER = 3;
|
||||
LOCK_RACER_RUNNER = 4;
|
||||
LEASE_RUNNER = 5;
|
||||
}
|
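The failure-case and stress-type names above are exactly what a tester configuration file refers to. As a rough illustration only (a hypothetical fragment with guessed YAML keys, not the repository's actual functional.yaml), a config might select cases by name:

tester:
  data-dir: /tmp/etcd-tester-data    # hypothetical key names
  addr: 127.0.0.1:9028
  delay-latency-ms: 5000
  failure-delay-ms: 7000
  failure-shuffle: true
  failure-cases:                     # must match rpcpb.FailureCase names
    - KILL_ONE_FOLLOWER
    - BLACKHOLE_PEER_PORT_TX_RX_QUORUM
    - RANDOM_DELAY_PEER_PORT_TX_RX_ALL
    - NO_FAIL_WITH_STRESS
  stress-types:                      # must match rpcpb.StressType names
    - KV
    - LEASE

Unknown names are rejected at startup: the tester validates every entry against rpcpb.FailureCase_value and rpcpb.StressType_value (see the read function later in this diff).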
@ -8,7 +8,7 @@
COMMENT

if ! [[ "${0}" =~ "scripts/docker-local-agent.sh" ]]; then
echo "must be run from tools/functional-tester"
echo "must be run from functional"
exit 255
fi

@ -1,7 +1,7 @@
#!/usr/bin/env bash

if ! [[ "${0}" =~ "scripts/docker-local-tester.sh" ]]; then
echo "must be run from tools/functional-tester"
echo "must be run from functional"
exit 255
fi

@ -15,4 +15,4 @@ docker run \
--net=host \
--name tester \
gcr.io/etcd-development/etcd-functional-tester:go${GO_VERSION} \
/bin/bash -c "./bin/etcd-tester --config ./local-test.yaml"
/bin/bash -c "./bin/etcd-tester --config ./functional.yaml"
@ -21,7 +21,7 @@ import (

"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
"github.com/coreos/etcd/functional/rpcpb"

"go.uber.org/zap"
"google.golang.org/grpc"
@ -18,23 +18,24 @@ import (
"context"
"errors"
"fmt"
"io"
"io/ioutil"
"math/rand"
"net/http"
"net/url"
"path/filepath"
"strings"
"sync"
"time"

"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/debugutil"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"

"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
"golang.org/x/time/rate"
"google.golang.org/grpc"
yaml "gopkg.in/yaml.v2"
)

// Cluster defines tester cluster.
@ -62,221 +63,6 @@ type Cluster struct {
cs int
}

func newCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
bts, err := ioutil.ReadFile(fpath)
if err != nil {
return nil, err
}
lg.Info("opened configuration file", zap.String("path", fpath))

clus := &Cluster{lg: lg}
if err = yaml.Unmarshal(bts, clus); err != nil {
return nil, err
}

for i, mem := range clus.Members {
if mem.BaseDir == "" {
return nil, fmt.Errorf("Members[i].BaseDir cannot be empty (got %q)", mem.BaseDir)
}
if mem.EtcdLogPath == "" {
return nil, fmt.Errorf("Members[i].EtcdLogPath cannot be empty (got %q)", mem.EtcdLogPath)
}

if mem.Etcd.Name == "" {
return nil, fmt.Errorf("'--name' cannot be empty (got %+v)", mem)
}
if mem.Etcd.DataDir == "" {
return nil, fmt.Errorf("'--data-dir' cannot be empty (got %+v)", mem)
}
if mem.Etcd.SnapshotCount == 0 {
return nil, fmt.Errorf("'--snapshot-count' cannot be 0 (got %+v)", mem.Etcd.SnapshotCount)
}
if mem.Etcd.DataDir == "" {
return nil, fmt.Errorf("'--data-dir' cannot be empty (got %q)", mem.Etcd.DataDir)
}
if mem.Etcd.WALDir == "" {
clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
}

if mem.Etcd.HeartbeatIntervalMs == 0 {
return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
}
if mem.Etcd.ElectionTimeoutMs == 0 {
return nil, fmt.Errorf("'--election-timeout' cannot be 0 (got %+v)", mem.Etcd)
}
if int64(clus.Tester.DelayLatencyMs) <= mem.Etcd.ElectionTimeoutMs {
return nil, fmt.Errorf("delay latency %d ms must be greater than election timeout %d ms", clus.Tester.DelayLatencyMs, mem.Etcd.ElectionTimeoutMs)
}

port := ""
listenClientPorts := make([]string, len(clus.Members))
for i, u := range mem.Etcd.ListenClientURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--listen-client-urls' has invalid URL %q", u)
}
listenClientPorts[i], err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--listen-client-urls' has no port %q", u)
}
}
for i, u := range mem.Etcd.AdvertiseClientURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--advertise-client-urls' has invalid URL %q", u)
}
port, err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--advertise-client-urls' has no port %q", u)
}
if mem.EtcdClientProxy && listenClientPorts[i] == port {
return nil, fmt.Errorf("clus.Members[%d] requires client port proxy, but advertise port %q conflicts with listener port %q", i, port, listenClientPorts[i])
}
}

listenPeerPorts := make([]string, len(clus.Members))
for i, u := range mem.Etcd.ListenPeerURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--listen-peer-urls' has invalid URL %q", u)
}
listenPeerPorts[i], err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--listen-peer-urls' has no port %q", u)
}
}
for j, u := range mem.Etcd.AdvertisePeerURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--initial-advertise-peer-urls' has invalid URL %q", u)
}
port, err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--initial-advertise-peer-urls' has no port %q", u)
}
if mem.EtcdPeerProxy && listenPeerPorts[j] == port {
return nil, fmt.Errorf("clus.Members[%d] requires peer port proxy, but advertise port %q conflicts with listener port %q", i, port, listenPeerPorts[j])
}
}

if !strings.HasPrefix(mem.EtcdLogPath, mem.BaseDir) {
return nil, fmt.Errorf("EtcdLogPath must be prefixed with BaseDir (got %q)", mem.EtcdLogPath)
}
if !strings.HasPrefix(mem.Etcd.DataDir, mem.BaseDir) {
return nil, fmt.Errorf("Etcd.DataDir must be prefixed with BaseDir (got %q)", mem.Etcd.DataDir)
}

// TODO: support separate WALDir that can be handled via failure-archive
if !strings.HasPrefix(mem.Etcd.WALDir, mem.BaseDir) {
return nil, fmt.Errorf("Etcd.WALDir must be prefixed with BaseDir (got %q)", mem.Etcd.WALDir)
}

// TODO: only support generated certs with TLS generator
// deprecate auto TLS
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientCertAuth {
return nil, fmt.Errorf("Etcd.ClientAutoTLS and Etcd.ClientCertAuth are both 'true'")
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientCertFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile == "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientKeyFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerClientCertAuth {
return nil, fmt.Errorf("Etcd.PeerAutoTLS and Etcd.PeerClientCertAuth are both 'true'")
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerCertFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile == "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerKeyFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
}

if mem.Etcd.ClientAutoTLS || mem.Etcd.ClientCertFile != "" {
for _, cu := range mem.Etcd.ListenClientURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
}
}
for _, cu := range mem.Etcd.AdvertiseClientURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
}
}
}
if mem.Etcd.PeerAutoTLS || mem.Etcd.PeerCertFile != "" {
for _, cu := range mem.Etcd.ListenPeerURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
}
}
for _, cu := range mem.Etcd.AdvertisePeerURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
}
}
}
}

if len(clus.Tester.FailureCases) == 0 {
return nil, errors.New("FailureCases not found")
}
if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
}
if clus.Tester.UpdatedDelayLatencyMs == 0 {
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
}

for _, v := range clus.Tester.FailureCases {
if _, ok := rpcpb.FailureCase_value[v]; !ok {
return nil, fmt.Errorf("%q is not defined in 'rpcpb.FailureCase_value'", v)
}
}

for _, v := range clus.Tester.StressTypes {
if _, ok := rpcpb.StressType_value[v]; !ok {
return nil, fmt.Errorf("StressType is unknown; got %q", v)
}
}
if clus.Tester.StressKeySuffixRangeTxn > 100 {
return nil, fmt.Errorf("StressKeySuffixRangeTxn maximum value is 100, got %v", clus.Tester.StressKeySuffixRangeTxn)
}
if clus.Tester.StressKeyTxnOps > 64 {
return nil, fmt.Errorf("StressKeyTxnOps maximum value is 64, got %v", clus.Tester.StressKeyTxnOps)
}

return clus, err
}

var dialOpts = []grpc.DialOption{
grpc.WithInsecure(),
grpc.WithTimeout(5 * time.Second),
@ -285,7 +71,7 @@ var dialOpts = []grpc.DialOption{

// NewCluster creates a client from a tester configuration.
func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
clus, err := newCluster(lg, fpath)
clus, err := read(lg, fpath)
if err != nil {
return nil, err
}
@ -320,7 +106,7 @@ func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
}
}
clus.testerHTTPServer = &http.Server{
Addr: clus.Tester.TesterAddr,
Addr: clus.Tester.Addr,
Handler: mux,
}
go clus.serveTesterServer()
@ -340,12 +126,12 @@ func NewCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
func (clus *Cluster) serveTesterServer() {
clus.lg.Info(
"started tester HTTP server",
zap.String("tester-address", clus.Tester.TesterAddr),
zap.String("tester-address", clus.Tester.Addr),
)
err := clus.testerHTTPServer.ListenAndServe()
clus.lg.Info(
"tester HTTP server returned",
zap.String("tester-address", clus.Tester.TesterAddr),
zap.String("tester-address", clus.Tester.Addr),
zap.Error(err),
)
if err != nil && err != http.ErrServerClosed {
@ -356,70 +142,98 @@ func (clus *Cluster) serveTesterServer() {
func (clus *Cluster) updateFailures() {
for _, cs := range clus.Tester.FailureCases {
switch cs {
case "KILL_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureKillOneFollower())
case "KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureKillOneFollowerUntilTriggerSnapshot())
case "KILL_LEADER":
clus.failures = append(clus.failures, newFailureKillLeader())
case "KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureKillLeaderUntilTriggerSnapshot())
case "KILL_QUORUM":
clus.failures = append(clus.failures, newFailureKillQuorum())
case "KILL_ALL":
clus.failures = append(clus.failures, newFailureKillAll())
case "SIGTERM_ONE_FOLLOWER":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ONE_FOLLOWER(clus))
case "SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGTERM_LEADER":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_LEADER(clus))
case "SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus))
case "SIGTERM_QUORUM":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_QUORUM(clus))
case "SIGTERM_ALL":
clus.failures = append(clus.failures,
new_FailureCase_SIGTERM_ALL(clus))

case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollower(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot())
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT())
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeader(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot())
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT())
case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxQuorum(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus))
case "BLACKHOLE_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxAll(clus))
clus.failures = append(clus.failures,
new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus))

case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus, true))
case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER(clus, true))
case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus, true))
case "DELAY_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM(clus, true))
case "DELAY_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, false))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ALL(clus, false))
case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, true))
clus.failures = append(clus.failures,
new_FailureCase_DELAY_PEER_PORT_TX_RX_ALL(clus, true))

case "NO_FAIL_WITH_STRESS":
clus.failures = append(clus.failures, newFailureNoFailWithStress(clus))
clus.failures = append(clus.failures,
new_FailureCase_NO_FAIL_WITH_STRESS(clus))
case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS":
clus.failures = append(clus.failures, newFailureNoFailWithNoStressForLiveness(clus))
clus.failures = append(clus.failures,
new_FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus))

case "EXTERNAL":
clus.failures = append(clus.failures, newFailureExternal(clus.Tester.ExternalExecPath))
clus.failures = append(clus.failures,
new_FailureCase_EXTERNAL(clus.Tester.ExternalExecPath))
case "FAILPOINTS":
fpFailures, fperr := failpointFailures(clus)
if len(fpFailures) == 0 {
clus.lg.Info("no failpoints found!", zap.Error(fperr))
}
clus.failures = append(clus.failures, fpFailures...)
clus.failures = append(clus.failures,
fpFailures...)
}
}
}
@ -444,48 +258,6 @@ func (clus *Cluster) UpdateDelayLatencyMs() {
}
}

func (clus *Cluster) shuffleFailures() {
rand.Seed(time.Now().UnixNano())
offset := rand.Intn(1000)
n := len(clus.failures)
cp := coprime(n)

fs := make([]Failure, n)
for i := 0; i < n; i++ {
fs[i] = clus.failures[(cp*i+offset)%n]
}
clus.failures = fs
clus.lg.Info("shuffled test failure cases", zap.Int("total", n))
}

/*
x and y with GCD 1 are coprime to each other.

x1 = (coprime of n * idx1 + offset) % n
x2 = (coprime of n * idx2 + offset) % n
x1 == x2 implies (coprime of n) * (idx1 - idx2) ≡ 0 (mod n);
since gcd(coprime of n, n) == 1, that forces idx1 == idx2.

Distinct indices are therefore guaranteed to map to distinct positions.
*/
func coprime(n int) int {
coprime := 1
for i := n / 2; i < n; i++ {
if gcd(i, n) == 1 {
coprime = i
break
}
}
return coprime
}

func gcd(x, y int) int {
if y == 0 {
return x
}
return gcd(y, x%y)
}

func (clus *Cluster) updateStresserChecker() {
cs := &compositeStresser{}
for _, m := range clus.Members {
@ -502,11 +274,7 @@ func (clus *Cluster) updateStresserChecker() {
clus.checker = newNoChecker()
}

clus.lg.Info(
"updated stressers",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
)
clus.lg.Info("updated stressers")
}

func (clus *Cluster) checkConsistency() (err error) {
@ -542,45 +310,74 @@ func (clus *Cluster) checkConsistency() (err error) {
return err
}

// Bootstrap bootstraps etcd cluster the very first time.
// Send_INITIAL_START_ETCD bootstraps etcd cluster the very first time.
// After this, just continue to call kill/restart.
func (clus *Cluster) Bootstrap() error {
func (clus *Cluster) Send_INITIAL_START_ETCD() error {
// this is the only time that creates request from scratch
return clus.broadcastOperation(rpcpb.Operation_InitialStartEtcd)
return clus.broadcast(rpcpb.Operation_INITIAL_START_ETCD)
}

// FailArchive sends "FailArchive" operation.
func (clus *Cluster) FailArchive() error {
return clus.broadcastOperation(rpcpb.Operation_FailArchive)
// send_SIGQUIT_ETCD_AND_ARCHIVE_DATA sends "SIGQUIT_ETCD_AND_ARCHIVE_DATA" operation.
func (clus *Cluster) send_SIGQUIT_ETCD_AND_ARCHIVE_DATA() error {
return clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_ARCHIVE_DATA)
}

// Restart sends "Restart" operation.
func (clus *Cluster) Restart() error {
return clus.broadcastOperation(rpcpb.Operation_RestartEtcd)
// send_RESTART_ETCD sends restart operation.
func (clus *Cluster) send_RESTART_ETCD() error {
return clus.broadcast(rpcpb.Operation_RESTART_ETCD)
}

func (clus *Cluster) broadcastOperation(op rpcpb.Operation) error {
func (clus *Cluster) broadcast(op rpcpb.Operation) error {
var wg sync.WaitGroup
wg.Add(len(clus.agentStreams))

errc := make(chan error, len(clus.agentStreams))
for i := range clus.agentStreams {
err := clus.sendOperation(i, op)
go func(idx int, o rpcpb.Operation) {
defer wg.Done()
errc <- clus.sendOp(idx, o)
}(i, op)
}
wg.Wait()
close(errc)

errs := []string{}
for err := range errc {
if err == nil {
continue
}

if err != nil {
if op == rpcpb.Operation_DestroyEtcdAgent &&
strings.Contains(err.Error(), "rpc error: code = Unavailable desc = transport is closing") {
// agent server has already closed;
// so this error is expected
clus.lg.Info(
"successfully destroyed",
zap.String("member", clus.Members[i].EtcdClientEndpoint),
)
continue
destroyed := false
if op == rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT {
if err == io.EOF {
destroyed = true
}
if strings.Contains(err.Error(),
"rpc error: code = Unavailable desc = transport is closing") {
// agent server has already closed;
// so this error is expected
destroyed = true
}
if strings.Contains(err.Error(),
"desc = os: process already finished") {
destroyed = true
}
}
if !destroyed {
errs = append(errs, err.Error())
}
return err
}
}
return nil

if len(errs) == 0 {
return nil
}
return errors.New(strings.Join(errs, ", "))
}
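The rewritten broadcast above fans one operation out to every agent stream concurrently and gathers per-agent errors through a buffered channel, so no goroutine can block on send. A minimal self-contained sketch of the same fan-out shape (hypothetical send function and target names, not the tester's actual types):

package main

import (
	"errors"
	"fmt"
	"strings"
	"sync"
)

// broadcast sends to every target concurrently and joins the failures,
// mirroring the WaitGroup + buffered-error-channel pattern used above.
func broadcast(targets []string, send func(string) error) error {
	var wg sync.WaitGroup
	wg.Add(len(targets))
	errc := make(chan error, len(targets)) // buffered: senders never block
	for _, t := range targets {
		go func(t string) {
			defer wg.Done()
			errc <- send(t)
		}(t)
	}
	wg.Wait()
	close(errc)

	var errs []string
	for err := range errc {
		if err != nil {
			errs = append(errs, err.Error())
		}
	}
	if len(errs) == 0 {
		return nil
	}
	return errors.New(strings.Join(errs, ", "))
}

func main() {
	err := broadcast([]string{"agent-1", "agent-2"}, func(t string) error {
		fmt.Println("sending to", t)
		return nil
	})
	fmt.Println("broadcast error:", err)
}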

func (clus *Cluster) sendOperation(idx int, op rpcpb.Operation) error {
if op == rpcpb.Operation_InitialStartEtcd {
func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
if op == rpcpb.Operation_INITIAL_START_ETCD {
clus.agentRequests[idx] = &rpcpb.Request{
Operation: op,
Member: clus.Members[idx],
@ -639,9 +436,9 @@ func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
}

// store TLS assets from agents/servers onto disk
if secure && (op == rpcpb.Operation_InitialStartEtcd || op == rpcpb.Operation_RestartEtcd) {
if secure && (op == rpcpb.Operation_INITIAL_START_ETCD || op == rpcpb.Operation_RESTART_ETCD) {
dirClient := filepath.Join(
clus.Tester.TesterDataDir,
clus.Tester.DataDir,
clus.Members[idx].Etcd.Name,
"fixtures",
"client",
@ -699,9 +496,9 @@ func (clus *Cluster) sendOp(idx int, op rpcpb.Operation) error {
return nil
}

// DestroyEtcdAgents terminates all tester connections to agents and etcd servers.
func (clus *Cluster) DestroyEtcdAgents() {
err := clus.broadcastOperation(rpcpb.Operation_DestroyEtcdAgent)
// Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT terminates all tester connections to agents and etcd servers.
func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() {
err := clus.broadcast(rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT)
if err != nil {
clus.lg.Warn("destroying etcd/agents FAIL", zap.Error(err))
} else {
@ -717,7 +514,7 @@ func (clus *Cluster) Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT() {
ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
err := clus.testerHTTPServer.Shutdown(ctx)
cancel()
clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.TesterAddr), zap.Error(err))
clus.lg.Info("closed tester HTTP server", zap.String("tester-address", clus.Tester.Addr), zap.Error(err))
}
}

@ -886,6 +683,7 @@ func (clus *Cluster) defrag() error {
"defrag ALL PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
return nil
}
346 functional/tester/cluster_read_config.go Normal file
@ -0,0 +1,346 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tester

import (
"errors"
"fmt"
"io/ioutil"
"net/url"
"path/filepath"
"strings"

"github.com/coreos/etcd/functional/rpcpb"

"go.uber.org/zap"
yaml "gopkg.in/yaml.v2"
)

func read(lg *zap.Logger, fpath string) (*Cluster, error) {
bts, err := ioutil.ReadFile(fpath)
if err != nil {
return nil, err
}
lg.Info("opened configuration file", zap.String("path", fpath))

clus := &Cluster{lg: lg}
if err = yaml.Unmarshal(bts, clus); err != nil {
return nil, err
}

if len(clus.Members) < 3 {
return nil, fmt.Errorf("len(clus.Members) expects at least 3, got %d", len(clus.Members))
}

for i, mem := range clus.Members {
if mem.BaseDir == "" {
return nil, fmt.Errorf("BaseDir cannot be empty (got %q)", mem.BaseDir)
}
if mem.EtcdLogPath == "" {
return nil, fmt.Errorf("EtcdLogPath cannot be empty (got %q)", mem.EtcdLogPath)
}

if mem.Etcd.Name == "" {
return nil, fmt.Errorf("'--name' cannot be empty (got %+v)", mem)
}
if mem.Etcd.DataDir == "" {
return nil, fmt.Errorf("'--data-dir' cannot be empty (got %+v)", mem)
}
if mem.Etcd.SnapshotCount == 0 {
return nil, fmt.Errorf("'--snapshot-count' cannot be 0 (got %+v)", mem.Etcd.SnapshotCount)
}
if mem.Etcd.DataDir == "" {
return nil, fmt.Errorf("'--data-dir' cannot be empty (got %q)", mem.Etcd.DataDir)
}
if mem.Etcd.WALDir == "" {
clus.Members[i].Etcd.WALDir = filepath.Join(mem.Etcd.DataDir, "member", "wal")
}

if mem.Etcd.HeartbeatIntervalMs == 0 {
return nil, fmt.Errorf("'--heartbeat-interval' cannot be 0 (got %+v)", mem.Etcd)
}
if mem.Etcd.ElectionTimeoutMs == 0 {
return nil, fmt.Errorf("'--election-timeout' cannot be 0 (got %+v)", mem.Etcd)
}
if int64(clus.Tester.DelayLatencyMs) <= mem.Etcd.ElectionTimeoutMs {
return nil, fmt.Errorf("delay latency %d ms must be greater than election timeout %d ms", clus.Tester.DelayLatencyMs, mem.Etcd.ElectionTimeoutMs)
}

port := ""
listenClientPorts := make([]string, len(clus.Members))
for i, u := range mem.Etcd.ListenClientURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--listen-client-urls' has invalid URL %q", u)
}
listenClientPorts[i], err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--listen-client-urls' has no port %q", u)
}
}
for i, u := range mem.Etcd.AdvertiseClientURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--advertise-client-urls' has invalid URL %q", u)
}
port, err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--advertise-client-urls' has no port %q", u)
}
if mem.EtcdClientProxy && listenClientPorts[i] == port {
return nil, fmt.Errorf("clus.Members[%d] requires client port proxy, but advertise port %q conflicts with listener port %q", i, port, listenClientPorts[i])
}
}

listenPeerPorts := make([]string, len(clus.Members))
for i, u := range mem.Etcd.ListenPeerURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--listen-peer-urls' has invalid URL %q", u)
}
listenPeerPorts[i], err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--listen-peer-urls' has no port %q", u)
}
}
for j, u := range mem.Etcd.AdvertisePeerURLs {
if !isValidURL(u) {
return nil, fmt.Errorf("'--initial-advertise-peer-urls' has invalid URL %q", u)
}
port, err = getPort(u)
if err != nil {
return nil, fmt.Errorf("'--initial-advertise-peer-urls' has no port %q", u)
}
if mem.EtcdPeerProxy && listenPeerPorts[j] == port {
return nil, fmt.Errorf("clus.Members[%d] requires peer port proxy, but advertise port %q conflicts with listener port %q", i, port, listenPeerPorts[j])
}
}

if !strings.HasPrefix(mem.EtcdLogPath, mem.BaseDir) {
return nil, fmt.Errorf("EtcdLogPath must be prefixed with BaseDir (got %q)", mem.EtcdLogPath)
}
if !strings.HasPrefix(mem.Etcd.DataDir, mem.BaseDir) {
return nil, fmt.Errorf("Etcd.DataDir must be prefixed with BaseDir (got %q)", mem.Etcd.DataDir)
}

// TODO: support separate WALDir that can be handled via failure-archive
if !strings.HasPrefix(mem.Etcd.WALDir, mem.BaseDir) {
return nil, fmt.Errorf("Etcd.WALDir must be prefixed with BaseDir (got %q)", mem.Etcd.WALDir)
}

// TODO: only support generated certs with TLS generator
// deprecate auto TLS
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerCertFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerKeyFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
}
if mem.Etcd.PeerAutoTLS && mem.Etcd.PeerTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.PeerAutoTLS 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientCertFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientKeyFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
}
if mem.Etcd.ClientAutoTLS && mem.Etcd.ClientTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.ClientAutoTLS 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
}

if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile == "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerKeyFile == "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
}
// only support self-signed certs
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerTrustedCAFile == "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'true', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
}
if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile != "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerCertFile is %q", mem.Etcd.PeerCertFile)
}
if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerKeyFile != "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerKeyFile is %q", mem.Etcd.PeerKeyFile)
}
if !mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth 'false', but Etcd.PeerTrustedCAFile is %q", mem.Etcd.PeerTrustedCAFile)
}
if mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerAutoTLS {
return nil, fmt.Errorf("Etcd.PeerClientCertAuth and Etcd.PeerAutoTLS cannot be both 'true'")
}
if (mem.Etcd.PeerCertFile == "") != (mem.Etcd.PeerKeyFile == "") {
return nil, fmt.Errorf("Both Etcd.PeerCertFile %q and Etcd.PeerKeyFile %q must be either empty or non-empty", mem.Etcd.PeerCertFile, mem.Etcd.PeerKeyFile)
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientAutoTLS {
return nil, fmt.Errorf("Etcd.ClientCertAuth and Etcd.ClientAutoTLS cannot be both 'true'")
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile == "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientKeyFile == "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
}
if mem.Etcd.ClientCertAuth && mem.Etcd.ClientTrustedCAFile == "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'true', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
}
if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile != "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientCertFile is %q", mem.Etcd.ClientCertFile)
}
if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientKeyFile != "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientKeyFile is %q", mem.Etcd.ClientKeyFile)
}
if !mem.Etcd.ClientCertAuth && mem.Etcd.ClientTrustedCAFile != "" {
return nil, fmt.Errorf("Etcd.ClientCertAuth 'false', but Etcd.ClientTrustedCAFile is %q", mem.Etcd.ClientTrustedCAFile)
}
if (mem.Etcd.ClientCertFile == "") != (mem.Etcd.ClientKeyFile == "") {
return nil, fmt.Errorf("Both Etcd.ClientCertFile %q and Etcd.ClientKeyFile %q must be either empty or non-empty", mem.Etcd.ClientCertFile, mem.Etcd.ClientKeyFile)
}

peerTLS := mem.Etcd.PeerAutoTLS ||
(mem.Etcd.PeerClientCertAuth && mem.Etcd.PeerCertFile != "" && mem.Etcd.PeerKeyFile != "" && mem.Etcd.PeerTrustedCAFile != "")
if peerTLS {
for _, cu := range mem.Etcd.ListenPeerURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
}
}
for _, cu := range mem.Etcd.AdvertisePeerURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("peer TLS is enabled with wrong scheme %q", cu)
}
}
clus.Members[i].PeerCertPath = mem.Etcd.PeerCertFile
if mem.Etcd.PeerCertFile != "" {
var data []byte
data, err = ioutil.ReadFile(mem.Etcd.PeerCertFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerCertFile, err)
}
clus.Members[i].PeerCertData = string(data)
}
clus.Members[i].PeerKeyPath = mem.Etcd.PeerKeyFile
if mem.Etcd.PeerKeyFile != "" {
var data []byte
data, err = ioutil.ReadFile(mem.Etcd.PeerKeyFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerKeyFile, err)
}
clus.Members[i].PeerKeyData = string(data)
}
clus.Members[i].PeerTrustedCAPath = mem.Etcd.PeerTrustedCAFile
if mem.Etcd.PeerTrustedCAFile != "" {
var data []byte
data, err = ioutil.ReadFile(mem.Etcd.PeerTrustedCAFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.PeerTrustedCAFile, err)
}
clus.Members[i].PeerTrustedCAData = string(data)
}
}

clientTLS := mem.Etcd.ClientAutoTLS ||
(mem.Etcd.ClientCertAuth && mem.Etcd.ClientCertFile != "" && mem.Etcd.ClientKeyFile != "" && mem.Etcd.ClientTrustedCAFile != "")
if clientTLS {
for _, cu := range mem.Etcd.ListenClientURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
}
}
for _, cu := range mem.Etcd.AdvertiseClientURLs {
var u *url.URL
u, err = url.Parse(cu)
if err != nil {
return nil, err
}
if u.Scheme != "https" { // TODO: support unix
return nil, fmt.Errorf("client TLS is enabled with wrong scheme %q", cu)
}
}
clus.Members[i].ClientCertPath = mem.Etcd.ClientCertFile
if mem.Etcd.ClientCertFile != "" {
var data []byte
data, err = ioutil.ReadFile(mem.Etcd.ClientCertFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientCertFile, err)
}
clus.Members[i].ClientCertData = string(data)
}
clus.Members[i].ClientKeyPath = mem.Etcd.ClientKeyFile
if mem.Etcd.ClientKeyFile != "" {
var data []byte
data, err = ioutil.ReadFile(mem.Etcd.ClientKeyFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientKeyFile, err)
}
clus.Members[i].ClientKeyData = string(data)
}
clus.Members[i].ClientTrustedCAPath = mem.Etcd.ClientTrustedCAFile
if mem.Etcd.ClientTrustedCAFile != "" {
var data []byte
data, err = ioutil.ReadFile(mem.Etcd.ClientTrustedCAFile)
if err != nil {
return nil, fmt.Errorf("failed to read %q (%v)", mem.Etcd.ClientTrustedCAFile, err)
}
clus.Members[i].ClientTrustedCAData = string(data)
}
}
}

if len(clus.Tester.FailureCases) == 0 {
return nil, errors.New("FailureCases not found")
}
if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
}
if clus.Tester.UpdatedDelayLatencyMs == 0 {
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
}

for _, v := range clus.Tester.FailureCases {
if _, ok := rpcpb.FailureCase_value[v]; !ok {
return nil, fmt.Errorf("%q is not defined in 'rpcpb.FailureCase_value'", v)
}
}

for _, v := range clus.Tester.StressTypes {
if _, ok := rpcpb.StressType_value[v]; !ok {
return nil, fmt.Errorf("StressType is unknown; got %q", v)
}
}

if clus.Tester.StressKeySuffixRangeTxn > 100 {
return nil, fmt.Errorf("StressKeySuffixRangeTxn maximum value is 100, got %v", clus.Tester.StressKeySuffixRangeTxn)
}
if clus.Tester.StressKeyTxnOps > 64 {
return nil, fmt.Errorf("StressKeyTxnOps maximum value is 64, got %v", clus.Tester.StressKeyTxnOps)
}

return clus, err
}
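One detail worth noting in read above: the checks of the form (CertFile == "") != (KeyFile == "") use boolean inequality as an XOR, rejecting configs where exactly one of a cert/key pair is set. A tiny standalone sketch of the idiom (hypothetical helper name, not part of the tester):

package main

import "fmt"

// bothOrNeither reports whether two paths are either both empty or both
// set, the invariant the cert/key checks above enforce.
func bothOrNeither(certFile, keyFile string) bool {
	return (certFile == "") == (keyFile == "")
}

func main() {
	fmt.Println(bothOrNeither("server.crt", "server.key")) // true
	fmt.Println(bothOrNeither("server.crt", ""))           // false
}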
@ -19,8 +19,8 @@ import (
"os"
"time"

"github.com/coreos/etcd/functional/rpcpb"
"github.com/coreos/etcd/pkg/fileutil"
"github.com/coreos/etcd/tools/functional-tester/rpcpb"

"go.uber.org/zap"
)
@ -29,12 +29,14 @@ import (
// Previous tests showed etcd can compact about 60,000 entries per second.
const compactQPS = 50000

// StartTester starts tester.
func (clus *Cluster) StartTester() {
if err := fileutil.TouchDirAll(clus.Tester.TesterDataDir); err != nil {
// Run starts tester.
func (clus *Cluster) Run() {
defer printReport()

if err := fileutil.TouchDirAll(clus.Tester.DataDir); err != nil {
clus.lg.Panic(
"failed to create test data directory",
zap.String("dir", clus.Tester.TesterDataDir),
zap.String("dir", clus.Tester.DataDir),
zap.Error(err),
)
}
@ -49,6 +51,7 @@ func (clus *Cluster) Run() {
"round FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
if clus.cleanup() != nil {
@ -72,6 +75,7 @@ func (clus *Cluster) Run() {
"compact START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Duration("timeout", timeout),
)
if err := clus.compact(revToCompact, timeout); err != nil {
@ -79,6 +83,7 @@ func (clus *Cluster) Run() {
"compact FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
if err = clus.cleanup(); err != nil {
@ -86,6 +91,7 @@ func (clus *Cluster) Run() {
"cleanup FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
return
@ -105,6 +111,7 @@ func (clus *Cluster) Run() {
"functional-tester PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
}

@ -117,12 +124,14 @@ func (clus *Cluster) doRound() error {
clus.lg.Info(
"round START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Strings("failures", clus.failureStrings()),
zap.Int("total-failures", len(clus.failures)),
)
for i, fa := range clus.failures {
clus.cs = i

caseTotal[fa.Desc()]++
caseTotalCounter.WithLabelValues(fa.Desc()).Inc()

caseNow := time.Now()
@ -130,8 +139,8 @@ func (clus *Cluster) doRound() error {
"case START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
zap.Int("total-failures", len(clus.failures)),
)

clus.lg.Info("wait health before injecting failures")
@ -143,9 +152,10 @@ func (clus *Cluster) doRound() error {
fcase := fa.FailureCase()
if fcase != rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS {
clus.lg.Info(
"stresser START",
"stress START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := clus.stresser.Stress(); err != nil {
@ -158,6 +168,7 @@ func (clus *Cluster) doRound() error {
"inject START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := fa.Inject(clus); err != nil {
@ -171,6 +182,7 @@ func (clus *Cluster) doRound() error {
"recover START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := fa.Recover(clus); err != nil {
@ -178,7 +190,13 @@ func (clus *Cluster) doRound() error {
}

if stressStarted {
clus.lg.Info("stresser PAUSE")
clus.lg.Info(
"stress PAUSE",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
ems := clus.stresser.Pause()
if fcase == rpcpb.FailureCase_NO_FAIL_WITH_STRESS && len(ems) > 0 {
ess := make([]string, 0, len(ems))
@ -201,12 +219,24 @@ func (clus *Cluster) doRound() error {
}
}

clus.lg.Info("health check START")
clus.lg.Info(
"health check START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := clus.WaitHealth(); err != nil {
return fmt.Errorf("wait full health error: %v", err)
}

clus.lg.Info("consistency check START")
clus.lg.Info(
"consistency check START",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
)
if err := clus.checkConsistency(); err != nil {
return fmt.Errorf("consistency check error (%v)", err)
}
@ -215,8 +245,8 @@ func (clus *Cluster) doRound() error {
"case PASS",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.String("desc", fa.Desc()),
zap.Int("total-failures", len(clus.failures)),
zap.Duration("took", time.Since(caseNow)),
)
}
@ -225,7 +255,7 @@ func (clus *Cluster) doRound() error {
"round ALL PASS",
zap.Int("round", clus.rd),
zap.Strings("failures", clus.failureStrings()),
zap.Int("total-failures", len(clus.failures)),
zap.Int("case-total", len(clus.failures)),
zap.Duration("took", time.Since(roundNow)),
)
return nil
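Taken together, the log statements above trace a fixed per-case sequence: wait for health, start stressers, inject the failure, recover it, pause stressers, then re-check health and consistency. A schematic of that round loop, with hypothetical stand-in types rather than the tester's real ones:

package main

import "fmt"

// Failure is a minimal stand-in for the tester's failure cases.
type Failure interface {
	Inject() error
	Recover() error
}

type cluster struct{}

func (c *cluster) WaitHealth() error       { return nil }
func (c *cluster) StressStart() error      { return nil }
func (c *cluster) StressPause()            {}
func (c *cluster) CheckConsistency() error { return nil }

// doRound mirrors the per-case flow traced by the log lines above:
// health -> stress -> inject -> recover -> pause -> health -> consistency.
func doRound(c *cluster, failures []Failure) error {
	for _, fa := range failures {
		if err := c.WaitHealth(); err != nil {
			return fmt.Errorf("wait health before inject: %v", err)
		}
		if err := c.StressStart(); err != nil {
			return err
		}
		if err := fa.Inject(); err != nil {
			return err
		}
		if err := fa.Recover(); err != nil {
			return err
		}
		c.StressPause()
		if err := c.WaitHealth(); err != nil {
			return err
		}
		if err := c.CheckConsistency(); err != nil {
			return err
		}
	}
	return nil
}

func main() { fmt.Println(doRound(&cluster{}, nil)) }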
@ -280,21 +310,21 @@ func (clus *Cluster) compact(rev int64, timeout time.Duration) (err error) {
}

func (clus *Cluster) failed() {
if !clus.Tester.ExitOnFailure {
return
}

clus.lg.Info(
"functional-tester FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
clus.DestroyEtcdAgents()
clus.Send_SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT()

os.Exit(2)
}

func (clus *Cluster) cleanup() error {
defer clus.failed()
if clus.Tester.ExitOnFailure {
defer clus.failed()
}

roundFailedTotalCounter.Inc()
desc := "compact/defrag"
@ -307,23 +337,26 @@ func (clus *Cluster) cleanup() error {
"closing stressers before archiving failure data",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
)
clus.stresser.Close()

if err := clus.FailArchive(); err != nil {
if err := clus.send_SIGQUIT_ETCD_AND_ARCHIVE_DATA(); err != nil {
clus.lg.Warn(
"cleanup FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
return err
}
if err := clus.Restart(); err != nil {
if err := clus.send_RESTART_ETCD(); err != nil {
clus.lg.Warn(
"restart FAIL",
zap.Int("round", clus.rd),
zap.Int("case", clus.cs),
zap.Int("case-total", len(clus.failures)),
zap.Error(err),
)
return err
64
functional/tester/cluster_shuffle.go
Normal file
@ -0,0 +1,64 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tester

import (
	"math/rand"
	"time"

	"go.uber.org/zap"
)

func (clus *Cluster) shuffleFailures() {
	rand.Seed(time.Now().UnixNano())
	offset := rand.Intn(1000)
	n := len(clus.failures)
	cp := coprime(n)

	fs := make([]Failure, n)
	for i := 0; i < n; i++ {
		fs[i] = clus.failures[(cp*i+offset)%n]
	}
	clus.failures = fs
	clus.lg.Info("shuffled test failure cases", zap.Int("total", n))
}

/*
x and y with gcd(x, y) = 1 are coprime to each other.

	x1 = (cp*i1 + offset) % n
	x2 = (cp*i2 + offset) % n
	x2 - x1 ≡ cp * (i2 - i1) (mod n)

Since gcd(cp, n) = 1, the right-hand side is a multiple of n only when
i2 ≡ i1 (mod n), so the n indices map to n distinct values; in particular,
consecutive x's are guaranteed to be distinct.
*/
func coprime(n int) int {
	coprime := 1
	for i := n / 2; i < n; i++ {
		if gcd(i, n) == 1 {
			coprime = i
			break
		}
	}
	return coprime
}

func gcd(x, y int) int {
	if y == 0 {
		return x
	}
	return gcd(y, x%y)
}
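The stride trick above is worth spelling out: because the multiplier is coprime to n, the map i -> (cp*i + offset) % n permutes {0, ..., n-1}, so a shuffled round still runs every failure case exactly once. A minimal standalone sketch verifying that property (the sizes below are invented examples, not values from the tester):

```go
package main

import "fmt"

// gcd returns the greatest common divisor of x and y.
func gcd(x, y int) int {
	if y == 0 {
		return x
	}
	return gcd(y, x%y)
}

func main() {
	// Hypothetical sizes standing in for len(clus.failures) and the random offset.
	n, offset := 22, 7

	// Pick a stride coprime to n, as the tester's coprime(n) does.
	cp := 1
	for i := n / 2; i < n; i++ {
		if gcd(i, n) == 1 {
			cp = i
			break
		}
	}

	// Because gcd(cp, n) == 1, i -> (cp*i+offset) % n is a bijection on
	// {0, ..., n-1}: every failure case index appears exactly once.
	seen := make(map[int]bool, n)
	for i := 0; i < n; i++ {
		seen[(cp*i+offset)%n] = true
	}
	fmt.Println("distinct indices:", len(seen), "of", n) // prints "distinct indices: 22 of 22"
}
```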
|
@ -19,12 +19,12 @@ import (
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
func Test_newCluster(t *testing.T) {
|
||||
func Test_read(t *testing.T) {
|
||||
exp := &Cluster{
|
||||
Members: []*rpcpb.Member{
|
||||
{
|
||||
@ -143,9 +143,9 @@ func Test_newCluster(t *testing.T) {
|
||||
},
|
||||
},
|
||||
Tester: &rpcpb.Tester{
|
||||
TesterDataDir: "/tmp/etcd-tester-data",
|
||||
TesterNetwork: "tcp",
|
||||
TesterAddr: "127.0.0.1:9028",
|
||||
DataDir: "/tmp/etcd-tester-data",
|
||||
Network: "tcp",
|
||||
Addr: "127.0.0.1:9028",
|
||||
DelayLatencyMs: 5000,
|
||||
DelayLatencyMsRv: 500,
|
||||
UpdatedDelayLatencyMs: 5000,
|
||||
@ -153,13 +153,15 @@ func Test_newCluster(t *testing.T) {
|
||||
ExitOnFailure: true,
|
||||
ConsistencyCheck: true,
|
||||
EnablePprof: true,
|
||||
FailureDelayMs: 7000,
|
||||
FailureShuffle: true,
|
||||
FailureCases: []string{
|
||||
"KILL_ONE_FOLLOWER",
|
||||
"KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
|
||||
"KILL_LEADER",
|
||||
"KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT",
|
||||
"KILL_QUORUM",
|
||||
"KILL_ALL",
|
||||
"SIGTERM_ONE_FOLLOWER",
|
||||
"SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
|
||||
"SIGTERM_LEADER",
|
||||
"SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT",
|
||||
"SIGTERM_QUORUM",
|
||||
"SIGTERM_ALL",
|
||||
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER",
|
||||
"BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
|
||||
"BLACKHOLE_PEER_PORT_TX_RX_LEADER",
|
||||
@ -181,10 +183,8 @@ func Test_newCluster(t *testing.T) {
|
||||
"NO_FAIL_WITH_STRESS",
|
||||
"NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS",
|
||||
},
|
||||
FailureDelayMs: 7000,
|
||||
FailureShuffle: true,
|
||||
FailpointCommands: []string{`panic("etcd-tester")`},
|
||||
RunnerExecPath: "/etcd-runner",
|
||||
RunnerExecPath: "./bin/etcd-runner",
|
||||
ExternalExecPath: "",
|
||||
StressTypes: []string{"KV", "LEASE"},
|
||||
StressKeySize: 100,
|
||||
@ -203,7 +203,7 @@ func Test_newCluster(t *testing.T) {
|
||||
}
|
||||
defer logger.Sync()
|
||||
|
||||
cfg, err := newCluster(logger, "./local-test.yaml")
|
||||
cfg, err := read(logger, "../../functional.yaml")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
@ -235,6 +235,7 @@ func Test_newCluster(t *testing.T) {
|
||||
sort.Strings(fs1)
|
||||
sort.Strings(fs2)
|
||||
sort.Strings(fs3)
|
||||
|
||||
if !reflect.DeepEqual(fs1, fs2) {
|
||||
t.Fatalf("expected %q, got %q", fs1, fs2)
|
||||
}
|
@ -19,7 +19,7 @@ import (
|
||||
"math/rand"
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
@ -242,17 +242,12 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
|
||||
if err := f.Failure.Inject(clus); err != nil {
|
||||
return err
|
||||
}
|
||||
if len(clus.Members) < 3 {
|
||||
return nil
|
||||
}
|
||||
|
||||
snapshotCount := clus.Members[0].Etcd.SnapshotCount
|
||||
|
||||
now := time.Now()
|
||||
clus.lg.Info(
|
||||
"trigger snapshot START",
|
||||
zap.Int("round", clus.rd),
|
||||
zap.Int("case", clus.cs),
|
||||
zap.String("desc", f.Desc()),
|
||||
zap.Int64("etcd-snapshot-count", snapshotCount),
|
||||
)
|
||||
@ -283,8 +278,6 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
|
||||
if diff > snapshotCount {
|
||||
clus.lg.Info(
|
||||
"trigger snapshot PASS",
|
||||
zap.Int("round", clus.rd),
|
||||
zap.Int("case", clus.cs),
|
||||
zap.Int("retries", i),
|
||||
zap.String("desc", f.Desc()),
|
||||
zap.Int64("committed-entries", diff),
|
@ -31,9 +31,9 @@ func (f *failureDelay) Inject(clus *Cluster) error {
|
||||
}
|
||||
if f.delayDuration > 0 {
|
||||
clus.lg.Info(
|
||||
"sleeping in failureDelay",
|
||||
"wait after inject",
|
||||
zap.Duration("delay", f.delayDuration),
|
||||
zap.String("case", f.Failure.Desc()),
|
||||
zap.String("desc", f.Failure.Desc()),
|
||||
)
|
||||
time.Sleep(f.delayDuration)
|
||||
}
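This hunk is part of the tester's decorator pattern: failureDelay wraps another Failure and sleeps after a successful injection so the fault has time to take effect before recovery begins. A self-contained sketch of that composition, using a simplified hypothetical Failure interface (the real interface takes a *Cluster and carries more methods):

```go
package main

import (
	"fmt"
	"time"
)

// Failure is a simplified stand-in for the tester's Failure interface.
type Failure interface {
	Inject() error
	Recover() error
	Desc() string
}

type noop struct{}

func (noop) Inject() error  { return nil }
func (noop) Recover() error { return nil }
func (noop) Desc() string   { return "no-op" }

// failureDelay decorates another Failure, sleeping after a successful
// injection, analogous to the "wait after inject" log above.
type failureDelay struct {
	Failure
	delayDuration time.Duration
}

func (f failureDelay) Inject() error {
	if err := f.Failure.Inject(); err != nil {
		return err
	}
	if f.delayDuration > 0 {
		fmt.Println("wait after inject:", f.delayDuration, "desc:", f.Desc())
		time.Sleep(f.delayDuration)
	}
	return nil
}

func main() {
	f := failureDelay{Failure: noop{}, delayDuration: 10 * time.Millisecond}
	_ = f.Inject() // injects, then waits before the caller recovers
}
```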
|
@ -18,7 +18,7 @@ import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
)
|
||||
|
||||
type failureExternal struct {
|
||||
@ -46,7 +46,7 @@ func (f *failureExternal) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
func newFailureExternal(scriptPath string) Failure {
|
||||
func new_FailureCase_EXTERNAL(scriptPath string) Failure {
|
||||
return &failureExternal{
|
||||
desc: fmt.Sprintf("external fault injector (script: %q)", scriptPath),
|
||||
failureCase: rpcpb.FailureCase_EXTERNAL,
|
@ -21,7 +21,7 @@ import (
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
)
|
||||
|
||||
type failpointStats struct {
|
||||
@ -145,7 +145,7 @@ func makeRecoverFailpoint(fp string) recoverMemberFunc {
|
||||
fpStats.mu.Lock()
|
||||
fpStats.crashes[fp]++
|
||||
fpStats.mu.Unlock()
|
||||
return recoverKill(clus, idx)
|
||||
return recover_SIGTERM_ETCD(clus, idx)
|
||||
}
|
||||
}
|
||||
|
@ -14,21 +14,21 @@
|
||||
|
||||
package tester
|
||||
|
||||
import "github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
import "github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
func injectBlackholePeerPortTxRx(clus *Cluster, idx int) error {
|
||||
return clus.sendOperation(idx, rpcpb.Operation_BlackholePeerPortTxRx)
|
||||
func inject_BLACKHOLE_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
|
||||
return clus.sendOp(idx, rpcpb.Operation_BLACKHOLE_PEER_PORT_TX_RX)
|
||||
}
|
||||
|
||||
func recoverBlackholePeerPortTxRx(clus *Cluster, idx int) error {
|
||||
return clus.sendOperation(idx, rpcpb.Operation_UnblackholePeerPortTxRx)
|
||||
func recover_BLACKHOLE_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
|
||||
return clus.sendOp(idx, rpcpb.Operation_UNBLACKHOLE_PEER_PORT_TX_RX)
|
||||
}
|
||||
|
||||
func newFailureBlackholePeerPortTxRxOneFollower(clus *Cluster) Failure {
|
||||
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER(clus *Cluster) Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
}
|
||||
f := &failureFollower{ff, -1, -1}
|
||||
return &failureDelay{
|
||||
@ -37,11 +37,11 @@ func newFailureBlackholePeerPortTxRxOneFollower(clus *Cluster) Failure {
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot() Failure {
|
||||
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT() Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
}
|
||||
f := &failureFollower{ff, -1, -1}
|
||||
return &failureUntilSnapshot{
|
||||
@ -50,11 +50,11 @@ func newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot() Failure {
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureBlackholePeerPortTxRxLeader(clus *Cluster) Failure {
|
||||
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER(clus *Cluster) Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
}
|
||||
f := &failureLeader{ff, -1, -1}
|
||||
return &failureDelay{
|
||||
@ -63,11 +63,11 @@ func newFailureBlackholePeerPortTxRxLeader(clus *Cluster) Failure {
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot() Failure {
|
||||
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT() Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
}
|
||||
f := &failureLeader{ff, -1, -1}
|
||||
return &failureUntilSnapshot{
|
||||
@ -76,11 +76,11 @@ func newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot() Failure {
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureBlackholePeerPortTxRxQuorum(clus *Cluster) Failure {
|
||||
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM(clus *Cluster) Failure {
|
||||
f := &failureQuorum{
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
}
|
||||
return &failureDelay{
|
||||
Failure: f,
|
||||
@ -88,11 +88,11 @@ func newFailureBlackholePeerPortTxRxQuorum(clus *Cluster) Failure {
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureBlackholePeerPortTxRxAll(clus *Cluster) Failure {
|
||||
func new_FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL(clus *Cluster) Failure {
|
||||
f := &failureAll{
|
||||
failureCase: rpcpb.FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL,
|
||||
injectMember: injectBlackholePeerPortTxRx,
|
||||
recoverMember: recoverBlackholePeerPortTxRx,
|
||||
injectMember: inject_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_BLACKHOLE_PEER_PORT_TX_RX,
|
||||
}
|
||||
return &failureDelay{
|
||||
Failure: f,
|
@ -17,7 +17,7 @@ package tester
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
@ -30,35 +30,33 @@ const (
|
||||
waitRecover = 5 * time.Second
|
||||
)
|
||||
|
||||
func injectDelayPeerPortTxRx(clus *Cluster, idx int) error {
|
||||
func inject_DELAY_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
|
||||
clus.lg.Info(
|
||||
"injecting delay latency",
|
||||
zap.Duration("latency", time.Duration(clus.Tester.UpdatedDelayLatencyMs)*time.Millisecond),
|
||||
zap.Duration("latency-rv", time.Duration(clus.Tester.DelayLatencyMsRv)*time.Millisecond),
|
||||
zap.String("endpoint", clus.Members[idx].EtcdClientEndpoint),
|
||||
)
|
||||
return clus.sendOperation(idx, rpcpb.Operation_DelayPeerPortTxRx)
|
||||
return clus.sendOp(idx, rpcpb.Operation_DELAY_PEER_PORT_TX_RX)
|
||||
}
|
||||
|
||||
func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error {
|
||||
err := clus.sendOperation(idx, rpcpb.Operation_UndelayPeerPortTxRx)
|
||||
func recover_DELAY_PEER_PORT_TX_RX(clus *Cluster, idx int) error {
|
||||
err := clus.sendOp(idx, rpcpb.Operation_UNDELAY_PEER_PORT_TX_RX)
|
||||
time.Sleep(waitRecover)
|
||||
return err
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure {
|
||||
func new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER(clus *Cluster, random bool) Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
injectMember: inject_DELAY_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
|
||||
}
|
||||
|
||||
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
|
||||
if random {
|
||||
clus.UpdateDelayLatencyMs()
|
||||
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
|
||||
}
|
||||
|
||||
f := &failureFollower{ff, -1, -1}
|
||||
return &failureDelay{
|
||||
Failure: f,
|
||||
@ -66,19 +64,17 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, random bool) Failure {
|
||||
func new_FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster, random bool) Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
injectMember: inject_DELAY_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
|
||||
}
|
||||
|
||||
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
|
||||
if random {
|
||||
clus.UpdateDelayLatencyMs()
|
||||
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
|
||||
}
|
||||
|
||||
f := &failureFollower{ff, -1, -1}
|
||||
return &failureUntilSnapshot{
|
||||
failureCase: ff.failureCase,
|
||||
@ -86,19 +82,17 @@ func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, r
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure {
|
||||
func new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER(clus *Cluster, random bool) Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
injectMember: inject_DELAY_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
|
||||
}
|
||||
|
||||
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
|
||||
if random {
|
||||
clus.UpdateDelayLatencyMs()
|
||||
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
|
||||
}
|
||||
|
||||
f := &failureLeader{ff, -1, -1}
|
||||
return &failureDelay{
|
||||
Failure: f,
|
||||
@ -106,19 +100,17 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure {
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random bool) Failure {
|
||||
func new_FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster, random bool) Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
injectMember: inject_DELAY_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
|
||||
}
|
||||
|
||||
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
|
||||
if random {
|
||||
clus.UpdateDelayLatencyMs()
|
||||
ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
|
||||
}
|
||||
|
||||
f := &failureLeader{ff, -1, -1}
|
||||
return &failureUntilSnapshot{
|
||||
failureCase: ff.failureCase,
|
||||
@ -126,38 +118,34 @@ func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxQuorum(clus *Cluster, random bool) Failure {
|
||||
func new_FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM(clus *Cluster, random bool) Failure {
|
||||
f := &failureQuorum{
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
injectMember: inject_DELAY_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
|
||||
}
|
||||
|
||||
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
|
||||
if random {
|
||||
clus.UpdateDelayLatencyMs()
|
||||
f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
|
||||
}
|
||||
|
||||
return &failureDelay{
|
||||
Failure: f,
|
||||
delayDuration: clus.GetFailureDelayDuration(),
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureDelayPeerPortTxRxAll(clus *Cluster, random bool) Failure {
|
||||
func new_FailureCase_DELAY_PEER_PORT_TX_RX_ALL(clus *Cluster, random bool) Failure {
|
||||
f := &failureAll{
|
||||
failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL,
|
||||
injectMember: injectDelayPeerPortTxRx,
|
||||
recoverMember: recoverDelayPeerPortTxRx,
|
||||
injectMember: inject_DELAY_PEER_PORT_TX_RX,
|
||||
recoverMember: recover_DELAY_PEER_PORT_TX_RX,
|
||||
}
|
||||
|
||||
clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
|
||||
if random {
|
||||
clus.UpdateDelayLatencyMs()
|
||||
f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL
|
||||
}
|
||||
|
||||
return &failureDelay{
|
||||
Failure: f,
|
||||
delayDuration: clus.GetFailureDelayDuration(),
|
@ -17,7 +17,7 @@ package tester
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
@ -43,7 +43,7 @@ func (f *failureNoFailWithStress) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
func newFailureNoFailWithStress(clus *Cluster) Failure {
|
||||
func new_FailureCase_NO_FAIL_WITH_STRESS(clus *Cluster) Failure {
|
||||
f := &failureNoFailWithStress{
|
||||
failureCase: rpcpb.FailureCase_NO_FAIL_WITH_STRESS,
|
||||
}
|
||||
@ -88,7 +88,7 @@ func (f *failureNoFailWithNoStressForLiveness) FailureCase() rpcpb.FailureCase {
|
||||
return f.failureCase
|
||||
}
|
||||
|
||||
func newFailureNoFailWithNoStressForLiveness(clus *Cluster) Failure {
|
||||
func new_FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS(clus *Cluster) Failure {
|
||||
f := &failureNoFailWithNoStressForLiveness{
|
||||
failureCase: rpcpb.FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS,
|
||||
}
|
89
functional/tester/failure_case_sigterm.go
Normal file
@ -0,0 +1,89 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tester

import "github.com/coreos/etcd/functional/rpcpb"

func inject_SIGTERM_ETCD(clus *Cluster, idx int) error {
	return clus.sendOp(idx, rpcpb.Operation_SIGTERM_ETCD)
}

func recover_SIGTERM_ETCD(clus *Cluster, idx int) error {
	return clus.sendOp(idx, rpcpb.Operation_RESTART_ETCD)
}

func new_FailureCase_SIGTERM_ONE_FOLLOWER(clus *Cluster) Failure {
	ff := failureByFunc{
		failureCase:   rpcpb.FailureCase_SIGTERM_ONE_FOLLOWER,
		injectMember:  inject_SIGTERM_ETCD,
		recoverMember: recover_SIGTERM_ETCD,
	}
	f := &failureFollower{ff, -1, -1}
	return &failureDelay{
		Failure:       f,
		delayDuration: clus.GetFailureDelayDuration(),
	}
}

func new_FailureCase_SIGTERM_LEADER(clus *Cluster) Failure {
	ff := failureByFunc{
		failureCase:   rpcpb.FailureCase_SIGTERM_LEADER,
		injectMember:  inject_SIGTERM_ETCD,
		recoverMember: recover_SIGTERM_ETCD,
	}
	f := &failureLeader{ff, -1, -1}
	return &failureDelay{
		Failure:       f,
		delayDuration: clus.GetFailureDelayDuration(),
	}
}

func new_FailureCase_SIGTERM_QUORUM(clus *Cluster) Failure {
	f := &failureQuorum{
		failureCase:   rpcpb.FailureCase_SIGTERM_QUORUM,
		injectMember:  inject_SIGTERM_ETCD,
		recoverMember: recover_SIGTERM_ETCD,
	}
	return &failureDelay{
		Failure:       f,
		delayDuration: clus.GetFailureDelayDuration(),
	}
}

func new_FailureCase_SIGTERM_ALL(clus *Cluster) Failure {
	f := &failureAll{
		failureCase:   rpcpb.FailureCase_SIGTERM_ALL,
		injectMember:  inject_SIGTERM_ETCD,
		recoverMember: recover_SIGTERM_ETCD,
	}
	return &failureDelay{
		Failure:       f,
		delayDuration: clus.GetFailureDelayDuration(),
	}
}

func new_FailureCase_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Failure {
	return &failureUntilSnapshot{
		failureCase: rpcpb.FailureCase_SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
		Failure:     new_FailureCase_SIGTERM_ONE_FOLLOWER(clus),
	}
}

func new_FailureCase_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Failure {
	return &failureUntilSnapshot{
		failureCase: rpcpb.FailureCase_SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT,
		Failure:     new_FailureCase_SIGTERM_LEADER(clus),
	}
}
|
@ -14,9 +14,16 @@

package tester

import "github.com/prometheus/client_golang/prometheus"
import (
	"fmt"
	"sort"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	caseTotal = make(map[string]int)

	caseTotalCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "etcd",
@ -60,3 +67,17 @@ func init() {
	prometheus.MustRegister(roundTotalCounter)
	prometheus.MustRegister(roundFailedTotalCounter)
}

func printReport() {
	rows := make([]string, 0, len(caseTotal))
	for k, v := range caseTotal {
		rows = append(rows, fmt.Sprintf("%s: %d", k, v))
	}
	sort.Strings(rows)

	println()
	for _, row := range rows {
		fmt.Println(row)
	}
	println()
}
|
@ -18,7 +18,7 @@ import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
@ -52,6 +52,7 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
|
||||
// TODO: Too intensive stressing clients can panic etcd member with
|
||||
// 'out of memory' error. Put rate limits in server side.
|
||||
stressers[i] = &keyStresser{
|
||||
stype: rpcpb.StressType_KV,
|
||||
lg: clus.lg,
|
||||
m: m,
|
||||
keySize: int(clus.Tester.StressKeySize),
|
||||
@ -65,6 +66,7 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
|
||||
|
||||
case "LEASE":
|
||||
stressers[i] = &leaseStresser{
|
||||
stype: rpcpb.StressType_LEASE,
|
||||
lg: clus.lg,
|
||||
m: m,
|
||||
numLeases: 10, // TODO: configurable
|
||||
@ -84,6 +86,8 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
|
||||
"--req-rate", fmt.Sprintf("%v", reqRate),
|
||||
}
|
||||
stressers[i] = newRunnerStresser(
|
||||
rpcpb.StressType_ELECTION_RUNNER,
|
||||
clus.lg,
|
||||
clus.Tester.RunnerExecPath,
|
||||
args,
|
||||
clus.rateLimiter,
|
||||
@ -102,7 +106,14 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
|
||||
"--rounds=0", // runs forever
|
||||
"--req-rate", fmt.Sprintf("%v", reqRate),
|
||||
}
|
||||
stressers[i] = newRunnerStresser(clus.Tester.RunnerExecPath, args, clus.rateLimiter, reqRate)
|
||||
stressers[i] = newRunnerStresser(
|
||||
rpcpb.StressType_WATCH_RUNNER,
|
||||
clus.lg,
|
||||
clus.Tester.RunnerExecPath,
|
||||
args,
|
||||
clus.rateLimiter,
|
||||
reqRate,
|
||||
)
|
||||
|
||||
case "LOCK_RACER_RUNNER":
|
||||
reqRate := 100
|
||||
@ -114,7 +125,14 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
|
||||
"--rounds=0", // runs forever
|
||||
"--req-rate", fmt.Sprintf("%v", reqRate),
|
||||
}
|
||||
stressers[i] = newRunnerStresser(clus.Tester.RunnerExecPath, args, clus.rateLimiter, reqRate)
|
||||
stressers[i] = newRunnerStresser(
|
||||
rpcpb.StressType_LOCK_RACER_RUNNER,
|
||||
clus.lg,
|
||||
clus.Tester.RunnerExecPath,
|
||||
args,
|
||||
clus.rateLimiter,
|
||||
reqRate,
|
||||
)
|
||||
|
||||
case "LEASE_RUNNER":
|
||||
args := []string{
|
||||
@ -122,7 +140,14 @@ func newStresser(clus *Cluster, m *rpcpb.Member) Stresser {
|
||||
"--ttl=30",
|
||||
"--endpoints", m.EtcdClientEndpoint,
|
||||
}
|
||||
stressers[i] = newRunnerStresser(clus.Tester.RunnerExecPath, args, clus.rateLimiter, 0)
|
||||
stressers[i] = newRunnerStresser(
|
||||
rpcpb.StressType_LEASE_RUNNER,
|
||||
clus.lg,
|
||||
clus.Tester.RunnerExecPath,
|
||||
args,
|
||||
clus.rateLimiter,
|
||||
0,
|
||||
)
|
||||
}
|
||||
}
|
||||
return &compositeStresser{stressers}
|
@ -18,6 +18,7 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"reflect"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
@ -25,7 +26,7 @@ import (
|
||||
"github.com/coreos/etcd/clientv3"
|
||||
"github.com/coreos/etcd/etcdserver"
|
||||
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
"golang.org/x/time/rate"
|
||||
@ -34,7 +35,8 @@ import (
|
||||
)
|
||||
|
||||
type keyStresser struct {
|
||||
lg *zap.Logger
|
||||
stype rpcpb.StressType
|
||||
lg *zap.Logger
|
||||
|
||||
m *rpcpb.Member
|
||||
|
||||
@ -102,7 +104,8 @@ func (s *keyStresser) Stress() error {
|
||||
}
|
||||
|
||||
s.lg.Info(
|
||||
"key stresser START",
|
||||
"stress START",
|
||||
zap.String("stress-type", s.stype.String()),
|
||||
zap.String("endpoint", s.m.EtcdClientEndpoint),
|
||||
)
|
||||
return nil
|
||||
@ -156,8 +159,10 @@ func (s *keyStresser) run() {
|
||||
return
|
||||
default:
|
||||
s.lg.Warn(
|
||||
"key stresser exited with error",
|
||||
"stress run exiting",
|
||||
zap.String("stress-type", s.stype.String()),
|
||||
zap.String("endpoint", s.m.EtcdClientEndpoint),
|
||||
zap.String("error-type", reflect.TypeOf(err).String()),
|
||||
zap.Error(err),
|
||||
)
|
||||
return
|
||||
@ -188,7 +193,8 @@ func (s *keyStresser) Close() map[string]int {
|
||||
s.emu.Unlock()
|
||||
|
||||
s.lg.Info(
|
||||
"key stresser STOP",
|
||||
"stress STOP",
|
||||
zap.String("stress-type", s.stype.String()),
|
||||
zap.String("endpoint", s.m.EtcdClientEndpoint),
|
||||
)
|
||||
return ess
|
@ -24,7 +24,7 @@ import (
|
||||
|
||||
"github.com/coreos/etcd/clientv3"
|
||||
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
|
||||
"github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
"golang.org/x/time/rate"
|
||||
@ -38,7 +38,8 @@ const (
|
||||
)
|
||||
|
||||
type leaseStresser struct {
|
||||
lg *zap.Logger
|
||||
stype rpcpb.StressType
|
||||
lg *zap.Logger
|
||||
|
||||
m *rpcpb.Member
|
||||
cli *clientv3.Client
|
||||
@ -121,7 +122,8 @@ func (ls *leaseStresser) setupOnce() error {
|
||||
|
||||
func (ls *leaseStresser) Stress() error {
|
||||
ls.lg.Info(
|
||||
"lease stresser START",
|
||||
"stress START",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
)
|
||||
|
||||
@ -159,22 +161,26 @@ func (ls *leaseStresser) run() {
|
||||
}
|
||||
|
||||
ls.lg.Debug(
|
||||
"lease stresser is creating leases",
|
||||
"stress creating leases",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
)
|
||||
ls.createLeases()
|
||||
ls.lg.Debug(
|
||||
"lease stresser created leases",
|
||||
"stress created leases",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
)
|
||||
|
||||
ls.lg.Debug(
|
||||
"lease stresser is dropped leases",
|
||||
"stress dropped leases",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
)
|
||||
ls.randomlyDropLeases()
|
||||
ls.lg.Debug(
|
||||
"lease stresser dropped leases",
|
||||
"stress dropped leases",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
)
|
||||
}
|
||||
@ -243,6 +249,7 @@ func (ls *leaseStresser) createLeaseWithKeys(ttl int64) (int64, error) {
|
||||
if err != nil {
|
||||
ls.lg.Debug(
|
||||
"createLease failed",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.Error(err),
|
||||
)
|
||||
@ -251,6 +258,7 @@ func (ls *leaseStresser) createLeaseWithKeys(ttl int64) (int64, error) {
|
||||
|
||||
ls.lg.Debug(
|
||||
"createLease created lease",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
)
|
||||
@ -284,6 +292,7 @@ func (ls *leaseStresser) randomlyDropLeases() {
|
||||
}
|
||||
ls.lg.Debug(
|
||||
"randomlyDropLease dropped a lease",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
)
|
||||
@ -313,6 +322,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
case <-ls.ctx.Done():
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive context canceled",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
zap.Error(ls.ctx.Err()),
|
||||
@ -327,6 +337,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
ls.aliveLeases.remove(leaseID)
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive lease has not been renewed, dropped it",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
)
|
||||
@ -337,6 +348,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
if err != nil {
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive lease creates stream error",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
zap.Error(err),
|
||||
@ -350,6 +362,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
if err != nil {
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive failed to receive lease keepalive response",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
zap.Error(err),
|
||||
@ -359,6 +372,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive waiting on lease stream",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
)
|
||||
@ -367,6 +381,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
if respRC == nil {
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive received nil lease keepalive response",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
)
|
||||
@ -378,6 +393,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
if respRC.TTL <= 0 {
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive stream received lease keepalive response TTL <= 0",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
zap.Int64("ttl", respRC.TTL),
|
||||
@ -388,6 +404,7 @@ func (ls *leaseStresser) keepLeaseAlive(leaseID int64) {
|
||||
// renew lease timestamp only if lease is present
|
||||
ls.lg.Debug(
|
||||
"keepLeaseAlive renewed a lease",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
)
|
||||
@ -440,6 +457,7 @@ func (ls *leaseStresser) randomlyDropLease(leaseID int64) (bool, error) {
|
||||
|
||||
ls.lg.Debug(
|
||||
"randomlyDropLease error",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
zap.String("lease-id", fmt.Sprintf("%016x", leaseID)),
|
||||
zap.Error(ls.ctx.Err()),
|
||||
@ -457,7 +475,8 @@ func (ls *leaseStresser) Close() map[string]int {
|
||||
ls.aliveWg.Wait()
|
||||
ls.cli.Close()
|
||||
ls.lg.Info(
|
||||
"lease stresser STOP",
|
||||
"stress STOP",
|
||||
zap.String("stress-type", ls.stype.String()),
|
||||
zap.String("endpoint", ls.m.EtcdClientEndpoint),
|
||||
)
|
||||
return nil
|
@ -20,10 +20,16 @@ import (
|
||||
"os/exec"
|
||||
"syscall"
|
||||
|
||||
"github.com/coreos/etcd/functional/rpcpb"
|
||||
|
||||
"go.uber.org/zap"
|
||||
"golang.org/x/time/rate"
|
||||
)
|
||||
|
||||
type runnerStresser struct {
|
||||
stype rpcpb.StressType
|
||||
lg *zap.Logger
|
||||
|
||||
cmd *exec.Cmd
|
||||
cmdStr string
|
||||
args []string
|
||||
@ -34,9 +40,17 @@ type runnerStresser struct {
|
||||
donec chan struct{}
|
||||
}
|
||||
|
||||
func newRunnerStresser(cmdStr string, args []string, rl *rate.Limiter, reqRate int) *runnerStresser {
|
||||
func newRunnerStresser(
|
||||
stype rpcpb.StressType,
|
||||
lg *zap.Logger,
|
||||
cmdStr string,
|
||||
args []string,
|
||||
rl *rate.Limiter,
|
||||
reqRate int,
|
||||
) *runnerStresser {
|
||||
rl.SetLimit(rl.Limit() - rate.Limit(reqRate))
|
||||
return &runnerStresser{
|
||||
stype: stype,
|
||||
cmdStr: cmdStr,
|
||||
args: args,
|
||||
rl: rl,
|
||||
@ -71,6 +85,10 @@ func (rs *runnerStresser) setupOnce() (err error) {
|
||||
}
|
||||
|
||||
func (rs *runnerStresser) Stress() (err error) {
|
||||
rs.lg.Info(
|
||||
"stress START",
|
||||
zap.String("stress-type", rs.stype.String()),
|
||||
)
|
||||
if err = rs.setupOnce(); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -78,6 +96,10 @@ func (rs *runnerStresser) Stress() (err error) {
|
||||
}
|
||||
|
||||
func (rs *runnerStresser) Pause() map[string]int {
|
||||
rs.lg.Info(
|
||||
"stress STOP",
|
||||
zap.String("stress-type", rs.stype.String()),
|
||||
)
|
||||
syscall.Kill(rs.cmd.Process.Pid, syscall.SIGSTOP)
|
||||
return nil
|
||||
}
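newRunnerStresser also adjusts the shared limiter: subtracting the runner's expected request rate from the cluster-wide rate.Limiter keeps out-of-process runner binaries and in-process stressers inside one global request budget. A small sketch of that bookkeeping, assuming a hypothetical 3000 req/s budget (only `golang.org/x/time/rate` calls that exist — `NewLimiter`, `Limit`, `SetLimit` — are used):

```go
package main

import (
	"fmt"

	"golang.org/x/time/rate"
)

func main() {
	// Hypothetical cluster-wide budget: 3000 requests/sec shared by all stressers.
	shared := rate.NewLimiter(rate.Limit(3000), 1)

	// A runner stresser expected to issue ~100 req/s out of process reserves
	// its share by shrinking the in-process limit, as newRunnerStresser does.
	reqRate := 100
	shared.SetLimit(shared.Limit() - rate.Limit(reqRate))

	fmt.Println("in-process limit now:", shared.Limit()) // prints 2900
}
```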
|
16
pkg/fileutil/doc.go
Normal file
@ -0,0 +1,16 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fileutil implements utility functions related to files and paths.
package fileutil
|
@ -12,7 +12,6 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Package fileutil implements utility functions related to files and paths.
|
||||
package fileutil
|
||||
|
||||
import (
|
||||
@ -93,6 +92,7 @@ func CreateDirAll(dir string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// Exist returns true if a file or directory exists.
|
||||
func Exist(name string) bool {
|
||||
_, err := os.Stat(name)
|
||||
return err == nil
|
||||
|
@ -15,8 +15,10 @@
|
||||
package fileutil
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"math/rand"
|
||||
"os"
|
||||
"os/user"
|
||||
"path/filepath"
|
||||
@ -24,6 +26,7 @@ import (
|
||||
"runtime"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestIsDirWriteable(t *testing.T) {
|
||||
@ -104,6 +107,16 @@ func TestCreateDirAll(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExist(t *testing.T) {
|
||||
fdir := filepath.Join(os.TempDir(), fmt.Sprint(time.Now().UnixNano()+rand.Int63n(1000)))
|
||||
os.RemoveAll(fdir)
|
||||
if err := os.Mkdir(fdir, 0666); err != nil {
|
||||
t.Skip(err)
|
||||
}
|
||||
defer os.RemoveAll(fdir)
|
||||
if !Exist(fdir) {
|
||||
t.Fatalf("expected Exist true, got %v", Exist(fdir))
|
||||
}
|
||||
|
||||
f, err := ioutil.TempFile(os.TempDir(), "fileutil")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
|
16
pkg/stringutil/doc.go
Normal file
@ -0,0 +1,16 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package stringutil exports string utility functions.
package stringutil
|
@ -1,4 +1,4 @@
|
||||
// Copyright 2016 The etcd Authors
|
||||
// Copyright 2018 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
@ -12,41 +12,40 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Package stringutil exports string utility functions.
|
||||
package stringutil
|
||||
|
||||
import "math/rand"
|
||||
|
||||
const (
|
||||
chars = "abcdefghijklmnopqrstuvwxyz0123456789"
|
||||
import (
|
||||
"math/rand"
|
||||
"time"
|
||||
)
|
||||
|
||||
// UniqueStrings returns a slice of randomly generated unique strings.
|
||||
func UniqueStrings(maxlen uint, n int) []string {
|
||||
exist := make(map[string]bool)
|
||||
ss := make([]string, 0)
|
||||
|
||||
func UniqueStrings(slen uint, n int) (ss []string) {
|
||||
exist := make(map[string]struct{})
|
||||
ss = make([]string, 0, n)
|
||||
for len(ss) < n {
|
||||
s := randomString(maxlen)
|
||||
if !exist[s] {
|
||||
exist[s] = true
|
||||
s := randString(slen)
|
||||
if _, ok := exist[s]; !ok {
|
||||
ss = append(ss, s)
|
||||
exist[s] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
return ss
|
||||
}
|
||||
|
||||
// RandomStrings returns a slice of randomly generated strings.
|
||||
func RandomStrings(maxlen uint, n int) []string {
|
||||
ss := make([]string, 0)
|
||||
func RandomStrings(slen uint, n int) (ss []string) {
|
||||
ss = make([]string, 0, n)
|
||||
for i := 0; i < n; i++ {
|
||||
ss = append(ss, randomString(maxlen))
|
||||
ss = append(ss, randString(slen))
|
||||
}
|
||||
return ss
|
||||
}
|
||||
|
||||
func randomString(l uint) string {
|
||||
const chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
|
||||
|
||||
func randString(l uint) string {
|
||||
rand.Seed(time.Now().UnixNano())
|
||||
s := make([]byte, l)
|
||||
for i := 0; i < int(l); i++ {
|
||||
s[i] = chars[rand.Intn(len(chars))]
|
30
pkg/stringutil/rand_test.go
Normal file
@ -0,0 +1,30 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stringutil

import (
	"fmt"
	"testing"
)

func TestUniqueStrings(t *testing.T) {
	ss := UniqueStrings(10, 50)
	for i := 1; i < len(ss); i++ {
		if ss[i-1] == ss[i] {
			t.Fatalf("ss[i-1] %q == ss[i] %q", ss[i-1], ss[i])
		}
	}
	fmt.Println(ss)
}
|
4
test
@ -37,7 +37,7 @@ source ./build
|
||||
|
||||
# build before setting up test GOPATH
|
||||
if [[ "${PASSES}" == *"functional"* ]]; then
|
||||
./tools/functional-tester/build
|
||||
./functional/build
|
||||
fi
|
||||
|
||||
if [ -z "$PASSES" ]; then
|
||||
@ -196,7 +196,7 @@ function functional_pass {
|
||||
done
|
||||
|
||||
echo "Starting 'etcd-tester'"
|
||||
./bin/etcd-tester --config ./tools/functional-tester/tester/local-test.yaml && echo "'etcd-tester' succeeded"
|
||||
./bin/etcd-tester --config ./functional.yaml && echo "'etcd-tester' succeeded"
|
||||
ETCD_TESTER_EXIT_CODE=$?
|
||||
echo "ETCD_TESTER_EXIT_CODE:" ${ETCD_TESTER_EXIT_CODE}
|
||||
|
||||
|
@ -1,14 +0,0 @@
s1: bin/etcd --name s1 --data-dir /tmp/etcd-test-proxy-data.s1 --listen-client-urls http://127.0.0.1:1379 --advertise-client-urls http://127.0.0.1:13790 --listen-peer-urls http://127.0.0.1:1380 --initial-advertise-peer-urls http://127.0.0.1:13800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new

s1-client-proxy: bin/etcd-test-proxy --from localhost:13790 --to localhost:1379 --http-port 1378
s1-peer-proxy: bin/etcd-test-proxy --from localhost:13800 --to localhost:1380 --http-port 1381

s2: bin/etcd --name s2 --data-dir /tmp/etcd-test-proxy-data.s2 --listen-client-urls http://127.0.0.1:2379 --advertise-client-urls http://127.0.0.1:23790 --listen-peer-urls http://127.0.0.1:2380 --initial-advertise-peer-urls http://127.0.0.1:23800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new

s2-client-proxy: bin/etcd-test-proxy --from localhost:23790 --to localhost:2379 --http-port 2378
s2-peer-proxy: bin/etcd-test-proxy --from localhost:23800 --to localhost:2380 --http-port 2381

s3: bin/etcd --name s3 --data-dir /tmp/etcd-test-proxy-data.s3 --listen-client-urls http://127.0.0.1:3379 --advertise-client-urls http://127.0.0.1:33790 --listen-peer-urls http://127.0.0.1:3380 --initial-advertise-peer-urls http://127.0.0.1:33800 --initial-cluster-token tkn --initial-cluster 's1=http://127.0.0.1:13800,s2=http://127.0.0.1:23800,s3=http://127.0.0.1:33800' --initial-cluster-state new

s3-client-proxy: bin/etcd-test-proxy --from localhost:33790 --to localhost:3379 --http-port 3378
s3-client-proxy: bin/etcd-test-proxy --from localhost:33800 --to localhost:3380 --http-port 3381
|
@ -1,33 +0,0 @@
# etcd functional test suite

The etcd functional test suite tests the functionality of an etcd cluster with a focus on failure resistance under high pressure. It sets up an etcd cluster and injects failures into the cluster by killing a process or isolating the network of a process. It expects the etcd cluster to recover within a short amount of time after the fault is fixed.

The etcd functional test suite has two components: etcd-agent and etcd-tester. etcd-agent runs on every test machine, and etcd-tester is the single controller of the test. The tester controls agents: starting the etcd process, stopping it, terminating it, injecting failures, and so on.

### Run locally

```bash
PASSES=functional ./test
```

### Run with Docker

To run locally, first build the tester image:

```bash
pushd ../..
make build-docker-functional-tester
popd
```

Then run the [example scripts](./scripts).

```bash
# run 3 agents for 3-node local etcd cluster
./scripts/docker-local-agent.sh 1
./scripts/docker-local-agent.sh 2
./scripts/docker-local-agent.sh 3

# to run only 1 tester round
./scripts/docker-local-tester.sh
```
|
@ -1,10 +0,0 @@
#!/usr/bin/env bash

if ! [[ "$0" =~ "tools/functional-tester/build" ]]; then
  echo "must be run from repository root"
  exit 255
fi

CGO_ENABLED=0 go build -a -installsuffix cgo -ldflags "-s" -o bin/etcd-agent ./tools/functional-tester/cmd/etcd-agent
CGO_ENABLED=0 go build -a -installsuffix cgo -ldflags "-s" -o bin/etcd-tester ./tools/functional-tester/cmd/etcd-tester
CGO_ENABLED=0 go build -a -installsuffix cgo -ldflags "-s" -o bin/etcd-runner ./tools/functional-tester/cmd/etcd-runner
|
@ -1,249 +0,0 @@
|
||||
syntax = "proto3";
|
||||
package rpcpb;
|
||||
|
||||
import "github.com/gogo/protobuf/gogoproto/gogo.proto";
|
||||
|
||||
option (gogoproto.marshaler_all) = true;
|
||||
option (gogoproto.sizer_all) = true;
|
||||
option (gogoproto.unmarshaler_all) = true;
|
||||
option (gogoproto.goproto_getters_all) = false;
|
||||
|
||||
service Transport {
|
||||
rpc Transport(stream Request) returns (stream Response) {}
|
||||
}
|
||||
|
||||
enum Operation {
|
||||
NotStarted = 0;
|
||||
|
||||
// InitialStartEtcd is only called to start etcd very first time.
|
||||
InitialStartEtcd = 1;
|
||||
// RestartEtcd is sent to restart killed etcd.
|
||||
RestartEtcd = 2;
|
||||
// KillEtcd pauses etcd process while keeping data directories
|
||||
// and previous etcd configurations.
|
||||
KillEtcd = 3;
|
||||
// FailArchive is sent when consistency check failed,
|
||||
// thus need to archive etcd data directories.
|
||||
FailArchive = 4;
|
||||
// DestroyEtcdAgent destroys etcd process, etcd data, and agent server.
|
||||
DestroyEtcdAgent = 5;
|
||||
|
||||
BlackholePeerPortTxRx = 100;
|
||||
UnblackholePeerPortTxRx = 101;
|
||||
DelayPeerPortTxRx = 102;
|
||||
UndelayPeerPortTxRx = 103;
|
||||
}
|
||||
|
||||
message Etcd {
|
||||
string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
|
||||
string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
|
||||
string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];
|
||||
|
||||
// HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
|
||||
// Default value is 100, which is 100ms.
|
||||
int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
|
||||
// ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
|
||||
// Default value is 1000, which is 1s.
|
||||
int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];
|
||||
|
||||
repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
|
||||
repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
|
||||
bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
|
||||
bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
|
||||
string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
|
||||
string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
|
||||
string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];
|
||||
|
||||
repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
|
||||
repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
|
||||
bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
|
||||
bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
|
||||
string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
|
||||
string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
|
||||
string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];
|
||||
|
||||
string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
|
||||
string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
|
||||
string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];
|
||||
|
||||
int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
|
||||
int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];
|
||||
|
||||
bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
|
||||
bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
|
||||
}
|
||||
|
||||
message Member {
|
||||
// EtcdExecPath is the executable etcd binary path in agent server.
|
||||
string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];
|
||||
|
||||
// TODO: support embedded etcd
|
||||
|
||||
// AgentAddr is the agent HTTP server address.
|
||||
string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
|
||||
// FailpointHTTPAddr is the agent's failpoints HTTP server address.
|
||||
string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];
|
||||
|
||||
// BaseDir is the base directory where all logs and etcd data are stored.
|
||||
string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
|
||||
// EtcdLogPath is the log file to store current etcd server logs.
|
||||
string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];
|
||||
|
||||
// EtcdClientProxy is true when client traffic needs to be proxied.
|
||||
// If true, listen client URL port must be different than advertise client URL port.
|
||||
bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
|
||||
// EtcdPeerProxy is true when peer traffic needs to be proxied.
|
||||
// If true, listen peer URL port must be different than advertise peer URL port.
|
||||
bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];
|
||||
|
||||
// EtcdClientEndpoint is the etcd client endpoint.
|
||||
string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
|
||||
// Etcd defines etcd binary configuration flags.
|
||||
Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];
|
||||
|
||||
// ClientCertData contains cert file contents from this member's etcd server.
|
||||
string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
|
||||
string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
|
||||
// ClientKeyData contains key file contents from this member's etcd server.
|
||||
string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
|
||||
string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
|
||||
// ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
|
||||
string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
|
||||
string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];
|
||||
|
||||
// PeerCertData contains cert file contents from this member's etcd server.
|
||||
string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
|
||||
string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
|
||||
// PeerKeyData contains key file contents from this member's etcd server.
|
||||
string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
|
||||
string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
|
||||
// PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
|
||||
string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
|
||||
string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
|
||||
}
|
||||
|
||||
enum FailureCase {
|
||||
KILL_ONE_FOLLOWER = 0;
|
||||
KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;
|
||||
KILL_LEADER = 2;
|
||||
KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;
|
||||
KILL_QUORUM = 4;
|
||||
KILL_ALL = 5;
|
||||
|
||||
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;
|
||||
BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;
|
||||
BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;
|
||||
BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;
|
||||
BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;
|
||||
BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;
|
||||
|
||||
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;
|
||||
DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;
|
||||
DELAY_PEER_PORT_TX_RX_LEADER = 204;
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;
|
||||
DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;
|
||||
DELAY_PEER_PORT_TX_RX_QUORUM = 208;
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;
|
||||
DELAY_PEER_PORT_TX_RX_ALL = 210;
|
||||
RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;
|
||||
|
||||
// NO_FAIL_WITH_STRESS runs no-op failure injection for specified period
|
||||
// while stressers are still sending requests.
|
||||
NO_FAIL_WITH_STRESS = 300;
|
||||
// NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS runs no-op failure injection
|
||||
// with all stressers stopped.
|
||||
NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;
|
||||
|
||||
FAILPOINTS = 400;
|
||||
EXTERNAL = 500;
|
||||
}

enum StressType {
  KV = 0;
  LEASE = 1;
  ELECTION_RUNNER = 2;
  WATCH_RUNNER = 3;
  LOCK_RACER_RUNNER = 4;
  LEASE_RUNNER = 5;
}

message Tester {
  string TesterDataDir = 1 [(gogoproto.moretags) = "yaml:\"tester-data-dir\""];
  string TesterNetwork = 2 [(gogoproto.moretags) = "yaml:\"tester-network\""];
  string TesterAddr = 3 [(gogoproto.moretags) = "yaml:\"tester-addr\""];

  // DelayLatencyMs is the delay latency in milliseconds
  // to inject into the simulated slow network.
  uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
  // DelayLatencyMsRv is the delay latency random variable in milliseconds.
  uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
  // UpdatedDelayLatencyMs is the updated delay latency in milliseconds
  // to inject into the simulated slow network. It is the final latency to apply,
  // in case the latency numbers are randomly generated from the given delay latency fields.
  uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

  // RoundLimit is the limit of rounds to run the failure set (-1 to run without limit).
  int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
  // ExitOnFailure is true to exit the tester on the first failure.
  bool ExitOnFailure = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
  // ConsistencyCheck is true to check consistency (revision, hash).
  bool ConsistencyCheck = 23 [(gogoproto.moretags) = "yaml:\"consistency-check\""];
  // EnablePprof is true to enable the profiler.
  bool EnablePprof = 24 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

  // FailureCases is the list of selected test cases to schedule.
  // If empty, run all failure cases.
  repeated string FailureCases = 31 [(gogoproto.moretags) = "yaml:\"failure-cases\""];
  // FailureDelayMs is the delay duration after a failure is injected.
  // Useful when triggering snapshot or no-op failure cases.
  uint32 FailureDelayMs = 32 [(gogoproto.moretags) = "yaml:\"failure-delay-ms\""];
  // FailureShuffle is true to randomize the failure injection order.
  bool FailureShuffle = 33 [(gogoproto.moretags) = "yaml:\"failure-shuffle\""];
  // FailpointCommands is the list of "gofail" commands
  // (e.g. panic("etcd-tester"),1*sleep(1000)).
  repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

  // RunnerExecPath is the path of the etcd-runner binary.
  string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
  // ExternalExecPath is the path of a script for enabling/disabling an external fault injector.
  string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

  // StressTypes is the list of stresser names:
  // keys, lease, nop, election-runner, watch-runner, lock-racer-runner, lease-runner.
  repeated string StressTypes = 101 [(gogoproto.moretags) = "yaml:\"stress-types\""];
  // StressKeySize is the size of each small key written into etcd.
  int32 StressKeySize = 102 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
  // StressKeySizeLarge is the size of each large key written into etcd.
  int32 StressKeySizeLarge = 103 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
  // StressKeySuffixRange is the count of the key range written into etcd.
  // Stress keys are created with fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange)).
  int32 StressKeySuffixRange = 104 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
  // StressKeySuffixRangeTxn is the count of the key range written into etcd txns (max 100).
  // Stress keys are created with fmt.Sprintf("/k%03d", i).
  int32 StressKeySuffixRangeTxn = 105 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
  // StressKeyTxnOps is the number of operations per transaction (max 64).
  int32 StressKeyTxnOps = 106 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

  // StressClients is the number of concurrent stressing clients
  // with "one" shared TCP connection.
  int32 StressClients = 201 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
  // StressQPS is the maximum number of stresser requests per second.
  int32 StressQPS = 202 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
}
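Taken together, the Tester message doubles as the schema for the tester section of the configuration file. A trimmed, illustrative excerpt could look like the following; every value is a placeholder chosen for this sketch, not the shipped default:

# illustrative tester excerpt; keys follow the yaml tags above
tester-data-dir: /tmp/etcd-functional
tester-network: tcp
tester-addr: 127.0.0.1:9028
delay-latency-ms: 5000
delay-latency-ms-rv: 500
round-limit: 1
exit-on-failure: true
consistency-check: true
enable-pprof: false
failure-cases:
  - KILL_ONE_FOLLOWER
  - KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT
  - BLACKHOLE_PEER_PORT_TX_RX_LEADER
failure-delay-ms: 7000
failure-shuffle: true
stress-types:
  - keys
  - lease
stress-key-size: 100
stress-key-suffix-range: 250000
stress-clients: 100
stress-qps: 1000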

message Request {
  Operation Operation = 1;
  // Member contains the same Member object from the tester configuration.
  Member Member = 2;
  // Tester contains the tester configuration.
  Tester Tester = 3;
}

message Response {
  bool Success = 1;
  string Status = 2;
  // Member contains the same Member object from the tester request.
  Member Member = 3;
}
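For a sense of how these messages travel between tester and agent, the sketch below constructs a Request the way a caller might before handing it to the transport. The Operation value is taken from the pre-rename tester code later in this diff; the endpoint, import path, and pointer-typed fields are assumptions for illustration:

package example

import (
    "fmt"

    "github.com/coreos/etcd/functional/rpcpb" // assumed import path
)

func exampleRequest() {
    // Hypothetical request: Operation_KillEtcd appears in the deleted
    // tester code below; the client endpoint is a placeholder, and the
    // generated Member field is assumed to be a pointer.
    req := &rpcpb.Request{
        Operation: rpcpb.Operation_KillEtcd,
        Member: &rpcpb.Member{
            EtcdClientEndpoint: "127.0.0.1:1379",
        },
    }
    fmt.Printf("sending %s to %s\n", req.Operation, req.Member.EtcdClientEndpoint)
}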
|
@ -1,73 +0,0 @@
|
||||
// Copyright 2018 The etcd Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package tester
|
||||
|
||||
import "github.com/coreos/etcd/tools/functional-tester/rpcpb"
|
||||
|
||||
func injectKill(clus *Cluster, idx int) error {
|
||||
return clus.sendOperation(idx, rpcpb.Operation_KillEtcd)
|
||||
}
|
||||
|
||||
func recoverKill(clus *Cluster, idx int) error {
|
||||
return clus.sendOperation(idx, rpcpb.Operation_RestartEtcd)
|
||||
}
|
||||
|
||||
func newFailureKillOneFollower() Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
return &failureFollower{ff, -1, -1}
|
||||
}
|
||||
|
||||
func newFailureKillLeader() Failure {
|
||||
ff := failureByFunc{
|
||||
failureCase: rpcpb.FailureCase_KILL_LEADER,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
return &failureLeader{ff, -1, -1}
|
||||
}
|
||||
|
||||
func newFailureKillQuorum() Failure {
|
||||
return &failureQuorum{
|
||||
failureCase: rpcpb.FailureCase_KILL_QUORUM,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureKillAll() Failure {
|
||||
return &failureAll{
|
||||
failureCase: rpcpb.FailureCase_KILL_ALL,
|
||||
injectMember: injectKill,
|
||||
recoverMember: recoverKill,
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureKillOneFollowerUntilTriggerSnapshot() Failure {
|
||||
return &failureUntilSnapshot{
|
||||
failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
|
||||
Failure: newFailureKillOneFollower(),
|
||||
}
|
||||
}
|
||||
|
||||
func newFailureKillLeaderUntilTriggerSnapshot() Failure {
|
||||
return &failureUntilSnapshot{
|
||||
failureCase: rpcpb.FailureCase_KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT,
|
||||
Failure: newFailureKillLeader(),
|
||||
}
|
||||
}
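All of the constructors above return a Failure, whose definition lives elsewhere in the tester package. The following is only an inferred sketch of its shape, based on how failureByFunc, failureFollower, and failureUntilSnapshot are composed in this file; the real interface may differ:

// Inferred sketch of the Failure interface; not the actual definition.
type Failure interface {
    // Inject applies the failure to the cluster (e.g. kill members).
    Inject(clus *Cluster) error
    // Recover undoes the failure (e.g. restart killed members).
    Recover(clus *Cluster) error
    // Desc returns a description of the failure for logging.
    Desc() string
    // FailureCase reports which rpcpb.FailureCase this failure implements.
    FailureCase() rpcpb.FailureCase
}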