diff --git a/tools/functional-tester/etcd-tester/failure.go b/tools/functional-tester/etcd-tester/failure.go index 06e9ebd64..68f523631 100644 --- a/tools/functional-tester/etcd-tester/failure.go +++ b/tools/functional-tester/etcd-tester/failure.go @@ -20,18 +20,6 @@ import ( "time" ) -const ( - snapshotCount = 10000 - slowNetworkLatency = 500 // 500 millisecond - randomVariation = 50 - - // Wait more when it recovers from slow network, because network layer - // needs extra time to propagate traffic control (tc command) change. - // Otherwise, we get different hash values from the previous revision. - // For more detail, please see https://github.com/coreos/etcd/issues/5121. - waitRecover = 5 * time.Second -) - type failure interface { // Inject injeccts the failure into the testing cluster at the given // round. When calling the function, the cluster should be in health. @@ -47,63 +35,120 @@ type description string func (d description) Desc() string { return string(d) } -type failureKillAll struct { +type injectMemberFunc func(*member) error +type recoverMemberFunc func(*member) error + +type failureByFunc struct { description + injectMember injectMemberFunc + recoverMember recoverMemberFunc } -func newFailureKillAll() *failureKillAll { - return &failureKillAll{ - description: "kill all members", +type failureOne failureByFunc +type failureAll failureByFunc +type failureMajority failureByFunc +type failureLeader struct { + failureByFunc + idx int +} + +// failureDelay injects a failure and waits for a snapshot event +type failureDelay struct{ failure } + +func (f *failureOne) Inject(c *cluster, round int) error { + return f.injectMember(c.Members[round%c.Size]) +} + +func (f *failureOne) Recover(c *cluster, round int) error { + if err := f.recoverMember(c.Members[round%c.Size]); err != nil { + return err } + return c.WaitHealth() } -func (f *failureKillAll) Inject(c *cluster, round int) error { - for _, a := range c.Agents { - if err := a.Stop(); err != nil { 
+func (f *failureAll) Inject(c *cluster, round int) error { + for _, m := range c.Members { + if err := f.injectMember(m); err != nil { return err } } return nil } -func (f *failureKillAll) Recover(c *cluster, round int) error { - for _, a := range c.Agents { - if _, err := a.Restart(); err != nil { +func (f *failureAll) Recover(c *cluster, round int) error { + for _, m := range c.Members { + if err := f.recoverMember(m); err != nil { return err } } return c.WaitHealth() } -type failureKillMajority struct { - description -} - -func newFailureKillMajority() *failureKillMajority { - return &failureKillMajority{ - description: "kill majority of the cluster", - } -} - -func (f *failureKillMajority) Inject(c *cluster, round int) error { - for i := range getToKillMap(c.Size, round) { - if err := c.Agents[i].Stop(); err != nil { +func (f *failureMajority) Inject(c *cluster, round int) error { + for i := range killMap(c.Size, round) { + if err := f.injectMember(c.Members[i]); err != nil { return err } } return nil } -func (f *failureKillMajority) Recover(c *cluster, round int) error { - for i := range getToKillMap(c.Size, round) { - if _, err := c.Agents[i].Restart(); err != nil { +func (f *failureMajority) Recover(c *cluster, round int) error { + for i := range killMap(c.Size, round) { + if err := f.recoverMember(c.Members[i]); err != nil { return err } } + return c.WaitHealth() +} + +func (f *failureLeader) Inject(c *cluster, round int) error { + idx, err := c.GetLeader() + if err != nil { + return err + } + f.idx = idx + return f.injectMember(c.Members[idx]) +} + +func (f *failureLeader) Recover(c *cluster, round int) error { + if err := f.recoverMember(c.Members[f.idx]); err != nil { + return err + } return c.WaitHealth() } -func getToKillMap(size int, seed int) map[int]bool { +func (f *failureDelay) Inject(c *cluster, round int) error { + if err := f.failure.Inject(c, round); err != nil { + return err + } + + if c.Size < 3 { + return nil + } + + start, _ := c.Report() + end := 
start + // Normal healthy cluster could accept 1000req/s at least. + // Give it 3-times time to create a new snapshot. + retry := snapshotCount / 1000 * 3 + for j := 0; j < retry; j++ { + end, _ = c.Report() + // If the number of proposals committed is bigger than snapshot count, + // a new snapshot should have been created. + if end-start > snapshotCount { + return nil + } + time.Sleep(time.Second) + } + return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry) +} + +func (f *failureDelay) Desc() string { + return f.failure.Desc() + " for a long time and expect it to recover from an incoming snapshot" +} + +func killMap(size int, seed int) map[int]bool { m := make(map[int]bool) r := rand.New(rand.NewSource(int64(seed))) majority := size/2 + 1 @@ -114,288 +159,3 @@ func getToKillMap(size int, seed int) map[int]bool { } } } - -type failureKillOne struct { - description -} - -func newFailureKillOne() *failureKillOne { - return &failureKillOne{ - description: "kill one random member", - } -} - -func (f *failureKillOne) Inject(c *cluster, round int) error { - i := round % c.Size - return c.Agents[i].Stop() -} - -func (f *failureKillOne) Recover(c *cluster, round int) error { - i := round % c.Size - if _, err := c.Agents[i].Restart(); err != nil { - return err - } - return c.WaitHealth() -} - -type failureKillLeader struct { - description - idx int -} - -func newFailureKillLeader() *failureKillLeader { - return &failureKillLeader{ - description: "kill leader member", - } -} - -func (f *failureKillLeader) Inject(c *cluster, round int) error { - idx, err := c.GetLeader() - if err != nil { - return err - } - f.idx = idx - return c.Agents[idx].Stop() -} - -func (f *failureKillLeader) Recover(c *cluster, round int) error { - if _, err := c.Agents[f.idx].Restart(); err != nil { - return err - } - return c.WaitHealth() -} - -// failureKillOneForLongTime kills one member for long time, and restart -// after a snapshot is required. 
-type failureKillOneForLongTime struct { - description -} - -func newFailureKillOneForLongTime() *failureKillOneForLongTime { - return &failureKillOneForLongTime{ - description: "kill one member for long time and expect it to recover from incoming snapshot", - } -} - -func (f *failureKillOneForLongTime) Inject(c *cluster, round int) error { - i := round % c.Size - if err := c.Agents[i].Stop(); err != nil { - return err - } - if c.Size >= 3 { - start, _ := c.Report() - var end int - // Normal healthy cluster could accept 1000req/s at least. - // Give it 3-times time to create a new snapshot. - retry := snapshotCount / 1000 * 3 - for j := 0; j < retry; j++ { - end, _ = c.Report() - // If the number of proposals committed is bigger than snapshot count, - // a new snapshot should have been created. - if end-start > snapshotCount { - return nil - } - time.Sleep(time.Second) - } - return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry) - } - return nil -} - -func (f *failureKillOneForLongTime) Recover(c *cluster, round int) error { - i := round % c.Size - if _, err := c.Agents[i].Restart(); err != nil { - return err - } - return c.WaitHealth() -} - -// failureKillLeaderForLongTime kills the leader for long time, and restart -// after a snapshot is required. 
-type failureKillLeaderForLongTime struct { - description - idx int -} - -func newFailureKillLeaderForLongTime() *failureKillLeaderForLongTime { - return &failureKillLeaderForLongTime{ - description: "kill the leader for long time and expect it to recover from incoming snapshot", - } -} - -func (f *failureKillLeaderForLongTime) Inject(c *cluster, round int) error { - idx, err := c.GetLeader() - if err != nil { - return err - } - f.idx = idx - if err := c.Agents[idx].Stop(); err != nil { - return err - } - if c.Size >= 3 { - start, _ := c.Report() - var end int - retry := snapshotCount / 1000 * 3 - for j := 0; j < retry; j++ { - end, _ = c.Report() - if end-start > snapshotCount { - return nil - } - time.Sleep(time.Second) - } - return fmt.Errorf("cluster too slow: only commit %d requests in %ds", end-start, retry) - } - return nil -} - -func (f *failureKillLeaderForLongTime) Recover(c *cluster, round int) error { - if _, err := c.Agents[f.idx].Restart(); err != nil { - return err - } - return c.WaitHealth() -} - -type failureIsolate struct { - description -} - -func newFailureIsolate() *failureIsolate { - return &failureIsolate{ - description: "isolate one member", - } -} - -func (f *failureIsolate) Inject(c *cluster, round int) error { - i := round % c.Size - return c.Agents[i].DropPort(peerURLPort) -} - -func (f *failureIsolate) Recover(c *cluster, round int) error { - i := round % c.Size - if err := c.Agents[i].RecoverPort(peerURLPort); err != nil { - return err - } - return c.WaitHealth() -} - -type failureIsolateAll struct { - description -} - -func newFailureIsolateAll() *failureIsolateAll { - return &failureIsolateAll{ - description: "isolate all members", - } -} - -func (f *failureIsolateAll) Inject(c *cluster, round int) error { - for _, a := range c.Agents { - if err := a.DropPort(peerURLPort); err != nil { - return err - } - } - return nil -} - -func (f *failureIsolateAll) Recover(c *cluster, round int) error { - for _, a := range c.Agents { - if err := 
a.RecoverPort(peerURLPort); err != nil { - return err - } - } - return c.WaitHealth() -} - -type failureSlowNetworkOneMember struct { - description -} - -func newFailureSlowNetworkOneMember() *failureSlowNetworkOneMember { - desc := fmt.Sprintf("slow down one member's network by adding %d ms latency", slowNetworkLatency) - return &failureSlowNetworkOneMember{ - description: description(desc), - } -} - -func (f *failureSlowNetworkOneMember) Inject(c *cluster, round int) error { - i := round % c.Size - if err := c.Agents[i].SetLatency(slowNetworkLatency, randomVariation); err != nil { - c.Agents[i].RemoveLatency() // roll back - return err - } - return nil -} - -func (f *failureSlowNetworkOneMember) Recover(c *cluster, round int) error { - i := round % c.Size - if err := c.Agents[i].RemoveLatency(); err != nil { - return err - } - time.Sleep(waitRecover) - return c.WaitHealth() -} - -type failureSlowNetworkLeader struct { - description - idx int -} - -func newFailureSlowNetworkLeader() *failureSlowNetworkLeader { - desc := fmt.Sprintf("slow down leader's network by adding %d ms latency", slowNetworkLatency) - return &failureSlowNetworkLeader{ - description: description(desc), - } -} - -func (f *failureSlowNetworkLeader) Inject(c *cluster, round int) error { - idx, err := c.GetLeader() - if err != nil { - return err - } - f.idx = idx - if err := c.Agents[idx].SetLatency(slowNetworkLatency, randomVariation); err != nil { - c.Agents[idx].RemoveLatency() // roll back - return err - } - return nil -} - -func (f *failureSlowNetworkLeader) Recover(c *cluster, round int) error { - if err := c.Agents[f.idx].RemoveLatency(); err != nil { - return err - } - time.Sleep(waitRecover) - return c.WaitHealth() -} - -type failureSlowNetworkAll struct { - description -} - -func newFailureSlowNetworkAll() *failureSlowNetworkAll { - return &failureSlowNetworkAll{ - description: "slow down all members' network", - } -} - -func (f *failureSlowNetworkAll) Inject(c *cluster, round int) error 
{ - for i, a := range c.Agents { - if err := a.SetLatency(slowNetworkLatency, randomVariation); err != nil { - for j := 0; j < i; j++ { // roll back - c.Agents[j].RemoveLatency() - } - return err - } - } - return nil -} - -func (f *failureSlowNetworkAll) Recover(c *cluster, round int) error { - for _, a := range c.Agents { - if err := a.RemoveLatency(); err != nil { - return err - } - } - time.Sleep(waitRecover) - return c.WaitHealth() -} diff --git a/tools/functional-tester/etcd-tester/failure_agent.go b/tools/functional-tester/etcd-tester/failure_agent.go new file mode 100644 index 000000000..d5b5af60a --- /dev/null +++ b/tools/functional-tester/etcd-tester/failure_agent.go @@ -0,0 +1,141 @@ +// Copyright 2016 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "time" +) + +const ( + snapshotCount = 10000 + slowNetworkLatency = 500 // 500 millisecond + randomVariation = 50 + + // Wait more when it recovers from slow network, because network layer + // needs extra time to propagate traffic control (tc command) change. + // Otherwise, we get different hash values from the previous revision. + // For more detail, please see https://github.com/coreos/etcd/issues/5121. 
+ waitRecover = 5 * time.Second +) + +func injectStop(m *member) error { return m.Agent.Stop() } +func recoverStop(m *member) error { + _, err := m.Agent.Restart() + return err +} + +func newFailureKillAll() failure { + return &failureAll{ + description: "kill all members", + injectMember: injectStop, + recoverMember: recoverStop, + } +} + +func newFailureKillMajority() failure { + return &failureMajority{ + description: "kill majority of the cluster", + injectMember: injectStop, + recoverMember: recoverStop, + } +} + +func newFailureKillOne() failure { + return &failureOne{ + description: "kill one random member", + injectMember: injectStop, + recoverMember: recoverStop, + } +} + +func newFailureKillLeader() failure { + ff := failureByFunc{ + description: "kill leader member", + injectMember: injectStop, + recoverMember: recoverStop, + } + return &failureLeader{ff, 0} +} + +func newFailureKillOneForLongTime() failure { + return &failureDelay{newFailureKillOne()} +} + +func newFailureKillLeaderForLongTime() failure { + return &failureDelay{newFailureKillLeader()} +} + +func injectDropPort(m *member) error { return m.Agent.DropPort(peerURLPort) } +func recoverDropPort(m *member) error { return m.Agent.RecoverPort(peerURLPort) } + +func newFailureIsolate() failure { + return &failureOne{ + description: "isolate one member", + injectMember: injectDropPort, + recoverMember: recoverDropPort, + } +} + +func newFailureIsolateAll() failure { + return &failureAll{ + description: "isolate all members", + injectMember: injectDropPort, + recoverMember: recoverDropPort, + } +} + +func injectLatency(m *member) error { + if err := m.Agent.SetLatency(slowNetworkLatency, randomVariation); err != nil { + m.Agent.RemoveLatency() + return err + } + return nil +} + +func recoverLatency(m *member) error { + if err := m.Agent.RemoveLatency(); err != nil { + return err + } + time.Sleep(waitRecover) + return nil +} + +func newFailureSlowNetworkOneMember() failure { + desc := 
fmt.Sprintf("slow down one member's network by adding %d ms latency", slowNetworkLatency) + return &failureOne{ + description: description(desc), + injectMember: injectLatency, + recoverMember: recoverLatency, + } +} + +func newFailureSlowNetworkLeader() failure { + desc := fmt.Sprintf("slow down leader's network by adding %d ms latency", slowNetworkLatency) + ff := failureByFunc{ + description: description(desc), + injectMember: injectLatency, + recoverMember: recoverLatency, + } + return &failureLeader{ff, 0} +} + +func newFailureSlowNetworkAll() failure { + return &failureAll{ + description: "slow down all members' network", + injectMember: injectLatency, + recoverMember: recoverLatency, + } +}