// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"log"
	"sync"
	"time"

	"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
	"github.com/coreos/etcd/Godeps/_workspace/src/google.golang.org/grpc"
	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
)
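
// tester drives failure-injection rounds against the test cluster and
// tracks progress in status.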
type tester struct {
	failures []failure
	cluster  *cluster
	limit    int

	status Status
}
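
// runLoop runs up to limit rounds. Each round injects and recovers every
// configured failure case, then checks that all members report the same
// current revision and storage hash before compacting old revisions.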
func (tt *tester) runLoop() {
	tt.status.Since = time.Now()
	tt.status.RoundLimit = tt.limit
	tt.status.cluster = tt.cluster
	for _, f := range tt.failures {
		tt.status.Failures = append(tt.status.Failures, f.Desc())
	}

	for i := 0; i < tt.limit; i++ {
		tt.status.setRound(i)

		var currentRevision int64
		for j, f := range tt.failures {
			tt.status.setCase(j)

			if err := tt.cluster.WaitHealth(); err != nil {
				log.Printf("etcd-tester: [round#%d case#%d] wait full health error: %v", i, j, err)
				if err := tt.cleanup(i, j); err != nil {
					log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
					return
				}
				continue
			}

			log.Printf("etcd-tester: [round#%d case#%d] start failure %s", i, j, f.Desc())
			log.Printf("etcd-tester: [round#%d case#%d] start injecting failure...", i, j)
			if err := f.Inject(tt.cluster, i); err != nil {
				log.Printf("etcd-tester: [round#%d case#%d] injection error: %v", i, j, err)
				if err := tt.cleanup(i, j); err != nil {
					log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
					return
				}
				continue
			}
			log.Printf("etcd-tester: [round#%d case#%d] injected failure", i, j)

			log.Printf("etcd-tester: [round#%d case#%d] start recovering failure...", i, j)
			if err := f.Recover(tt.cluster, i); err != nil {
				log.Printf("etcd-tester: [round#%d case#%d] recovery error: %v", i, j, err)
				if err := tt.cleanup(i, j); err != nil {
					log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
					return
				}
				continue
			}
			log.Printf("etcd-tester: [round#%d case#%d] recovered failure", i, j)

			if tt.cluster.v2Only {
				log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
				continue
			}

			log.Printf("etcd-tester: [round#%d case#%d] canceling the stressers...", i, j)
			for _, s := range tt.cluster.Stressers {
				s.Cancel()
			}
			log.Printf("etcd-tester: [round#%d case#%d] canceled stressers", i, j)

			log.Printf("etcd-tester: [round#%d case#%d] checking current revisions...", i, j)
			var (
				revs map[string]int64
				rerr error
				ok   bool
			)
			for k := 0; k < 5; k++ {
				time.Sleep(time.Second)

				revs, rerr = tt.cluster.getRevision()
				if rerr != nil {
					log.Printf("etcd-tester: [round#%d case#%d.%d] failed to get current revisions (%v)", i, j, k, rerr)
					continue
				}
				if currentRevision, ok = getSameValue(revs); ok {
					break
				}
				log.Printf("etcd-tester: [round#%d case#%d.%d] inconsistent current revisions %+v", i, j, k, revs)
			}
			if !ok || rerr != nil {
				log.Printf("etcd-tester: [round#%d case#%d] checking current revisions failed (%v)", i, j, revs)
				if err := tt.cleanup(i, j); err != nil {
					log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
					return
				}
				continue
			}
			log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with current revisions", i, j)

			log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes...", i, j)
			hashes, err := tt.cluster.getKVHash()
			if err != nil {
				log.Printf("etcd-tester: [round#%d case#%d] getKVHash error (%v)", i, j, err)
				if err := tt.cleanup(i, j); err != nil {
					log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
					return
				}
				continue
			}
			if _, ok = getSameValue(hashes); !ok {
				log.Printf("etcd-tester: [round#%d case#%d] checking current storage hashes failed (%v)", i, j, hashes)
				if err := tt.cleanup(i, j); err != nil {
					log.Printf("etcd-tester: [round#%d case#%d] cleanup error: %v", i, j, err)
					return
				}
				continue
			}
			log.Printf("etcd-tester: [round#%d case#%d] all members are consistent with storage hashes", i, j)

			log.Printf("etcd-tester: [round#%d case#%d] restarting the stressers...", i, j)
			for _, s := range tt.cluster.Stressers {
				go s.Stress()
			}

			log.Printf("etcd-tester: [round#%d case#%d] succeed!", i, j)
		}

		revToCompact := max(0, currentRevision-10000)
		log.Printf("etcd-tester: [round#%d] compacting storage at %d (current revision %d)", i, revToCompact, currentRevision)
		if err := tt.cluster.compactKV(revToCompact); err != nil {
			log.Printf("etcd-tester: [round#%d] compactKV error (%v)", i, err)
			if err := tt.cleanup(i, 0); err != nil {
				log.Printf("etcd-tester: [round#%d] cleanup error: %v", i, err)
				return
			}
			continue
		}
		log.Printf("etcd-tester: [round#%d] compacted storage", i)

		// TODO: make sure compaction is finished
		time.Sleep(30 * time.Second)
	}
}
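
// cleanup tears down the cluster after a failed case and bootstraps a
// fresh one so the next case starts from a clean state.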
func (tt *tester) cleanup(i, j int) error {
	log.Printf("etcd-tester: [round#%d case#%d] cleaning up...", i, j)
	if err := tt.cluster.Cleanup(); err != nil {
		return err
	}
	return tt.cluster.Bootstrap()
}
type Status struct {
	Since      time.Time
	Failures   []string
	RoundLimit int

	Cluster ClusterStatus
	cluster *cluster

	mu    sync.Mutex // guards Round and Case
	Round int
	Case  int
}
// get returns a copy of the status; the cluster status is refreshed
// outside the lock.
func (s *Status) get() Status {
	s.mu.Lock()
	got := *s
	cluster := s.cluster
	s.mu.Unlock()

	got.Cluster = cluster.Status()
	return got
}
func (s *Status) setRound(r int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.Round = r
}
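
// setCase records the current failure-case index under the status lock.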
func (s *Status) setCase(c int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.Case = c
}
func (c *cluster) getRevision() (map[string]int64, error) {
	revs := make(map[string]int64)
	for _, u := range c.GRPCURLs {
		conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
		if err != nil {
			return nil, err
		}
		kvc := pb.NewKVClient(conn)

		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		resp, err := kvc.Range(ctx, &pb.RangeRequest{Key: []byte("foo")})
		cancel()
		// release the member connection before moving on or returning
		conn.Close()
		if err != nil {
			return nil, err
		}
		revs[u] = resp.Header.Revision
	}
	return revs, nil
}
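
// getKVHash asks every member for its storage hash, keyed by the
// member's gRPC URL, so callers can compare them for consistency.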
func (c *cluster) getKVHash() (map[string]int64, error) {
	hashes := make(map[string]int64)
	for _, u := range c.GRPCURLs {
		conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
		if err != nil {
			return nil, err
		}
		kvc := pb.NewKVClient(conn)

		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		resp, err := kvc.Hash(ctx, &pb.HashRequest{})
		cancel()
		// release the member connection before moving on or returning
		conn.Close()
		if err != nil {
			return nil, err
		}
		hashes[u] = int64(resp.Hash)
	}
	return hashes, nil
}
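
// compactKV compacts the key-value storage at the given revision, trying
// each member in turn until one of them accepts the compaction request.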
func (c *cluster) compactKV(rev int64) error {
	var (
		conn *grpc.ClientConn
		err  error
	)
	for _, u := range c.GRPCURLs {
		conn, err = grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
		if err != nil {
			continue
		}
		kvc := pb.NewKVClient(conn)

		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		_, err = kvc.Compact(ctx, &pb.CompactionRequest{Revision: rev})
		cancel()
		conn.Close()
		if err == nil {
			return nil
		}
	}
	return err
}