Gyu-Ho Lee f9b90e13ac etcd-tester: no error for compact double-send
When a compactKV request is halted before the final acknowledgement,
the tester used to just continue on the next endpoint. But there could be
a case where compactKV is requested twice, and the first request is already
replicated and applied by the time the second one is applied
(returning a compact revision error). This change skips that case
by parsing the error message.
2016-03-31 21:29:02 -07:00
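
The gist: when a later endpoint rejects the duplicated compaction with a
"required revision has been compacted" error, the tester treats it as benign
instead of failing the round. A minimal sketch of that check follows; the
helper name is illustrative only, and the actual logic is inline in compactKV
further down this file.

// isBenignDoubleCompaction is a hypothetical helper (not part of this file)
// showing the shape of the check: a compaction error only counts as benign
// when it reports an already-compacted revision and an earlier endpoint has
// already been asked to compact (endpointIndex > 0).
func isBenignDoubleCompaction(err error, endpointIndex int) bool {
	return err != nil && endpointIndex > 0 &&
		strings.Contains(err.Error(), "required revision has been compacted")
}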

// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
	"fmt"
	"math/rand"
	"net"
	"strings"
	"time"

	"golang.org/x/net/context"
	"google.golang.org/grpc"

	clientv2 "github.com/coreos/etcd/client"
	"github.com/coreos/etcd/clientv3"
	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
	"github.com/coreos/etcd/tools/functional-tester/etcd-agent/client"
)

const peerURLPort = 2380

type cluster struct {
	v2Only bool // to be deprecated

	agentEndpoints       []string
	datadir              string
	stressKeySize        int
	stressKeySuffixRange int

	Size       int
	Agents     []client.Agent
	Stressers  []Stresser
	Names      []string
	GRPCURLs   []string
	ClientURLs []string
}

type ClusterStatus struct {
	AgentStatuses map[string]client.Status
}

// newCluster starts and returns a new cluster. The caller should call Terminate when finished, to shut it down.
func newCluster(agentEndpoints []string, datadir string, stressKeySize, stressKeySuffixRange int, isV2Only bool) (*cluster, error) {
	c := &cluster{
		v2Only:               isV2Only,
		agentEndpoints:       agentEndpoints,
		datadir:              datadir,
		stressKeySize:        stressKeySize,
		stressKeySuffixRange: stressKeySuffixRange,
	}
	if err := c.Bootstrap(); err != nil {
		return nil, err
	}
	return c, nil
}

func (c *cluster) Bootstrap() error {
	size := len(c.agentEndpoints)
	agents := make([]client.Agent, size)
	names := make([]string, size)
	grpcURLs := make([]string, size)
	clientURLs := make([]string, size)
	peerURLs := make([]string, size)
	members := make([]string, size)
	for i, u := range c.agentEndpoints {
		var err error
		agents[i], err = client.NewAgent(u)
		if err != nil {
			return err
		}
		names[i] = fmt.Sprintf("etcd-%d", i)
		host, _, err := net.SplitHostPort(u)
		if err != nil {
			return err
		}
		grpcURLs[i] = fmt.Sprintf("%s:2379", host)
		clientURLs[i] = fmt.Sprintf("http://%s:2379", host)
		peerURLs[i] = fmt.Sprintf("http://%s:%d", host, peerURLPort)
		members[i] = fmt.Sprintf("%s=%s", names[i], peerURLs[i])
	}
	clusterStr := strings.Join(members, ",")
	token := fmt.Sprint(rand.Int())

	for i, a := range agents {
		flags := []string{
			"--name", names[i],
			"--data-dir", c.datadir,
			"--listen-client-urls", clientURLs[i],
			"--advertise-client-urls", clientURLs[i],
			"--listen-peer-urls", peerURLs[i],
			"--initial-advertise-peer-urls", peerURLs[i],
			"--initial-cluster-token", token,
			"--initial-cluster", clusterStr,
			"--initial-cluster-state", "new",
		}
		if _, err := a.Start(flags...); err != nil {
			// cleanup
			for j := 0; j < i; j++ {
				agents[j].Terminate()
			}
			return err
		}
	}

	// TODO: Overly intensive stressers can make an etcd member panic with an
	// 'out of memory' error. Add rate limits on the server side.
	stressN := 100
	var stressers []Stresser
	if c.v2Only {
		for _, u := range clientURLs {
			s := &stresserV2{
				Endpoint:       u,
				KeySize:        c.stressKeySize,
				KeySuffixRange: c.stressKeySuffixRange,
				N:              stressN,
			}
			go s.Stress()
			stressers = append(stressers, s)
		}
	} else {
		for _, u := range grpcURLs {
			s := &stresser{
				Endpoint:       u,
				KeySize:        c.stressKeySize,
				KeySuffixRange: c.stressKeySuffixRange,
				N:              stressN,
			}
			go s.Stress()
			stressers = append(stressers, s)
		}
	}

	c.Size = size
	c.Agents = agents
	c.Stressers = stressers
	c.Names = names
	c.GRPCURLs = grpcURLs
	c.ClientURLs = clientURLs
	return nil
}

func (c *cluster) WaitHealth() error {
	var err error
	// Retry up to 60 times (once per second) for the cluster to become healthy.
	// TODO: set this to a more reasonable value. It is set this high because a
	// follower may take a long time to catch up with the leader after a reboot
	// under a reasonable workload (https://github.com/coreos/etcd/issues/2698).
	healthFunc, urls := setHealthKey, c.GRPCURLs
	if c.v2Only {
		healthFunc, urls = setHealthKeyV2, c.ClientURLs
	}
	for i := 0; i < 60; i++ {
		err = healthFunc(urls)
		if err == nil {
			return nil
		}
		time.Sleep(time.Second)
	}
	return err
}

// GetLeader returns the index of the cluster leader, or an error if no leader is found.
func (c *cluster) GetLeader() (int, error) {
	if c.v2Only {
		return 0, nil
	}
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   c.GRPCURLs,
		DialTimeout: 5 * time.Second,
	})
	if err != nil {
		return 0, err
	}
	defer cli.Close()
	clus := clientv3.NewCluster(cli)
	mem, err := clus.MemberLeader(context.Background())
	if err != nil {
		return 0, err
	}
	for i, name := range c.Names {
		if name == mem.Name {
			return i, nil
		}
	}
	return 0, fmt.Errorf("no leader found")
}

func (c *cluster) Report() (success, failure int) {
	for _, stress := range c.Stressers {
		s, f := stress.Report()
		success += s
		failure += f
	}
	return
}

func (c *cluster) Cleanup() error {
	var lasterr error
	for _, a := range c.Agents {
		if err := a.Cleanup(); err != nil {
			lasterr = err
		}
	}
	for _, s := range c.Stressers {
		s.Cancel()
	}
	return lasterr
}

func (c *cluster) Terminate() {
	for _, a := range c.Agents {
		a.Terminate()
	}
	for _, s := range c.Stressers {
		s.Cancel()
	}
}

func (c *cluster) Status() ClusterStatus {
	cs := ClusterStatus{
		AgentStatuses: make(map[string]client.Status),
	}

	for i, a := range c.Agents {
		s, err := a.Status()
		// TODO: add a.Desc() as a key of the map
		desc := c.agentEndpoints[i]
		if err != nil {
			cs.AgentStatuses[desc] = client.Status{State: "unknown"}
			plog.Printf("failed to get the status of agent [%s]", desc)
			continue // keep the "unknown" state instead of overwriting it below
		}
		cs.AgentStatuses[desc] = s
	}
	return cs
}

// setHealthKey sets health key on all given urls.
func setHealthKey(us []string) error {
	for _, u := range us {
		conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
		if err != nil {
			return fmt.Errorf("%v (%s)", err, u)
		}
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		kvc := pb.NewKVClient(conn)
		_, err = kvc.Put(ctx, &pb.PutRequest{Key: []byte("health"), Value: []byte("good")})
		cancel()
		conn.Close()
		if err != nil {
			return err
		}
	}
	return nil
}

// setHealthKeyV2 sets health key on all given urls.
func setHealthKeyV2(us []string) error {
	for _, u := range us {
		cfg := clientv2.Config{
			Endpoints: []string{u},
		}
		c, err := clientv2.New(cfg)
		if err != nil {
			return err
		}
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		kapi := clientv2.NewKeysAPI(c)
		_, err = kapi.Set(ctx, "health", "good", nil)
		cancel()
		if err != nil {
			return err
		}
	}
	return nil
}

// getRevisionHash returns the current store revision and the KV hash reported
// by each member, keyed by gRPC endpoint.
func (c *cluster) getRevisionHash() (map[string]int64, map[string]int64, error) {
	revs := make(map[string]int64)
	hashes := make(map[string]int64)
	for _, u := range c.GRPCURLs {
		conn, err := grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
		if err != nil {
			return nil, nil, err
		}
		m := pb.NewMaintenanceClient(conn)
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		resp, err := m.Hash(ctx, &pb.HashRequest{})
		cancel()
		conn.Close()
		if err != nil {
			return nil, nil, err
		}
		revs[u] = resp.Header.Revision
		hashes[u] = int64(resp.Hash)
	}
	return revs, hashes, nil
}

// compactKV sends a compaction request for rev to every gRPC endpoint.
// An "already compacted" error on a later endpoint is treated as a benign
// duplicated request rather than a failure.
func (c *cluster) compactKV(rev int64) error {
	var (
		conn *grpc.ClientConn
		err  error
	)
	if rev <= 0 {
		return nil
	}

	for i, u := range c.GRPCURLs {
		conn, err = grpc.Dial(u, grpc.WithInsecure(), grpc.WithTimeout(5*time.Second))
		if err != nil {
			continue
		}
		kvc := pb.NewKVClient(conn)
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		_, err = kvc.Compact(ctx, &pb.CompactionRequest{Revision: rev, Physical: true})
		cancel()
		conn.Close()
		if err != nil {
			if strings.Contains(err.Error(), "required revision has been compacted") && i > 0 {
				plog.Printf("%s is already compacted with %d (%v)", u, rev, err)
				err = nil // in case compact was requested more than once
			}
		}
	}
	return err
}

// checkCompact verifies that every member reports the expected compact
// revision by starting a watch just below it and inspecting the returned
// CompactRevision.
func (c *cluster) checkCompact(rev int64) error {
	if rev == 0 {
		return nil
	}
	for _, u := range c.GRPCURLs {
		cli, err := clientv3.New(clientv3.Config{
			Endpoints:   []string{u},
			DialTimeout: 5 * time.Second,
		})
		if err != nil {
			return err
		}
		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		wch := cli.Watch(ctx, "\x00", clientv3.WithFromKey(), clientv3.WithRev(rev-1))
		wr, ok := <-wch
		cancel()
		cli.Close()
		if !ok {
			return fmt.Errorf("watch channel terminated")
		}
		if wr.CompactRevision != rev {
			return fmt.Errorf("got compact revision %v, wanted %v", wr.CompactRevision, rev)
		}
	}
	return nil
}
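
For context, here is a minimal sketch of how a caller might chain these helpers
together. compactAndVerify is a hypothetical name used only for illustration;
the actual sequencing lives in the tester that drives the cluster, outside this
file.

// compactAndVerify (illustrative only) waits for the cluster to become
// healthy, compacts every member up to rev, then verifies that each member
// reports rev as its compact revision.
func compactAndVerify(c *cluster, rev int64) error {
	if err := c.WaitHealth(); err != nil {
		return err
	}
	// Duplicated compaction errors on later endpoints are tolerated inside compactKV.
	if err := c.compactKV(rev); err != nil {
		return err
	}
	return c.checkCompact(rev)
}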