From 7bbdad9068898117b967b736837cb209c0ca3e72 Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Mon, 9 Feb 2015 14:24:25 -0800 Subject: [PATCH] etcdctl: support healthy checking --- etcdctl/command/cluster_health.go | 140 ++++++++++++++++++++++++++++++ etcdctl/main.go | 1 + 2 files changed, 141 insertions(+) create mode 100644 etcdctl/command/cluster_health.go diff --git a/etcdctl/command/cluster_health.go b/etcdctl/command/cluster_health.go new file mode 100644 index 000000000..eb4d43a08 --- /dev/null +++ b/etcdctl/command/cluster_health.go @@ -0,0 +1,140 @@ +package command + +import ( + "encoding/json" + "errors" + "fmt" + "net/http" + "os" + "sort" + "strings" + "time" + + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli" + "github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd" + "github.com/coreos/etcd/etcdserver/stats" +) + +func NewClusterHealthCommand() cli.Command { + return cli.Command{ + Name: "cluster-health", + Usage: "check the health of the etcd cluster", + Flags: []cli.Flag{}, + Action: handleClusterHealth, + } +} + +func handleClusterHealth(c *cli.Context) { + endpoints, err := getEndpoints(c) + if err != nil { + handleError(ErrorFromEtcd, err) + } + tr, err := getTransport(c) + if err != nil { + handleError(ErrorFromEtcd, err) + } + + client := etcd.NewClient(endpoints) + client.SetTransport(tr) + + if c.GlobalBool("debug") { + go dumpCURL(client) + } + + if ok := client.SyncCluster(); !ok { + handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", "))) + } + + // do we have a leader? + ep, ls0, err := getLeaderStats(tr, client.GetCluster()) + if err != nil { + fmt.Println("cluster is unhealthy") + os.Exit(1) + } + + // is raft stable and making progress? + client = etcd.NewClient([]string{ep}) + resp, err := client.Get("/", false, false) + if err != nil { + fmt.Println("cluster is unhealthy") + os.Exit(1) + } + rt0, ri0 := resp.RaftTerm, resp.RaftIndex + time.Sleep(time.Second) + + resp, err = client.Get("/", false, false) + if err != nil { + fmt.Println("cluster is unhealthy") + os.Exit(1) + } + rt1, ri1 := resp.RaftTerm, resp.RaftIndex + + if rt0 != rt1 { + fmt.Println("cluster is unhealthy") + os.Exit(1) + } + + if ri1 == ri0 { + fmt.Println("cluster is unhealthy") + os.Exit(1) + } + + // are all the members makeing progress? + _, ls1, err := getLeaderStats(tr, []string{ep}) + if err != nil { + fmt.Println("cluster is unhealthy") + os.Exit(1) + } + + fmt.Println("cluster is healthy") + // self is healthy + var prints []string + + prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader)) + for name, fs0 := range ls0.Followers { + fs1, ok := ls1.Followers[name] + if !ok { + fmt.Println("Cluster configuration changed during health checking. Please retry.") + os.Exit(1) + } + if fs1.Counts.Success <= fs0.Counts.Success { + prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name)) + } else { + prints = append(prints, fmt.Sprintf("member %s is healthy\n", name)) + } + } + + sort.Strings(prints) + for _, p := range prints { + fmt.Print(p) + } + os.Exit(0) +} + +func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) { + // go-etcd does not support cluster stats, use http client for now + // TODO: use new etcd client with new member/stats endpoint + httpclient := http.Client{ + Transport: tr, + } + + for _, ep := range endpoints { + resp, err := httpclient.Get(ep + "/v2/stats/leader") + if err != nil { + continue + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + continue + } + + ls := &stats.LeaderStats{} + d := json.NewDecoder(resp.Body) + err = d.Decode(ls) + if err != nil { + continue + } + return ep, ls, nil + } + return "", nil, errors.New("no leader") +} diff --git a/etcdctl/main.go b/etcdctl/main.go index 6687ae683..6083a85fc 100644 --- a/etcdctl/main.go +++ b/etcdctl/main.go @@ -39,6 +39,7 @@ func main() { } app.Commands = []cli.Command{ command.NewBackupCommand(), + command.NewClusterHealthCommand(), command.NewMakeCommand(), command.NewMakeDirCommand(), command.NewRemoveCommand(),