From 568d1c678387a59b550affa7ee4a5f87ea34851a Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Tue, 18 Aug 2015 15:04:41 -0700
Subject: [PATCH] etcdctl: use health endpoint to greatly simplify health checking

Probe each member's client URLs at /health instead of comparing raft
progress read from /debug/vars. A member is reported from the first
client URL that returns a decodable result; a member with no such URL
is reported unreachable. The cluster summary is "healthy" when at
least one member reports Health == "true".

---
 etcdctl/command/cluster_health.go | 140 ++++++++----------------------
 1 file changed, 37 insertions(+), 103 deletions(-)

diff --git a/etcdctl/command/cluster_health.go b/etcdctl/command/cluster_health.go
index cd9f2418b..f7dd9656b 100644
--- a/etcdctl/command/cluster_health.go
+++ b/etcdctl/command/cluster_health.go
@@ -2,12 +2,10 @@ package command
 
 import (
 	"encoding/json"
-	"errors"
 	"fmt"
 	"net/http"
 	"os"
 	"os/signal"
-	"sort"
 	"time"
 
 	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
@@ -42,124 +40,60 @@ func handleClusterHealth(c *cli.Context) {
 		handleError(ExitServerError, err)
 	}
 
-	// TODO: update members when forever is set.
+	hc := http.Client{
+		Transport: tr,
+	}
+
 	mi := mustNewMembersAPI(c)
 	ms, err := mi.List(context.TODO())
 	if err != nil {
 		fmt.Println("cluster may be unhealthy: failed to list members")
 		handleError(ExitServerError, err)
 	}
-	cl := make([]string, 0)
-	for _, m := range ms {
-		cl = append(cl, m.ClientURLs...)
-	}
 
 	for {
-		// check the /health endpoint of all members first
+		health := false
+		for _, m := range ms {
+			checked := false
+			for _, url := range m.ClientURLs {
+				resp, err := hc.Get(url + "/health")
+				if err != nil {
+					fmt.Printf("failed to check the health of member %s on %s: %v\n", m.ID, url, err)
+					continue
+				}
 
-		ep, rs0, err := getLeaderStatus(tr, cl)
-		if err != nil {
-			fmt.Println("cluster may be unhealthy: failed to connect", cl)
-			if forever {
-				time.Sleep(10 * time.Second)
-				continue
+				result := struct{ Health string }{}
+				d := json.NewDecoder(resp.Body)
+				err = d.Decode(&result)
+				resp.Body.Close()
+				if err != nil {
+					fmt.Printf("failed to check the health of member %s on %s: %v\n", m.ID, url, err)
+					continue
+				}
+
+				checked = true
+				if result.Health == "true" {
+					health = true
+					fmt.Printf("member %s is healthy: got healthy result from %s\n", m.ID, url)
+				} else {
+					fmt.Printf("member %s is unhealthy: got unhealthy result from %s\n", m.ID, url)
+				}
+				break
 			}
-			os.Exit(1)
-		}
-
-		time.Sleep(time.Second)
-
-		// are all the members makeing progress?
-		_, rs1, err := getLeaderStatus(tr, []string{ep})
-		if err != nil {
-			fmt.Println("cluster is unhealthy")
-			if forever {
-				time.Sleep(10 * time.Second)
-				continue
+			if !checked {
+				fmt.Printf("member %s is unreachable: %v are all unreachable\n", m.ID, m.ClientURLs)
 			}
-			os.Exit(1)
 		}
-
-		if rs1.Commit > rs0.Commit {
-			fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
+		if health {
+			fmt.Println("cluster is healthy")
 		} else {
-			fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
-		}
-		fmt.Printf("leader is %v\n", rs0.Lead)
-
-		var prints []string
-
-		for id, pr0 := range rs0.Progress {
-			pr1, ok := rs1.Progress[id]
-			if !ok {
-				// TODO: forever should handle configuration change.
-				fmt.Println("Cluster configuration changed during health checking. Please retry.")
-				os.Exit(1)
-			}
-			if pr1.Match <= pr0.Match {
-				prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
-			} else {
-				prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
-			}
-		}
-
-		sort.Strings(prints)
-		for _, p := range prints {
-			fmt.Print(p)
+			fmt.Println("cluster is unhealthy")
 		}
 
 		if !forever {
-			return
+			break
 		}
-
+		fmt.Printf("\nnext check after 10 second...\n\n")
 		time.Sleep(10 * time.Second)
 	}
 }
-
-type raftStatus struct {
-	ID        string `json:"id"`
-	Term      uint64 `json:"term"`
-	Vote      string `json:"vote"`
-	Commit    uint64 `json:"commit"`
-	Lead      string `json:"lead"`
-	RaftState string `json:"raftState"`
-	Progress  map[string]struct {
-		Match uint64 `json:"match"`
-		Next  uint64 `json:"next"`
-		State string `json:"state"`
-	} `json:"progress"`
-}
-
-type vars struct {
-	RaftStatus raftStatus `json:"raft.status"`
-}
-
-func getLeaderStatus(tr *http.Transport, endpoints []string) (string, raftStatus, error) {
-	// TODO: use new etcd client
-	httpclient := http.Client{
-		Transport: tr,
-	}
-
-	for _, ep := range endpoints {
-		resp, err := httpclient.Get(ep + "/debug/vars")
-		if err != nil {
-			continue
-		}
-		defer resp.Body.Close()
-		if resp.StatusCode != http.StatusOK {
-			continue
-		}
-
-		vs := &vars{}
-		d := json.NewDecoder(resp.Body)
-		err = d.Decode(vs)
-		if err != nil {
-			continue
-		}
-		if vs.RaftStatus.Lead != vs.RaftStatus.ID {
-			continue
-		}
-		return ep, vs.RaftStatus, nil
-	}
-	return "", raftStatus{}, errors.New("no leader")
-}