mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00

This method uses the raft status exposed at /debug/vars to determine the health of the cluster. It uses whether the commit index increases to determine cluster health, and whether the match index increases to determine member health. This could fix bug #2711, which fails to detect that a follower is unhealthy, because the check no longer relies on whether a message is sent on the long-polling connection. This health check is stricter than the old one, and reflects whether the followers are healthy from the leader's point of view. For example, a follower that is receiving a snapshot will be reported as unhealthy because it is not moving forward. `etcdctl cluster-health` will reflect the health view at the raft level, while connectivity checks reflect the health view at the transport level.
135 lines
3.1 KiB
Go
135 lines
3.1 KiB
Go
package command
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net/http"
|
|
"os"
|
|
"sort"
|
|
"time"
|
|
|
|
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
|
|
"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
|
|
)
|
|
|
|
func NewClusterHealthCommand() cli.Command {
|
|
return cli.Command{
|
|
Name: "cluster-health",
|
|
Usage: "check the health of the etcd cluster",
|
|
Flags: []cli.Flag{},
|
|
Action: handleClusterHealth,
|
|
}
|
|
}
|
|
|
|
func handleClusterHealth(c *cli.Context) {
|
|
tr, err := getTransport(c)
|
|
if err != nil {
|
|
handleError(ExitServerError, err)
|
|
}
|
|
|
|
mi := mustNewMembersAPI(c)
|
|
ms, err := mi.List(context.TODO())
|
|
if err != nil {
|
|
handleError(ExitServerError, err)
|
|
}
|
|
|
|
cl := make([]string, 0)
|
|
for _, m := range ms {
|
|
cl = append(cl, m.ClientURLs...)
|
|
}
|
|
|
|
// check the /health endpoint of all members first
|
|
|
|
ep, rs0, err := getLeaderStatus(tr, cl)
|
|
if err != nil {
|
|
fmt.Println("cluster may be unhealthy: failed to connect", cl)
|
|
os.Exit(1)
|
|
}
|
|
|
|
time.Sleep(time.Second)
|
|
|
|
// are all the members makeing progress?
|
|
_, rs1, err := getLeaderStatus(tr, []string{ep})
|
|
if err != nil {
|
|
fmt.Println("cluster is unhealthy")
|
|
os.Exit(1)
|
|
}
|
|
|
|
if rs1.Commit > rs0.Commit {
|
|
fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
|
|
} else {
|
|
fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
|
|
}
|
|
fmt.Printf("leader is %v\n", rs0.Lead)
|
|
|
|
var prints []string
|
|
|
|
for id, pr0 := range rs0.Progress {
|
|
pr1, ok := rs1.Progress[id]
|
|
if !ok {
|
|
fmt.Println("Cluster configuration changed during health checking. Please retry.")
|
|
os.Exit(1)
|
|
}
|
|
if pr1.Match <= pr0.Match {
|
|
prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
|
|
} else {
|
|
prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
|
|
}
|
|
}
|
|
|
|
sort.Strings(prints)
|
|
for _, p := range prints {
|
|
fmt.Print(p)
|
|
}
|
|
os.Exit(0)
|
|
}
|
|
|
|
// raftStatus is the JSON shape of the raft status object that a member
// publishes under the "raft.status" key of its /debug/vars output (see vars
// and getLeaderStatus).
type raftStatus struct {
	// ID is the identifier of the reporting member. getLeaderStatus treats
	// the member as the leader when ID equals Lead.
	ID string `json:"id"`
	// NOTE(review): Term and Vote are decoded but never read in this file —
	// presumably the current raft term and the member voted for; confirm
	// against the server-side encoder.
	Term uint64 `json:"term"`
	Vote string `json:"vote"`
	// Commit is the commit index; handleClusterHealth compares it across two
	// samples to decide whether the cluster is making progress.
	Commit uint64 `json:"commit"`
	// Lead is the identifier of the member this member regards as leader.
	Lead string `json:"lead"`
	// NOTE(review): RaftState is decoded but not read in this file.
	RaftState string `json:"raftState"`
	// Progress is keyed by member identifier. handleClusterHealth compares
	// each entry's Match value across two samples to decide whether that
	// member is making progress. Next and State are decoded but not read in
	// this file.
	Progress map[string]struct {
		Match uint64 `json:"match"`
		Next  uint64 `json:"next"`
		State string `json:"state"`
	} `json:"progress"`
}
|
|
|
|
// vars is the subset of a member's /debug/vars JSON document that this
// command decodes: only the "raft.status" entry is extracted, every other
// key in the document is ignored by the decoder.
type vars struct {
	RaftStatus raftStatus `json:"raft.status"`
}
|
|
|
|
func getLeaderStatus(tr *http.Transport, endpoints []string) (string, raftStatus, error) {
|
|
// TODO: use new etcd client
|
|
httpclient := http.Client{
|
|
Transport: tr,
|
|
}
|
|
|
|
for _, ep := range endpoints {
|
|
resp, err := httpclient.Get(ep + "/debug/vars")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != http.StatusOK {
|
|
continue
|
|
}
|
|
|
|
vs := &vars{}
|
|
d := json.NewDecoder(resp.Body)
|
|
err = d.Decode(vs)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if vs.RaftStatus.Lead != vs.RaftStatus.ID {
|
|
continue
|
|
}
|
|
return ep, vs.RaftStatus, nil
|
|
}
|
|
return "", raftStatus{}, errors.New("no leader")
|
|
}
|