etcd/etcdctl/command/cluster_health.go
Yicheng Qin 3d8fe3b3ca etcdctl/cluster_health: improve output if failed to get leader stats
When failing to get leader stats, it said 'cluster is unhealthy' before.
This is confusing when it cannot get stats because advertised client urls
are set wrong and the cluster is healthy.
2015-05-14 18:52:10 -07:00

143 lines
3.2 KiB
Go

package command
import (
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
"sort"
"strings"
"time"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
"github.com/coreos/etcd/etcdserver/stats"
)
func NewClusterHealthCommand() cli.Command {
return cli.Command{
Name: "cluster-health",
Usage: "check the health of the etcd cluster",
Flags: []cli.Flag{},
Action: handleClusterHealth,
}
}
func handleClusterHealth(c *cli.Context) {
endpoints, err := getEndpoints(c)
if err != nil {
handleError(ErrorFromEtcd, err)
}
tr, err := getTransport(c)
if err != nil {
handleError(ErrorFromEtcd, err)
}
client := etcd.NewClient(endpoints)
client.SetTransport(tr)
if c.GlobalBool("debug") {
go dumpCURL(client)
}
if ok := client.SyncCluster(); !ok {
handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
}
// do we have a leader?
cl := client.GetCluster()
ep, ls0, err := getLeaderStats(tr, cl)
if err != nil {
fmt.Println("cluster may be unhealthy: failed to connect", cl)
os.Exit(1)
}
// is raft stable and making progress?
client = etcd.NewClient([]string{ep})
client.SetTransport(tr)
resp, err := client.Get("/", false, false)
if err != nil {
fmt.Println("cluster is unhealthy")
os.Exit(1)
}
rt0, ri0 := resp.RaftTerm, resp.RaftIndex
time.Sleep(time.Second)
resp, err = client.Get("/", false, false)
if err != nil {
fmt.Println("cluster is unhealthy")
os.Exit(1)
}
rt1, ri1 := resp.RaftTerm, resp.RaftIndex
if rt0 != rt1 {
fmt.Println("cluster is unhealthy")
os.Exit(1)
}
if ri1 == ri0 {
fmt.Println("cluster is unhealthy")
os.Exit(1)
}
// are all the members makeing progress?
_, ls1, err := getLeaderStats(tr, []string{ep})
if err != nil {
fmt.Println("cluster is unhealthy")
os.Exit(1)
}
fmt.Println("cluster is healthy")
// self is healthy
var prints []string
prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
for name, fs0 := range ls0.Followers {
fs1, ok := ls1.Followers[name]
if !ok {
fmt.Println("Cluster configuration changed during health checking. Please retry.")
os.Exit(1)
}
if fs1.Counts.Success <= fs0.Counts.Success {
prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
} else {
prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
}
}
sort.Strings(prints)
for _, p := range prints {
fmt.Print(p)
}
os.Exit(0)
}
func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
// go-etcd does not support cluster stats, use http client for now
// TODO: use new etcd client with new member/stats endpoint
httpclient := http.Client{
Transport: tr,
}
for _, ep := range endpoints {
resp, err := httpclient.Get(ep + "/v2/stats/leader")
if err != nil {
continue
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
continue
}
ls := &stats.LeaderStats{}
d := json.NewDecoder(resp.Body)
err = d.Decode(ls)
if err != nil {
continue
}
return ep, ls, nil
}
return "", nil, errors.New("no leader")
}