mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00

Previously, a failure to fetch the leader stats was reported as 'cluster is unhealthy'. That is misleading when the stats request fails only because the advertised client URLs are set incorrectly while the cluster itself is healthy.
143 lines
3.2 KiB
Go
143 lines
3.2 KiB
Go
package command
|
|
|
|
import (
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"net/http"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
|
|
"github.com/coreos/etcd/Godeps/_workspace/src/github.com/coreos/go-etcd/etcd"
|
|
"github.com/coreos/etcd/etcdserver/stats"
|
|
)
|
|
|
|
func NewClusterHealthCommand() cli.Command {
|
|
return cli.Command{
|
|
Name: "cluster-health",
|
|
Usage: "check the health of the etcd cluster",
|
|
Flags: []cli.Flag{},
|
|
Action: handleClusterHealth,
|
|
}
|
|
}
|
|
|
|
func handleClusterHealth(c *cli.Context) {
|
|
endpoints, err := getEndpoints(c)
|
|
if err != nil {
|
|
handleError(ErrorFromEtcd, err)
|
|
}
|
|
tr, err := getTransport(c)
|
|
if err != nil {
|
|
handleError(ErrorFromEtcd, err)
|
|
}
|
|
|
|
client := etcd.NewClient(endpoints)
|
|
client.SetTransport(tr)
|
|
|
|
if c.GlobalBool("debug") {
|
|
go dumpCURL(client)
|
|
}
|
|
|
|
if ok := client.SyncCluster(); !ok {
|
|
handleError(FailedToConnectToHost, errors.New("cannot sync with the cluster using endpoints "+strings.Join(endpoints, ", ")))
|
|
}
|
|
|
|
// do we have a leader?
|
|
cl := client.GetCluster()
|
|
ep, ls0, err := getLeaderStats(tr, cl)
|
|
if err != nil {
|
|
fmt.Println("cluster may be unhealthy: failed to connect", cl)
|
|
os.Exit(1)
|
|
}
|
|
|
|
// is raft stable and making progress?
|
|
client = etcd.NewClient([]string{ep})
|
|
client.SetTransport(tr)
|
|
resp, err := client.Get("/", false, false)
|
|
if err != nil {
|
|
fmt.Println("cluster is unhealthy")
|
|
os.Exit(1)
|
|
}
|
|
rt0, ri0 := resp.RaftTerm, resp.RaftIndex
|
|
time.Sleep(time.Second)
|
|
|
|
resp, err = client.Get("/", false, false)
|
|
if err != nil {
|
|
fmt.Println("cluster is unhealthy")
|
|
os.Exit(1)
|
|
}
|
|
rt1, ri1 := resp.RaftTerm, resp.RaftIndex
|
|
|
|
if rt0 != rt1 {
|
|
fmt.Println("cluster is unhealthy")
|
|
os.Exit(1)
|
|
}
|
|
|
|
if ri1 == ri0 {
|
|
fmt.Println("cluster is unhealthy")
|
|
os.Exit(1)
|
|
}
|
|
|
|
// are all the members makeing progress?
|
|
_, ls1, err := getLeaderStats(tr, []string{ep})
|
|
if err != nil {
|
|
fmt.Println("cluster is unhealthy")
|
|
os.Exit(1)
|
|
}
|
|
|
|
fmt.Println("cluster is healthy")
|
|
// self is healthy
|
|
var prints []string
|
|
|
|
prints = append(prints, fmt.Sprintf("member %s is healthy\n", ls1.Leader))
|
|
for name, fs0 := range ls0.Followers {
|
|
fs1, ok := ls1.Followers[name]
|
|
if !ok {
|
|
fmt.Println("Cluster configuration changed during health checking. Please retry.")
|
|
os.Exit(1)
|
|
}
|
|
if fs1.Counts.Success <= fs0.Counts.Success {
|
|
prints = append(prints, fmt.Sprintf("member %s is unhealthy\n", name))
|
|
} else {
|
|
prints = append(prints, fmt.Sprintf("member %s is healthy\n", name))
|
|
}
|
|
}
|
|
|
|
sort.Strings(prints)
|
|
for _, p := range prints {
|
|
fmt.Print(p)
|
|
}
|
|
os.Exit(0)
|
|
}
|
|
|
|
func getLeaderStats(tr *http.Transport, endpoints []string) (string, *stats.LeaderStats, error) {
|
|
// go-etcd does not support cluster stats, use http client for now
|
|
// TODO: use new etcd client with new member/stats endpoint
|
|
httpclient := http.Client{
|
|
Transport: tr,
|
|
}
|
|
|
|
for _, ep := range endpoints {
|
|
resp, err := httpclient.Get(ep + "/v2/stats/leader")
|
|
if err != nil {
|
|
continue
|
|
}
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != http.StatusOK {
|
|
continue
|
|
}
|
|
|
|
ls := &stats.LeaderStats{}
|
|
d := json.NewDecoder(resp.Body)
|
|
err = d.Decode(ls)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
return ep, ls, nil
|
|
}
|
|
return "", nil, errors.New("no leader")
|
|
}
|