etcd/etcdctl/command/cluster_health.go

package command

import (
	"encoding/json"
	"errors"
	"fmt"
	"net/http"
	"os"
	"sort"
	"time"

	"github.com/coreos/etcd/Godeps/_workspace/src/github.com/codegangsta/cli"
	"github.com/coreos/etcd/Godeps/_workspace/src/golang.org/x/net/context"
)

func NewClusterHealthCommand() cli.Command {
	return cli.Command{
		Name:   "cluster-health",
		Usage:  "check the health of the etcd cluster",
		Flags:  []cli.Flag{},
		Action: handleClusterHealth,
	}
}

func handleClusterHealth(c *cli.Context) {
	tr, err := getTransport(c)
	if err != nil {
		handleError(ExitServerError, err)
	}

	mi := mustNewMembersAPI(c)
	ms, err := mi.List(context.TODO())
	if err != nil {
		handleError(ExitServerError, err)
	}

	cl := make([]string, 0)
	for _, m := range ms {
		cl = append(cl, m.ClientURLs...)
	}

	// check the /health endpoint of all members first

	ep, rs0, err := getLeaderStatus(tr, cl)
	if err != nil {
		fmt.Println("cluster may be unhealthy: failed to connect", cl)
		os.Exit(1)
	}

	time.Sleep(time.Second)

	// are all the members makeing progress?
	_, rs1, err := getLeaderStatus(tr, []string{ep})
	if err != nil {
		fmt.Println("cluster is unhealthy")
		os.Exit(1)
	}

	if rs1.Commit > rs0.Commit {
		fmt.Printf("cluster is healthy: raft is making progress [commit index: %v->%v]\n", rs0.Commit, rs1.Commit)
	} else {
		fmt.Printf("cluster is unhealthy: raft is not making progress [commit index: %v]\n", rs0.Commit)
	}
	fmt.Printf("leader is %v\n", rs0.Lead)

	var prints []string

	for id, pr0 := range rs0.Progress {
		pr1, ok := rs1.Progress[id]
		if !ok {
			fmt.Println("Cluster configuration changed during health checking. Please retry.")
			os.Exit(1)
		}
		if pr1.Match <= pr0.Match {
			prints = append(prints, fmt.Sprintf("member %s is unhealthy: raft is not making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
		} else {
			prints = append(prints, fmt.Sprintf("member %s is healthy: raft is making progress [match: %v->%v]\n", id, pr0.Match, pr1.Match))
		}
	}

	sort.Strings(prints)
	for _, p := range prints {
		fmt.Print(p)
	}
	os.Exit(0)
}

type raftStatus struct {
	ID        string `json:"id"`
	Term      uint64 `json:"term"`
	Vote      string `json:"vote"`
	Commit    uint64 `json:"commit"`
	Lead      string `json:"lead"`
	RaftState string `json:"raftState"`
	Progress  map[string]struct {
		Match uint64 `json:"match"`
		Next  uint64 `json:"next"`
		State string `json:"state"`
	} `json:"progress"`
}

type vars struct {
	RaftStatus raftStatus `json:"raft.status"`
}

func getLeaderStatus(tr *http.Transport, endpoints []string) (string, raftStatus, error) {
	// TODO: use new etcd client
	httpclient := http.Client{
		Transport: tr,
	}

	for _, ep := range endpoints {
		resp, err := httpclient.Get(ep + "/debug/vars")
		if err != nil {
			continue
		}
		defer resp.Body.Close()
		if resp.StatusCode != http.StatusOK {
			continue
		}

		vs := &vars{}
		d := json.NewDecoder(resp.Body)
		err = d.Decode(vs)
		if err != nil {
			continue
		}
		if vs.RaftStatus.Lead != vs.RaftStatus.ID {
			continue
		}
		return ep, vs.RaftStatus, nil
	}
	return "", raftStatus{}, errors.New("no leader")
}