Merge pull request #3479 from mitake/membership

etcdserver: avoid deadlock caused by adding members with wrong peer URLs
This commit is contained in:
Yicheng Qin 2015-09-12 22:09:13 -07:00
commit 0ca800fbac
9 changed files with 125 additions and 0 deletions

View File

@ -95,6 +95,7 @@ type config struct {
fallback *flags.StringsFlag
initialCluster string
initialClusterToken string
strictReconfigCheck bool
// proxy
proxy *flags.StringsFlag
@ -177,6 +178,7 @@ func NewConfig() *config {
// Should never happen.
plog.Panicf("unexpected error setting up clusterStateFlag: %v", err)
}
fs.BoolVar(&cfg.strictReconfigCheck, "strict-reconfig-check", false, "Reject reconfiguration that might cause quorum loss")
// proxy
fs.Var(cfg.proxy, "proxy", fmt.Sprintf("Valid values include %s", strings.Join(cfg.proxy.Values, ", ")))

View File

@ -265,6 +265,7 @@ func startEtcd(cfg *config) (<-chan struct{}, error) {
TickMs: cfg.TickMs,
ElectionTicks: cfg.electionTicks(),
V3demo: cfg.v3demo,
StrictReconfigCheck: cfg.strictReconfigCheck,
}
var s *etcdserver.EtcdServer
s, err = etcdserver.NewServer(srvcfg)

View File

@ -368,6 +368,34 @@ func (c *cluster) SetVersion(ver *semver.Version) {
MustDetectDowngrade(c.version)
}
func (c *cluster) isReadyToAddNewMember() bool {
nmembers := 1
nstarted := 0
for _, member := range c.members {
if member.IsStarted() {
nstarted++
}
nmembers++
}
if nstarted == 1 && nmembers == 2 {
// a case of adding a new node to 1-member cluster for restoring cluster data
// https://github.com/coreos/etcd/blob/master/Documentation/admin_guide.md#restoring-the-cluster
plog.Debugf("The number of started member is 1. This cluster can accept add member request.")
return true
}
nquorum := nmembers/2 + 1
if nstarted < nquorum {
plog.Warningf("Reject add member request: the number of started member (%d) will be less than the quorum number of the cluster (%d)", nstarted, nquorum)
return false
}
return true
}
func membersFromStore(st store.Store) (map[types.ID]*Member, map[types.ID]bool) {
members := make(map[types.ID]*Member)
removed := make(map[types.ID]bool)

View File

@ -566,3 +566,83 @@ func newTestCluster(membs []*Member) *cluster {
}
func stringp(s string) *string { return &s }
func TestIsReadyToAddNewMember(t *testing.T) {
tests := []struct {
members []*Member
want bool
}{
{
// 0/3 members ready, should fail
[]*Member{
newTestMember(1, nil, "", nil),
newTestMember(2, nil, "", nil),
newTestMember(3, nil, "", nil),
},
false,
},
{
// 1/2 members ready, should fail
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "", nil),
},
false,
},
{
// 1/3 members ready, should fail
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "", nil),
newTestMember(3, nil, "", nil),
},
false,
},
{
// 1/1 members ready, should succeed (special case of 1-member cluster for recovery)
[]*Member{
newTestMember(1, nil, "1", nil),
},
true,
},
{
// 2/3 members ready, should fail
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "2", nil),
newTestMember(3, nil, "", nil),
},
false,
},
{
// 3/3 members ready, should be fine to add one member and retain quorum
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "2", nil),
newTestMember(3, nil, "3", nil),
},
true,
},
{
// 3/4 members ready, should be fine to add one member and retain quorum
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "2", nil),
newTestMember(3, nil, "3", nil),
newTestMember(4, nil, "", nil),
},
true,
},
{
// empty cluster, it is impossible but should fail
[]*Member{},
false,
},
}
for i, tt := range tests {
c := newTestCluster(tt.members)
if got := c.isReadyToAddNewMember(); got != tt.want {
t.Errorf("%d: isReadyToAddNewMember returned %t, want %t", i, got, tt.want)
}
}
}

View File

@ -49,6 +49,8 @@ type ServerConfig struct {
ElectionTicks int
V3demo bool
StrictReconfigCheck bool
}
// VerifyBootstrapConfig sanity-checks the initial config for bootstrap case

View File

@ -31,6 +31,7 @@ var (
ErrTimeout = errors.New("etcdserver: request timed out")
ErrTimeoutDueToLeaderFail = errors.New("etcdserver: request timed out, possibly due to previous leader failure")
ErrTimeoutDueToConnectionLost = errors.New("etcdserver: request timed out, possibly due to connection lost")
ErrNotEnoughStartedMembers = errors.New("etcdserver: re-configuration failed due to not enough started members")
)
func isKeyNotFound(err error) bool {

View File

@ -105,6 +105,10 @@ func (m *Member) Clone() *Member {
return mm
}
func (m *Member) IsStarted() bool {
return len(m.Name) != 0
}
func memberStoreKey(id types.ID) string {
return path.Join(storeMembersPrefix, id.String())
}

View File

@ -603,6 +603,12 @@ func (s *EtcdServer) LeaderStats() []byte {
func (s *EtcdServer) StoreStats() []byte { return s.store.JsonStats() }
func (s *EtcdServer) AddMember(ctx context.Context, memb Member) error {
if s.cfg.StrictReconfigCheck && !s.cluster.isReadyToAddNewMember() {
// If s.cfg.StrictReconfigCheck is false, it means the option -strict-reconfig-check isn't passed to etcd.
// In such a case adding a new member is allowed unconditionally
return ErrNotEnoughStartedMembers
}
// TODO: move Member to protobuf type
b, err := json.Marshal(memb)
if err != nil {

View File

@ -866,6 +866,7 @@ func TestAddMember(t *testing.T) {
store: st,
cluster: cl,
reqIDGen: idutil.NewGenerator(0, time.Time{}),
cfg: &ServerConfig{},
}
s.start()
m := Member{ID: 1234, RaftAttributes: RaftAttributes{PeerURLs: []string{"foo"}}}