etcdserver: adjust StrictReconfigCheck

Adjust StrictReconfigCheck logic to accommodate learner members in the
cluster.
This commit is contained in:
Jingyi Hu 2019-04-18 14:59:37 -07:00
parent bdcecd1fc4
commit 76a63f9f7d
3 changed files with 141 additions and 42 deletions

View File

@ -123,6 +123,19 @@ func (c *RaftCluster) Member(id types.ID) *Member {
return c.members[id].Clone()
}
func (c *RaftCluster) VotingMembers() []*Member {
c.Lock()
defer c.Unlock()
var ms MembersByID
for _, m := range c.members {
if !m.IsLearner {
ms = append(ms, m.Clone())
}
}
sort.Sort(ms)
return []*Member(ms)
}
// MemberByName returns a Member with the given name if exists.
// If more than one member has the given name, it will panic.
func (c *RaftCluster) MemberByName(name string) *Member {
@ -551,11 +564,11 @@ func (c *RaftCluster) SetVersion(ver *semver.Version, onSet func(*zap.Logger, *s
onSet(c.lg, ver)
}
func (c *RaftCluster) IsReadyToAddNewMember() bool {
func (c *RaftCluster) IsReadyToAddVotingMember() bool {
nmembers := 1
nstarted := 0
for _, member := range c.members {
for _, member := range c.VotingMembers() {
if member.IsStarted() {
nstarted++
}
@ -592,11 +605,11 @@ func (c *RaftCluster) IsReadyToAddNewMember() bool {
return true
}
func (c *RaftCluster) IsReadyToRemoveMember(id uint64) bool {
func (c *RaftCluster) IsReadyToRemoveVotingMember(id uint64) bool {
nmembers := 0
nstarted := 0
for _, member := range c.members {
for _, member := range c.VotingMembers() {
if uint64(member.ID) == id {
continue
}

View File

@ -626,7 +626,7 @@ func newTestCluster(membs []*Member) *RaftCluster {
func stringp(s string) *string { return &s }
func TestIsReadyToAddNewMember(t *testing.T) {
func TestIsReadyToAddVotingMember(t *testing.T) {
tests := []struct {
members []*Member
want bool
@ -697,16 +697,38 @@ func TestIsReadyToAddNewMember(t *testing.T) {
[]*Member{},
false,
},
{
// 2 voting members ready in cluster with 2 voting members and 2 unstarted learner member, should succeed
// (the status of learner members does not affect the readiness of adding voting member)
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "2", nil),
newTestMemberAsLearner(3, nil, "", nil),
newTestMemberAsLearner(4, nil, "", nil),
},
true,
},
{
// 1 voting member ready in cluster with 2 voting members and 2 ready learner member, should fail
// (the status of learner members does not affect the readiness of adding voting member)
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "", nil),
newTestMemberAsLearner(3, nil, "3", nil),
newTestMemberAsLearner(4, nil, "4", nil),
},
false,
},
}
for i, tt := range tests {
c := newTestCluster(tt.members)
if got := c.IsReadyToAddNewMember(); got != tt.want {
if got := c.IsReadyToAddVotingMember(); got != tt.want {
t.Errorf("%d: isReadyToAddNewMember returned %t, want %t", i, got, tt.want)
}
}
}
func TestIsReadyToRemoveMember(t *testing.T) {
func TestIsReadyToRemoveVotingMember(t *testing.T) {
tests := []struct {
members []*Member
removeID uint64
@ -782,10 +804,57 @@ func TestIsReadyToRemoveMember(t *testing.T) {
4,
true,
},
{
// 1 voting members ready in cluster with 1 voting member and 1 ready learner,
// removing voting member should fail
// (the status of learner members does not affect the readiness of removing voting member)
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMemberAsLearner(2, nil, "2", nil),
},
1,
false,
},
{
// 1 voting members ready in cluster with 2 voting member and 1 ready learner,
// removing ready voting member should fail
// (the status of learner members does not affect the readiness of removing voting member)
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "", nil),
newTestMemberAsLearner(3, nil, "3", nil),
},
1,
false,
},
{
// 1 voting members ready in cluster with 2 voting member and 1 ready learner,
// removing unstarted voting member should be fine. (Actual operation will fail)
// (the status of learner members does not affect the readiness of removing voting member)
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "", nil),
newTestMemberAsLearner(3, nil, "3", nil),
},
2,
true,
},
{
// 1 voting members ready in cluster with 2 voting member and 1 unstarted learner,
// removing not-ready voting member should be fine. (Actual operation will fail)
// (the status of learner members does not affect the readiness of removing voting member)
[]*Member{
newTestMember(1, nil, "1", nil),
newTestMember(2, nil, "", nil),
newTestMemberAsLearner(3, nil, "", nil),
},
2,
true,
},
}
for i, tt := range tests {
c := newTestCluster(tt.members)
if got := c.IsReadyToRemoveMember(tt.removeID); got != tt.want {
if got := c.IsReadyToRemoveVotingMember(tt.removeID); got != tt.want {
t.Errorf("%d: isReadyToAddNewMember returned %t, want %t", i, got, tt.want)
}
}

View File

@ -1553,43 +1553,17 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) ([]*
return nil, err
}
// TODO: might switch to less strict check when adding raft learner
if s.Cfg.StrictReconfigCheck {
// by default StrictReconfigCheck is enabled; reject new members if unhealthy
if !s.cluster.IsReadyToAddNewMember() {
if lg := s.getLogger(); lg != nil {
lg.Warn(
"rejecting member add request; not enough healthy members",
zap.String("local-member-id", s.ID().String()),
zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
zap.Error(ErrNotEnoughStartedMembers),
)
} else {
plog.Warningf("not enough started members, rejecting member add %+v", memb)
}
return nil, ErrNotEnoughStartedMembers
}
if !isConnectedFullySince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), s.cluster.Members()) {
if lg := s.getLogger(); lg != nil {
lg.Warn(
"rejecting member add request; local member has not been connected to all peers, reconfigure breaks active quorum",
zap.String("local-member-id", s.ID().String()),
zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
zap.Error(ErrUnhealthy),
)
} else {
plog.Warningf("not healthy for reconfigure, rejecting member add %+v", memb)
}
return nil, ErrUnhealthy
}
}
// TODO: move Member to protobuf type
b, err := json.Marshal(memb)
if err != nil {
return nil, err
}
// by default StrictReconfigCheck is enabled; reject new members if unhealthy.
if err := s.mayAddMember(memb); err != nil {
return nil, err
}
cc := raftpb.ConfChange{
Type: raftpb.ConfChangeAddNode,
NodeID: uint64(memb.ID),
@ -1603,6 +1577,43 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) ([]*
return s.configure(ctx, cc)
}
func (s *EtcdServer) mayAddMember(memb membership.Member) error {
if !s.Cfg.StrictReconfigCheck {
return nil
}
// protect quorum when adding voting member
if !memb.IsLearner && !s.cluster.IsReadyToAddVotingMember() {
if lg := s.getLogger(); lg != nil {
lg.Warn(
"rejecting member add request; not enough healthy members",
zap.String("local-member-id", s.ID().String()),
zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
zap.Error(ErrNotEnoughStartedMembers),
)
} else {
plog.Warningf("not enough started members, rejecting member add %+v", memb)
}
return ErrNotEnoughStartedMembers
}
if !isConnectedFullySince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), s.cluster.VotingMembers()) {
if lg := s.getLogger(); lg != nil {
lg.Warn(
"rejecting member add request; local member has not been connected to all peers, reconfigure breaks active quorum",
zap.String("local-member-id", s.ID().String()),
zap.String("requested-member-add", fmt.Sprintf("%+v", memb)),
zap.Error(ErrUnhealthy),
)
} else {
plog.Warningf("not healthy for reconfigure, rejecting member add %+v", memb)
}
return ErrUnhealthy
}
return nil
}
func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) ([]*membership.Member, error) {
if err := s.checkMembershipOperationPermission(ctx); err != nil {
return nil, err
@ -1667,7 +1678,13 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error {
return nil
}
if !s.cluster.IsReadyToRemoveMember(uint64(id)) {
isLearner := s.cluster.IsMemberExist(id) && s.cluster.Member(id).IsLearner
// no need to check quorum when removing non-voting member
if isLearner {
return nil
}
if !s.cluster.IsReadyToRemoveVotingMember(uint64(id)) {
if lg := s.getLogger(); lg != nil {
lg.Warn(
"rejecting member remove request; not enough healthy members",
@ -1687,7 +1704,7 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error {
}
// protect quorum if some members are down
m := s.cluster.Members()
m := s.cluster.VotingMembers()
active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), m)
if (active - 1) < 1+((len(m)-1)/2) {
if lg := s.getLogger(); lg != nil {