etcdserver: reject member removal that breaks the current active quorum

This commit is contained in:
Anthony Romano 2016-08-10 16:29:03 -07:00
parent 06e2338108
commit a1ce07a321
2 changed files with 40 additions and 14 deletions

View File

@ -844,10 +844,9 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) erro
}
func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error {
if s.Cfg.StrictReconfigCheck && !s.cluster.IsReadyToRemoveMember(id) {
// If s.cfg.StrictReconfigCheck is false, it means the option --strict-reconfig-check isn't passed to etcd.
// In such a case removing a member is allowed unconditionally
return ErrNotEnoughStartedMembers
// by default StrictReconfigCheck is enabled; reject removal if leads to quorum loss
if err := s.mayRemoveMember(types.ID(id)); err != nil {
return err
}
cc := raftpb.ConfChange{
@ -857,6 +856,32 @@ func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error {
return s.configure(ctx, cc)
}
func (s *EtcdServer) mayRemoveMember(id types.ID) error {
if !s.Cfg.StrictReconfigCheck {
return nil
}
if !s.cluster.IsReadyToRemoveMember(uint64(id)) {
plog.Warningf("not enough started members, rejecting remove member %s", id)
return ErrNotEnoughStartedMembers
}
// downed member is safe to remove since it's not part of the active quorum
if t := s.r.transport.ActiveSince(id); id != s.ID() && t.IsZero() {
return nil
}
// protect quorum if some members are down
m := s.cluster.Members()
active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), m)
if (active - 1) < 1+((len(m)-1)/2) {
plog.Warningf("reconfigure breaks active quorum, rejecting remove member %s", id)
return ErrUnhealthy
}
return nil
}
func (s *EtcdServer) UpdateMember(ctx context.Context, memb membership.Member) error {
b, err := json.Marshal(memb)
if err != nil {

View File

@ -25,13 +25,7 @@ import (
// isConnectedToQuorumSince checks whether the local member is connected to the
// quorum of the cluster since the given time.
func isConnectedToQuorumSince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) bool {
var connectedNum int
for _, m := range members {
if m.ID == self || isConnectedSince(transport, since, m.ID) {
connectedNum++
}
}
return connectedNum >= (len(members)+1)/2
return numConnectedSince(transport, since, self, members) >= (len(members)/2)+1
}
// isConnectedSince checks whether the local member is connected to the
@ -44,10 +38,17 @@ func isConnectedSince(transport rafthttp.Transporter, since time.Time, remote ty
// isConnectedFullySince checks whether the local member is connected to all
// members in the cluster since the given time.
func isConnectedFullySince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) bool {
return numConnectedSince(transport, since, self, members) == len(members)
}
// numConnectedSince counts how many members are connected to the local member
// since the given time.
func numConnectedSince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) int {
connectedNum := 0
for _, m := range members {
if m.ID != self && !isConnectedSince(transport, since, m.ID) {
return false
if m.ID == self || isConnectedSince(transport, since, m.ID) {
connectedNum++
}
}
return true
return connectedNum
}