diff --git a/etcdserver/server.go b/etcdserver/server.go index 9e0fb7509..2954c7b3d 100644 --- a/etcdserver/server.go +++ b/etcdserver/server.go @@ -844,10 +844,9 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) erro } func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error { - if s.Cfg.StrictReconfigCheck && !s.cluster.IsReadyToRemoveMember(id) { - // If s.cfg.StrictReconfigCheck is false, it means the option --strict-reconfig-check isn't passed to etcd. - // In such a case removing a member is allowed unconditionally - return ErrNotEnoughStartedMembers + // by default StrictReconfigCheck is enabled; reject removal if leads to quorum loss + if err := s.mayRemoveMember(types.ID(id)); err != nil { + return err } cc := raftpb.ConfChange{ @@ -857,6 +856,32 @@ func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error { return s.configure(ctx, cc) } +func (s *EtcdServer) mayRemoveMember(id types.ID) error { + if !s.Cfg.StrictReconfigCheck { + return nil + } + + if !s.cluster.IsReadyToRemoveMember(uint64(id)) { + plog.Warningf("not enough started members, rejecting remove member %s", id) + return ErrNotEnoughStartedMembers + } + + // downed member is safe to remove since it's not part of the active quorum + if t := s.r.transport.ActiveSince(id); id != s.ID() && t.IsZero() { + return nil + } + + // protect quorum if some members are down + m := s.cluster.Members() + active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), m) + if (active - 1) < 1+((len(m)-1)/2) { + plog.Warningf("reconfigure breaks active quorum, rejecting remove member %s", id) + return ErrUnhealthy + } + + return nil +} + func (s *EtcdServer) UpdateMember(ctx context.Context, memb membership.Member) error { b, err := json.Marshal(memb) if err != nil { diff --git a/etcdserver/util.go b/etcdserver/util.go index 5a3fd81ee..8365bf7cf 100644 --- a/etcdserver/util.go +++ b/etcdserver/util.go @@ -25,13 +25,7 @@ import ( // isConnectedToQuorumSince checks whether the local member is connected to the // quorum of the cluster since the given time. func isConnectedToQuorumSince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) bool { - var connectedNum int - for _, m := range members { - if m.ID == self || isConnectedSince(transport, since, m.ID) { - connectedNum++ - } - } - return connectedNum >= (len(members)+1)/2 + return numConnectedSince(transport, since, self, members) >= (len(members)/2)+1 } // isConnectedSince checks whether the local member is connected to the @@ -44,10 +38,17 @@ func isConnectedSince(transport rafthttp.Transporter, since time.Time, remote ty // isConnectedFullySince checks whether the local member is connected to all // members in the cluster since the given time. func isConnectedFullySince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) bool { + return numConnectedSince(transport, since, self, members) == len(members) +} + +// numConnectedSince counts how many members are connected to the local member +// since the given time. +func numConnectedSince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) int { + connectedNum := 0 for _, m := range members { - if m.ID != self && !isConnectedSince(transport, since, m.ID) { - return false + if m.ID == self || isConnectedSince(transport, since, m.ID) { + connectedNum++ } } - return true + return connectedNum }