etcdserver: add mayPromote check

This commit is contained in:
宇慕 2019-04-16 15:45:16 +08:00 committed by Jingyi Hu
parent cca8b0d44f
commit dfe296ac3c
5 changed files with 102 additions and 19 deletions

View File

@ -214,14 +214,13 @@ func TestMemberAddForLearner(t *testing.T) {
}
}
func TestMemberPromoteForLearner(t *testing.T) {
// TODO test not ready learner promotion.
func TestMemberPromoteForNotReadyLearner(t *testing.T) {
defer testutil.AfterTest(t)
clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 3})
clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
defer clus.Terminate(t)
// TODO change the random client to client that talk to leader directly.
capi := clus.RandClient()
// first client is talked to leader because cluster size is 1
capi := clus.Client(0)
urls := []string{"http://127.0.0.1:1234"}
memberAddResp, err := capi.MemberAddAsLearner(context.Background(), urls)
@ -244,19 +243,14 @@ func TestMemberPromoteForLearner(t *testing.T) {
t.Fatalf("Added 1 learner node to cluster, got %d", numberOfLearners)
}
memberPromoteResp, err := capi.MemberPromote(context.Background(), learnerID)
if err != nil {
t.Fatalf("failed to promote member: %v", err)
// since we do not start learner, learner must be not ready.
_, err = capi.MemberPromote(context.Background(), learnerID)
expectedErrKeywords := "can only promote a learner member which catches up with leader"
if err == nil {
t.Fatalf("expecting promote not ready learner to fail, got no error")
}
numberOfLearners = 0
for _, m := range memberPromoteResp.Members {
if m.IsLearner {
numberOfLearners++
}
}
if numberOfLearners != 0 {
t.Errorf("learner promoted, expect 0 learner, got %d", numberOfLearners)
if !strings.Contains(err.Error(), expectedErrKeywords) {
t.Errorf("expecting error to contain %s, got %s", expectedErrKeywords, err.Error())
}
}

View File

@ -652,6 +652,36 @@ func (c *RaftCluster) IsReadyToRemoveVotingMember(id uint64) bool {
return true
}
func (c *RaftCluster) IsReadyToPromoteMember(id uint64) bool {
nmembers := 1
nstarted := 0
for _, member := range c.VotingMembers() {
if member.IsStarted() {
nstarted++
}
nmembers++
}
nquorum := nmembers/2 + 1
if nstarted < nquorum {
if c.lg != nil {
c.lg.Warn(
"rejecting member promote; started member will be less than quorum",
zap.Int("number-of-started-member", nstarted),
zap.Int("quorum", nquorum),
zap.String("cluster-id", c.cid.String()),
zap.String("local-member-id", c.localID.String()),
)
} else {
plog.Warningf("Reject promote member request: the number of started member (%d) will be less than the quorum number of the cluster (%d)", nstarted, nquorum)
}
return false
}
return true
}
func membersFromStore(lg *zap.Logger, st v2store.Store) (map[types.ID]*Member, map[types.ID]bool) {
members := make(map[types.ID]*Member)
removed := make(map[types.ID]bool)

View File

@ -39,6 +39,7 @@ var toGRPCErrorMap = map[error]error{
membership.ErrLearnerNotReady: rpctypes.ErrGRPCLearnerNotReady,
membership.ErrTooManyLearners: rpctypes.ErrGRPCTooManyLearners,
etcdserver.ErrNotEnoughStartedMembers: rpctypes.ErrMemberNotEnoughStarted,
etcdserver.ErrLearnerNotReady: rpctypes.ErrGRPCLearnerNotReady,
mvcc.ErrCompacted: rpctypes.ErrGRPCCompacted,
mvcc.ErrFutureRev: rpctypes.ErrGRPCFutureRev,

View File

@ -29,6 +29,7 @@ var (
ErrTimeoutLeaderTransfer = errors.New("etcdserver: request timed out, leader transfer took too long")
ErrLeaderChanged = errors.New("etcdserver: leader changed")
ErrNotEnoughStartedMembers = errors.New("etcdserver: re-configuration failed due to not enough started members")
ErrLearnerNotReady = errors.New("etcdserver: can only promote a learner member which catches up with leader")
ErrNoLeader = errors.New("etcdserver: no leader")
ErrNotLeader = errors.New("etcdserver: not leader")
ErrRequestTooLarge = errors.New("etcdserver: request is too large")

View File

@ -97,6 +97,8 @@ const (
maxPendingRevokes = 16
recommendedMaxRequestBytes = 10 * 1024 * 1024
readyPercent = 0.9
)
var (
@ -1637,7 +1639,7 @@ func (s *EtcdServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi
return nil, err
}
// check if we can promote this learner
// check if we can promote this learner.
if err := s.mayPromoteMember(types.ID(id)); err != nil {
return nil, err
}
@ -1665,10 +1667,65 @@ func (s *EtcdServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi
}
func (s *EtcdServer) mayPromoteMember(id types.ID) error {
err := isLearnerReady(uint64(id))
if err != nil {
return err
}
if !s.Cfg.StrictReconfigCheck {
return nil
}
// TODO add more checks whether the member can be promoted.
if !s.cluster.IsReadyToPromoteMember(uint64(id)) {
if lg := s.getLogger(); lg != nil {
lg.Warn(
"rejecting member promote request; not enough healthy members",
zap.String("local-member-id", s.ID().String()),
zap.String("requested-member-remove-id", id.String()),
zap.Error(ErrNotEnoughStartedMembers),
)
} else {
plog.Warningf("not enough started members, rejecting promote member %s", id)
}
return ErrNotEnoughStartedMembers
}
return nil
}
// check whether the learner catches up with leader or not.
// Note: it will return nil if member is not found in cluster or if member is not learner.
// These two conditions will be checked before apply phase later.
func isLearnerReady(id uint64) error {
// sanity check, this can happen in the unit test when we do not start node.
if raftStatus == nil {
return nil
}
rs := raftStatus()
// leader's raftStatus.Progress is not nil
if rs.Progress == nil {
return ErrNotLeader
}
var learnerMatch uint64
isFound := false
leaderID := rs.ID
for memberID, progress := range rs.Progress {
if id == memberID {
// check its status
learnerMatch = progress.Match
isFound = true
break
}
}
if isFound {
leaderMatch := rs.Progress[leaderID].Match
// the learner's Match not caught up with leader yet
if float64(learnerMatch) < float64(leaderMatch)*readyPercent {
return ErrLearnerNotReady
}
}
return nil
}