etcdserver: fix panic when checking IsLearner of removed member

Previously, calling s.IsLearner() when the local node is no longer a
member panics. There was an attempt to fix this by first checking
IsMemberExist(), but this is not a correct fix because the member could
be removed between the two calls. Instead of panicking when the member
was removed, IsLearner() should return false. A node which is not a
member is also not a learner.

There was a similar concurrency bug when accessing the IsLearner
property of a member, which will panic with a nil pointer access error
if the member is removed between the IsMemberExist() and Member() calls.

Signed-off-by: Jan Schär <jan@monogon.tech>
This commit is contained in:
Jan Schär 2024-09-19 11:11:06 +02:00
parent 5704c6148d
commit 605abca29d
5 changed files with 60 additions and 12 deletions

View File

@ -783,11 +783,7 @@ func (c *RaftCluster) IsLocalMemberLearner() bool {
defer c.Unlock()
localMember, ok := c.members[c.localID]
if !ok {
c.lg.Panic(
"failed to find local ID in cluster members",
zap.String("cluster-id", c.cid.String()),
zap.String("local-member-id", c.localID.String()),
)
return false
}
return localMember.IsLearner
}
@ -816,6 +812,9 @@ func (c *RaftCluster) SetDowngradeInfo(d *serverversion.DowngradeInfo, shouldApp
// IsMemberExist returns if the member with the given id exists in cluster.
func (c *RaftCluster) IsMemberExist(id types.ID) bool {
// gofail: var sleepAfterIsMemberExist struct{}
// defer time.Sleep(time.Second)
c.Lock()
defer c.Unlock()
_, ok := c.members[id]

View File

@ -49,7 +49,7 @@ func newUnaryInterceptor(s *etcdserver.EtcdServer) grpc.UnaryServerInterceptor {
return nil, rpctypes.ErrGRPCNotCapable
}
if s.IsMemberExist(s.MemberID()) && s.IsLearner() && !isRPCSupportedForLearner(req) {
if s.IsLearner() && !isRPCSupportedForLearner(req) {
return nil, rpctypes.ErrGRPCNotSupportedForLearner
}
@ -218,7 +218,7 @@ func newStreamInterceptor(s *etcdserver.EtcdServer) grpc.StreamServerInterceptor
return rpctypes.ErrGRPCNotCapable
}
if s.IsMemberExist(s.MemberID()) && s.IsLearner() && info.FullMethod != snapshotMethod { // learner does not support stream RPC except Snapshot
if s.IsLearner() && info.FullMethod != snapshotMethod { // learner does not support stream RPC except Snapshot
return rpctypes.ErrGRPCNotSupportedForLearner
}

View File

@ -1224,7 +1224,8 @@ func (s *EtcdServer) isLeader() bool {
// MoveLeader transfers the leader to the given transferee.
func (s *EtcdServer) MoveLeader(ctx context.Context, lead, transferee uint64) error {
if !s.cluster.IsMemberExist(types.ID(transferee)) || s.cluster.Member(types.ID(transferee)).IsLearner {
member := s.cluster.Member(types.ID(transferee))
if member == nil || member.IsLearner {
return errors.ErrBadLeaderTransferee
}
@ -1593,9 +1594,9 @@ func (s *EtcdServer) mayRemoveMember(id types.ID) error {
}
lg := s.Logger()
isLearner := s.cluster.IsMemberExist(id) && s.cluster.Member(id).IsLearner
member := s.cluster.Member(id)
// no need to check quorum when removing non-voting member
if isLearner {
if member != nil && member.IsLearner {
return nil
}

View File

@ -518,3 +518,51 @@ func TestSpeedyTerminate(t *testing.T) {
case <-donec:
}
}
// TestConcurrentRemoveMember demonstrated a panic in mayRemoveMember with
// concurrent calls to MemberRemove. To reliably reproduce the panic, a delay
// needed to be injected in IsMemberExist, which is done using a failpoint.
// After fixing the bug, IsMemberExist is no longer called by mayRemoveMember.
func TestConcurrentRemoveMember(t *testing.T) {
integration.BeforeTest(t, integration.WithFailpoint("sleepAfterIsMemberExist", `return`))
c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1})
defer c.Terminate(t)
addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"})
if err != nil {
t.Fatal(err)
}
removeID := addResp.Member.ID
done := make(chan struct{})
go func() {
time.Sleep(time.Second / 2)
c.Members[0].Client.MemberRemove(context.Background(), removeID)
close(done)
}()
if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil {
t.Fatal(err)
}
<-done
}
func TestConcurrentMoveLeader(t *testing.T) {
integration.BeforeTest(t, integration.WithFailpoint("sleepAfterIsMemberExist", `return`))
c := integration.NewCluster(t, &integration.ClusterConfig{Size: 1})
defer c.Terminate(t)
addResp, err := c.Members[0].Client.MemberAddAsLearner(context.Background(), []string{"http://localhost:123"})
if err != nil {
t.Fatal(err)
}
removeID := addResp.Member.ID
done := make(chan struct{})
go func() {
time.Sleep(time.Second / 2)
c.Members[0].Client.MoveLeader(context.Background(), removeID)
close(done)
}()
if _, err := c.Members[0].Client.MemberRemove(context.Background(), removeID); err != nil {
t.Fatal(err)
}
<-done
}

View File

@ -54,7 +54,7 @@ install-gofail: $(GOPATH)/bin/gofail
.PHONY: gofail-enable
gofail-enable: $(GOPATH)/bin/gofail
$(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/
$(GOPATH)/bin/gofail enable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/
cd ./server && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdutl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
cd ./etcdctl && go get go.etcd.io/gofail@${GOFAIL_VERSION}
@ -62,7 +62,7 @@ gofail-enable: $(GOPATH)/bin/gofail
.PHONY: gofail-disable
gofail-disable: $(GOPATH)/bin/gofail
$(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/
$(GOPATH)/bin/gofail disable server/etcdserver/ server/lease/leasehttp server/storage/backend/ server/storage/mvcc/ server/storage/wal/ server/etcdserver/api/v3rpc/ server/etcdserver/api/membership/
cd ./server && go mod tidy
cd ./etcdutl && go mod tidy
cd ./etcdctl && go mod tidy