Merge pull request #13690 from ahrtr/lease_renew_linearizable

Leases wait for entries to be applied
This commit is contained in:
Marek Siarkowicz 2022-04-11 14:53:18 +02:00 committed by GitHub
commit dd08e15d7c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 128 additions and 17 deletions

View File

@ -65,6 +65,7 @@ See [code changes](https://github.com/etcd-io/etcd/compare/v3.5.0...v3.6.0).
- Fix [segmentation violation(SIGSEGV) error due to premature unlocking of watchableStore](https://github.com/etcd-io/etcd/pull/13505) - Fix [segmentation violation(SIGSEGV) error due to premature unlocking of watchableStore](https://github.com/etcd-io/etcd/pull/13505)
- Fix [inconsistent log format](https://github.com/etcd-io/etcd/pull/13864) - Fix [inconsistent log format](https://github.com/etcd-io/etcd/pull/13864)
- Fix [Inconsistent revision and data occurs](https://github.com/etcd-io/etcd/pull/13854) - Fix [Inconsistent revision and data occurs](https://github.com/etcd-io/etcd/pull/13854)
- Fix [Etcdserver is still in progress of processing LeaseGrantRequest when it receives a LeaseKeepAliveRequest on the same leaseID](https://github.com/etcd-io/etcd/pull/13690)
### tools/benchmark ### tools/benchmark

View File

@ -77,6 +77,7 @@ var (
ErrGRPCTimeout = status.New(codes.Unavailable, "etcdserver: request timed out").Err() ErrGRPCTimeout = status.New(codes.Unavailable, "etcdserver: request timed out").Err()
ErrGRPCTimeoutDueToLeaderFail = status.New(codes.Unavailable, "etcdserver: request timed out, possibly due to previous leader failure").Err() ErrGRPCTimeoutDueToLeaderFail = status.New(codes.Unavailable, "etcdserver: request timed out, possibly due to previous leader failure").Err()
ErrGRPCTimeoutDueToConnectionLost = status.New(codes.Unavailable, "etcdserver: request timed out, possibly due to connection lost").Err() ErrGRPCTimeoutDueToConnectionLost = status.New(codes.Unavailable, "etcdserver: request timed out, possibly due to connection lost").Err()
ErrGRPCTimeoutWaitAppliedIndex = status.New(codes.Unavailable, "etcdserver: request timed out, waiting for the applied index took too long").Err()
ErrGRPCUnhealthy = status.New(codes.Unavailable, "etcdserver: unhealthy cluster").Err() ErrGRPCUnhealthy = status.New(codes.Unavailable, "etcdserver: unhealthy cluster").Err()
ErrGRPCCorrupt = status.New(codes.DataLoss, "etcdserver: corrupt cluster").Err() ErrGRPCCorrupt = status.New(codes.DataLoss, "etcdserver: corrupt cluster").Err()
ErrGRPCNotSupportedForLearner = status.New(codes.FailedPrecondition, "etcdserver: rpc not supported for learner").Err() ErrGRPCNotSupportedForLearner = status.New(codes.FailedPrecondition, "etcdserver: rpc not supported for learner").Err()
@ -212,6 +213,7 @@ var (
ErrTimeout = Error(ErrGRPCTimeout) ErrTimeout = Error(ErrGRPCTimeout)
ErrTimeoutDueToLeaderFail = Error(ErrGRPCTimeoutDueToLeaderFail) ErrTimeoutDueToLeaderFail = Error(ErrGRPCTimeoutDueToLeaderFail)
ErrTimeoutDueToConnectionLost = Error(ErrGRPCTimeoutDueToConnectionLost) ErrTimeoutDueToConnectionLost = Error(ErrGRPCTimeoutDueToConnectionLost)
ErrTimeoutWaitAppliedIndex = Error(ErrGRPCTimeoutWaitAppliedIndex)
ErrUnhealthy = Error(ErrGRPCUnhealthy) ErrUnhealthy = Error(ErrGRPCUnhealthy)
ErrCorrupt = Error(ErrGRPCCorrupt) ErrCorrupt = Error(ErrGRPCCorrupt)
ErrBadLeaderTransferee = Error(ErrGRPCBadLeaderTransferee) ErrBadLeaderTransferee = Error(ErrGRPCBadLeaderTransferee)

View File

@ -54,6 +54,7 @@ var toGRPCErrorMap = map[error]error{
etcdserver.ErrTimeout: rpctypes.ErrGRPCTimeout, etcdserver.ErrTimeout: rpctypes.ErrGRPCTimeout,
etcdserver.ErrTimeoutDueToLeaderFail: rpctypes.ErrGRPCTimeoutDueToLeaderFail, etcdserver.ErrTimeoutDueToLeaderFail: rpctypes.ErrGRPCTimeoutDueToLeaderFail,
etcdserver.ErrTimeoutDueToConnectionLost: rpctypes.ErrGRPCTimeoutDueToConnectionLost, etcdserver.ErrTimeoutDueToConnectionLost: rpctypes.ErrGRPCTimeoutDueToConnectionLost,
etcdserver.ErrTimeoutWaitAppliedIndex: rpctypes.ErrGRPCTimeoutWaitAppliedIndex,
etcdserver.ErrUnhealthy: rpctypes.ErrGRPCUnhealthy, etcdserver.ErrUnhealthy: rpctypes.ErrGRPCUnhealthy,
etcdserver.ErrKeyNotFound: rpctypes.ErrGRPCKeyNotFound, etcdserver.ErrKeyNotFound: rpctypes.ErrGRPCKeyNotFound,
etcdserver.ErrCorrupt: rpctypes.ErrGRPCCorrupt, etcdserver.ErrCorrupt: rpctypes.ErrGRPCCorrupt,

View File

@ -27,6 +27,7 @@ var (
ErrTimeoutDueToLeaderFail = errors.New("etcdserver: request timed out, possibly due to previous leader failure") ErrTimeoutDueToLeaderFail = errors.New("etcdserver: request timed out, possibly due to previous leader failure")
ErrTimeoutDueToConnectionLost = errors.New("etcdserver: request timed out, possibly due to connection lost") ErrTimeoutDueToConnectionLost = errors.New("etcdserver: request timed out, possibly due to connection lost")
ErrTimeoutLeaderTransfer = errors.New("etcdserver: request timed out, leader transfer took too long") ErrTimeoutLeaderTransfer = errors.New("etcdserver: request timed out, leader transfer took too long")
ErrTimeoutWaitAppliedIndex = errors.New("etcdserver: request timed out, waiting for the applied index took too long")
ErrLeaderChanged = errors.New("etcdserver: leader changed") ErrLeaderChanged = errors.New("etcdserver: leader changed")
ErrNotEnoughStartedMembers = errors.New("etcdserver: re-configuration failed due to not enough started members") ErrNotEnoughStartedMembers = errors.New("etcdserver: re-configuration failed due to not enough started members")
ErrLearnerNotReady = errors.New("etcdserver: can only promote a learner member which is in sync with leader") ErrLearnerNotReady = errors.New("etcdserver: can only promote a learner member which is in sync with leader")

View File

@ -1906,3 +1906,59 @@ func (s *sendMsgAppRespTransporter) Send(m []raftpb.Message) {
} }
s.sendC <- send s.sendC <- send
} }
// TestWaitAppliedIndex checks the three results EtcdServer.waitAppliedIndex
// can produce: nil, ErrStopped and ErrTimeoutWaitAppliedIndex.
func TestWaitAppliedIndex(t *testing.T) {
	type testCase struct {
		name           string
		appliedIndex   uint64
		committedIndex uint64
		// trigger, when set, runs in its own goroutine concurrently with
		// the call to waitAppliedIndex.
		trigger func(s *EtcdServer)
		// wantErr is the exact error value waitAppliedIndex must return;
		// left unset (nil) when the call is expected to succeed.
		wantErr error
	}

	tests := []testCase{
		{
			name:           "The applied Id is already equal to the commitId",
			appliedIndex:   10,
			committedIndex: 10,
			trigger: func(s *EtcdServer) {
				s.applyWait.Trigger(10)
			},
		},
		{
			name:           "The etcd server has already stopped",
			appliedIndex:   10,
			committedIndex: 12,
			trigger: func(s *EtcdServer) {
				// stopping is created with capacity 1 below, so this
				// send does not block the goroutine.
				s.stopping <- struct{}{}
			},
			wantErr: ErrStopped,
		},
		{
			// No trigger: nothing is sent on stopping and applyWait is
			// never triggered, so the timeout error is expected.
			name:           "Timed out waiting for the applied index",
			appliedIndex:   10,
			committedIndex: 12,
			wantErr:        ErrTimeoutWaitAppliedIndex,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			srv := &EtcdServer{
				appliedIndex:   tt.appliedIndex,
				committedIndex: tt.committedIndex,
				stopping:       make(chan struct{}, 1),
				applyWait:      wait.NewTimeList(),
			}

			if tt.trigger != nil {
				go tt.trigger(srv)
			}

			if got := srv.waitAppliedIndex(); got != tt.wantErr {
				t.Errorf("Unexpected error, want (%v), got (%v)", tt.wantErr, got)
			}
		})
	}
}

View File

@ -45,6 +45,10 @@ const (
maxGapBetweenApplyAndCommitIndex = 5000 maxGapBetweenApplyAndCommitIndex = 5000
traceThreshold = 100 * time.Millisecond traceThreshold = 100 * time.Millisecond
readIndexRetryTime = 500 * time.Millisecond readIndexRetryTime = 500 * time.Millisecond
// The timeout for the node to catch up its applied index, and is used in
// lease related operations, such as LeaseRenew and LeaseTimeToLive.
applyTimeout = time.Second
) )
type RaftKV interface { type RaftKV interface {
@ -275,6 +279,18 @@ func (s *EtcdServer) LeaseGrant(ctx context.Context, r *pb.LeaseGrantRequest) (*
return resp.(*pb.LeaseGrantResponse), nil return resp.(*pb.LeaseGrantResponse), nil
} }
// waitAppliedIndex blocks until the first of three events and reports which
// one occurred:
//
//   - the channel returned by s.ApplyWait() becomes ready: returns nil
//     (presumably this means the applied index has caught up; ApplyWait's
//     implementation is not visible here, so confirm against it);
//   - s.stopping becomes ready: returns ErrStopped;
//   - applyTimeout elapses: returns ErrTimeoutWaitAppliedIndex.
func (s *EtcdServer) waitAppliedIndex() error {
	select {
	case <-s.ApplyWait():
		return nil
	case <-s.stopping:
		return ErrStopped
	case <-time.After(applyTimeout):
		return ErrTimeoutWaitAppliedIndex
	}
}
func (s *EtcdServer) LeaseRevoke(ctx context.Context, r *pb.LeaseRevokeRequest) (*pb.LeaseRevokeResponse, error) { func (s *EtcdServer) LeaseRevoke(ctx context.Context, r *pb.LeaseRevokeRequest) (*pb.LeaseRevokeResponse, error) {
resp, err := s.raftRequestOnce(ctx, pb.InternalRaftRequest{LeaseRevoke: r}) resp, err := s.raftRequestOnce(ctx, pb.InternalRaftRequest{LeaseRevoke: r})
if err != nil { if err != nil {
@ -284,12 +300,18 @@ func (s *EtcdServer) LeaseRevoke(ctx context.Context, r *pb.LeaseRevokeRequest)
} }
func (s *EtcdServer) LeaseRenew(ctx context.Context, id lease.LeaseID) (int64, error) { func (s *EtcdServer) LeaseRenew(ctx context.Context, id lease.LeaseID) (int64, error) {
ttl, err := s.lessor.Renew(id) if s.isLeader() {
if err == nil { // already requested to primary lessor(leader) if err := s.waitAppliedIndex(); err != nil {
return ttl, nil return 0, err
} }
if err != lease.ErrNotPrimary {
return -1, err ttl, err := s.lessor.Renew(id)
if err == nil { // already requested to primary lessor(leader)
return ttl, nil
}
if err != lease.ErrNotPrimary {
return -1, err
}
} }
cctx, cancel := context.WithTimeout(ctx, s.Cfg.ReqTimeout()) cctx, cancel := context.WithTimeout(ctx, s.Cfg.ReqTimeout())
@ -303,7 +325,7 @@ func (s *EtcdServer) LeaseRenew(ctx context.Context, id lease.LeaseID) (int64, e
} }
for _, url := range leader.PeerURLs { for _, url := range leader.PeerURLs {
lurl := url + leasehttp.LeasePrefix lurl := url + leasehttp.LeasePrefix
ttl, err = leasehttp.RenewHTTP(cctx, id, lurl, s.peerRt) ttl, err := leasehttp.RenewHTTP(cctx, id, lurl, s.peerRt)
if err == nil || err == lease.ErrLeaseNotFound { if err == nil || err == lease.ErrLeaseNotFound {
return ttl, err return ttl, err
} }
@ -319,7 +341,10 @@ func (s *EtcdServer) LeaseRenew(ctx context.Context, id lease.LeaseID) (int64, e
} }
func (s *EtcdServer) LeaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveRequest) (*pb.LeaseTimeToLiveResponse, error) { func (s *EtcdServer) LeaseTimeToLive(ctx context.Context, r *pb.LeaseTimeToLiveRequest) (*pb.LeaseTimeToLiveResponse, error) {
if s.Leader() == s.ID() { if s.isLeader() {
if err := s.waitAppliedIndex(); err != nil {
return nil, err
}
// primary; timetolive directly from leader // primary; timetolive directly from leader
le := s.lessor.Lookup(lease.LeaseID(r.ID)) le := s.lessor.Lookup(lease.LeaseID(r.ID))
if le == nil { if le == nil {

View File

@ -1411,7 +1411,7 @@ func (c *Cluster) Endpoints() []string {
func (c *Cluster) ClusterClient() (client *clientv3.Client, err error) { func (c *Cluster) ClusterClient() (client *clientv3.Client, err error) {
if c.clusterClient == nil { if c.clusterClient == nil {
endpoints := []string{} var endpoints []string
for _, m := range c.Members { for _, m := range c.Members {
endpoints = append(endpoints, m.GrpcURL) endpoints = append(endpoints, m.GrpcURL)
} }

View File

@ -16,6 +16,7 @@ package integration
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"math" "math"
"testing" "testing"
@ -487,17 +488,31 @@ func TestV3LeaseLeases(t *testing.T) {
// it was observed that the immediate lease renewal after granting a lease from follower resulted in lease not found. // it was observed that the immediate lease renewal after granting a lease from follower resulted in lease not found.
// related issue https://github.com/etcd-io/etcd/issues/6978 // related issue https://github.com/etcd-io/etcd/issues/6978
func TestV3LeaseRenewStress(t *testing.T) { func TestV3LeaseRenewStress(t *testing.T) {
testLeaseStress(t, stressLeaseRenew) testLeaseStress(t, stressLeaseRenew, false)
}
// TestV3LeaseRenewStressWithClusterClient is similar to TestV3LeaseRenewStress,
// but it uses a cluster client instead of a specific member's client.
// The related issue is https://github.com/etcd-io/etcd/issues/13675.
func TestV3LeaseRenewStressWithClusterClient(t *testing.T) {
testLeaseStress(t, stressLeaseRenew, true)
} }
// TestV3LeaseTimeToLiveStress keeps creating lease and retrieving it immediately to ensure the lease can be retrieved. // TestV3LeaseTimeToLiveStress keeps creating lease and retrieving it immediately to ensure the lease can be retrieved.
// it was observed that the immediate lease retrieval after granting a lease from follower resulted in lease not found. // it was observed that the immediate lease retrieval after granting a lease from follower resulted in lease not found.
// related issue https://github.com/etcd-io/etcd/issues/6978 // related issue https://github.com/etcd-io/etcd/issues/6978
func TestV3LeaseTimeToLiveStress(t *testing.T) { func TestV3LeaseTimeToLiveStress(t *testing.T) {
testLeaseStress(t, stressLeaseTimeToLive) testLeaseStress(t, stressLeaseTimeToLive, false)
} }
func testLeaseStress(t *testing.T, stresser func(context.Context, pb.LeaseClient) error) { // TestV3LeaseTimeToLiveStressWithClusterClient is similar to TestV3LeaseTimeToLiveStress,
// but it uses a cluster client instead of a specific member's client.
// The related issue is https://github.com/etcd-io/etcd/issues/13675.
func TestV3LeaseTimeToLiveStressWithClusterClient(t *testing.T) {
testLeaseStress(t, stressLeaseTimeToLive, true)
}
func testLeaseStress(t *testing.T, stresser func(context.Context, pb.LeaseClient) error, useClusterClient bool) {
integration.BeforeTest(t) integration.BeforeTest(t)
clus := integration.NewCluster(t, &integration.ClusterConfig{Size: 3}) clus := integration.NewCluster(t, &integration.ClusterConfig{Size: 3})
defer clus.Terminate(t) defer clus.Terminate(t)
@ -506,13 +521,23 @@ func testLeaseStress(t *testing.T, stresser func(context.Context, pb.LeaseClient
defer cancel() defer cancel()
errc := make(chan error) errc := make(chan error)
for i := 0; i < 30; i++ { if useClusterClient {
for j := 0; j < 3; j++ { for i := 0; i < 300; i++ {
go func(i int) { errc <- stresser(ctx, integration.ToGRPC(clus.Client(i)).Lease) }(j) clusterClient, err := clus.ClusterClient()
if err != nil {
t.Fatal(err)
}
go func(i int) { errc <- stresser(ctx, integration.ToGRPC(clusterClient).Lease) }(i)
}
} else {
for i := 0; i < 100; i++ {
for j := 0; j < 3; j++ {
go func(i int) { errc <- stresser(ctx, integration.ToGRPC(clus.Client(i)).Lease) }(j)
}
} }
} }
for i := 0; i < 90; i++ { for i := 0; i < 300; i++ {
if err := <-errc; err != nil { if err := <-errc; err != nil {
t.Fatal(err) t.Fatal(err)
} }
@ -543,7 +568,7 @@ func stressLeaseRenew(tctx context.Context, lc pb.LeaseClient) (reterr error) {
continue continue
} }
if rresp.TTL == 0 { if rresp.TTL == 0 {
return fmt.Errorf("TTL shouldn't be 0 so soon") return errors.New("TTL shouldn't be 0 so soon")
} }
} }
return nil return nil