fix the data inconsistency issue by adding a txPostLockHook into the backend
Previously, SetConsistentIndex() was called during the apply workflow, but outside the db transaction. If a commit happened between SetConsistentIndex and the rest of the apply workflow, and etcd crashed for whatever reason right after that commit, then etcd committed an incomplete transaction to the db and eventually ran into the data inconsistency issue. In this commit we move SetConsistentIndex into a txPostLockHook, so it is executed inside the transaction lock.
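For readers skimming the diff below, here is a minimal, self-contained Go sketch of the idea. The indexState and batchTx types, their field names, and the main() wiring are illustrative stand-ins, not etcd's actual backend or cindex API: the apply path caches the entry's index, then takes the batch-tx lock through a variant that runs a post-lock hook, and the hook folds the cached index into the same locked section as the entry's writes.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// indexState is an illustrative stand-in for etcd's cindex bookkeeping:
// applying is a volatile cache of the entry being applied; persisted is the
// value that ends up in the backend.
type indexState struct {
	applying  uint64
	persisted uint64
}

// batchTx is an illustrative stand-in for the backend's batch transaction.
type batchTx struct {
	mu           sync.Mutex
	postLockHook func() // runs right after the lock is acquired, inside the tx boundary
}

// LockInsideApply is the variant used by code running in the apply workflow:
// the hook folds the cached applying index into the same locked section as
// the entry's writes, so a concurrent periodic commit cannot land between
// the index update and those writes.
func (t *batchTx) LockInsideApply() {
	t.mu.Lock()
	if t.postLockHook != nil {
		t.postLockHook()
	}
}

// LockOutsideApply is for callers outside the apply workflow (bucket
// creation, trimming, alarm restore in this diff); no hook runs here.
func (t *batchTx) LockOutsideApply() { t.mu.Lock() }

func (t *batchTx) Unlock() { t.mu.Unlock() }

func main() {
	ci := &indexState{}
	tx := &batchTx{}
	tx.postLockHook = func() {
		// Promote the cached index only when it has moved ahead.
		if idx := atomic.LoadUint64(&ci.applying); idx > atomic.LoadUint64(&ci.persisted) {
			atomic.StoreUint64(&ci.persisted, idx)
		}
	}

	// Apply workflow: cache the raft entry index first, then take the lock;
	// the consistent index becomes part of the same locked transaction as
	// the writes that follow.
	atomic.StoreUint64(&ci.applying, 42)
	tx.LockInsideApply()
	// ... the entry's backend writes would happen here ...
	tx.Unlock()

	fmt.Println("persisted consistent index:", atomic.LoadUint64(&ci.persisted)) // 42
}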
@@ -52,7 +52,7 @@ func unsafeSaveMemberToBackend(lg *zap.Logger, be backend.Backend, m *Member) er
 	}

 	tx := be.BatchTx()
-	tx.Lock()
+	tx.LockInsideApply()
 	defer tx.Unlock()
 	if unsafeMemberExists(tx, mkey) {
 		return errMemberAlreadyExist
@@ -65,7 +65,7 @@ func unsafeSaveMemberToBackend(lg *zap.Logger, be backend.Backend, m *Member) er
 // from the v3 backend.
 func TrimClusterFromBackend(be backend.Backend) error {
 	tx := be.BatchTx()
-	tx.Lock()
+	tx.LockOutsideApply()
 	defer tx.Unlock()
 	tx.UnsafeDeleteBucket(buckets.Cluster)
 	return nil
@@ -75,7 +75,7 @@ func unsafeDeleteMemberFromBackend(be backend.Backend, id types.ID) error {
 	mkey := backendMemberKey(id)

 	tx := be.BatchTx()
-	tx.Lock()
+	tx.LockInsideApply()
 	defer tx.Unlock()
 	tx.UnsafePut(buckets.MembersRemoved, mkey, []byte("removed"))
 	if !unsafeMemberExists(tx, mkey) {
@@ -140,7 +140,7 @@ func mustReadMembersFromBackend(lg *zap.Logger, be backend.Backend) (map[types.I
 func TrimMembershipFromBackend(lg *zap.Logger, be backend.Backend) error {
 	lg.Info("Trimming membership information from the backend...")
 	tx := be.BatchTx()
-	tx.Lock()
+	tx.LockOutsideApply()
 	defer tx.Unlock()
 	err := tx.UnsafeForEach(buckets.Members, func(k, v []byte) error {
 		tx.UnsafeDelete(buckets.Members, k)
@@ -185,7 +185,7 @@ func mustSaveClusterVersionToBackend(be backend.Backend, ver *semver.Version) {
 	ckey := backendClusterVersionKey()

 	tx := be.BatchTx()
-	tx.Lock()
+	tx.LockInsideApply()
 	defer tx.Unlock()
 	tx.UnsafePut(buckets.Cluster, ckey, []byte(ver.String()))
 }
@@ -198,7 +198,7 @@ func mustSaveDowngradeToBackend(lg *zap.Logger, be backend.Backend, downgrade *D
 		lg.Panic("failed to marshal downgrade information", zap.Error(err))
 	}
 	tx := be.BatchTx()
-	tx.Lock()
+	tx.LockInsideApply()
 	defer tx.Unlock()
 	tx.UnsafePut(buckets.Cluster, dkey, dvalue)
 }
@@ -316,7 +316,7 @@ func backendDowngradeKey() []byte {

 func mustCreateBackendBuckets(be backend.Backend) {
 	tx := be.BatchTx()
-	tx.Lock()
+	tx.LockOutsideApply()
 	defer tx.Unlock()
 	tx.UnsafeCreateBucket(buckets.Members)
 	tx.UnsafeCreateBucket(buckets.MembersRemoved)
@@ -65,7 +65,7 @@ func (a *AlarmStore) Activate(id types.ID, at pb.AlarmType) *pb.AlarmMember {
 	}

 	b := a.bg.Backend()
-	b.BatchTx().Lock()
+	b.BatchTx().LockInsideApply()
 	b.BatchTx().UnsafePut(buckets.Alarm, v, nil)
 	b.BatchTx().Unlock()

@@ -94,7 +94,7 @@ func (a *AlarmStore) Deactivate(id types.ID, at pb.AlarmType) *pb.AlarmMember {
 	}

 	b := a.bg.Backend()
-	b.BatchTx().Lock()
+	b.BatchTx().LockInsideApply()
 	b.BatchTx().UnsafeDelete(buckets.Alarm, v)
 	b.BatchTx().Unlock()

@@ -122,7 +122,7 @@ func (a *AlarmStore) restore() error {
 	b := a.bg.Backend()
 	tx := b.BatchTx()

-	tx.Lock()
+	tx.LockOutsideApply()
 	tx.UnsafeCreateBucket(buckets.Alarm)
 	err := tx.UnsafeForEach(buckets.Alarm, func(k, v []byte) error {
 		var m pb.AlarmMember
@@ -99,7 +99,7 @@ func openBackend(cfg config.ServerConfig, hooks backend.Hooks) backend.Backend {
 func recoverSnapshotBackend(cfg config.ServerConfig, oldbe backend.Backend, snapshot raftpb.Snapshot, beExist bool, hooks backend.Hooks) (backend.Backend, error) {
 	consistentIndex := uint64(0)
 	if beExist {
-		consistentIndex, _ = cindex.ReadConsistentIndex(oldbe.BatchTx())
+		consistentIndex, _ = cindex.ReadConsistentIndex(oldbe.ReadTx())
 	}
 	if snapshot.Metadata.Index <= consistentIndex {
 		return oldbe, nil
@@ -34,9 +34,18 @@ type ConsistentIndexer interface {
 	// ConsistentIndex returns the consistent index of current executing entry.
 	ConsistentIndex() uint64

+	// ConsistentApplyingIndex returns the consistent applying index of current executing entry.
+	ConsistentApplyingIndex() (uint64, uint64)
+
+	// UnsafeConsistentIndex is similar to ConsistentIndex, but it doesn't lock the transaction.
+	UnsafeConsistentIndex() uint64
+
 	// SetConsistentIndex set the consistent index of current executing entry.
 	SetConsistentIndex(v uint64, term uint64)

+	// SetConsistentApplyingIndex set the consistent applying index of current executing entry.
+	SetConsistentApplyingIndex(v uint64, term uint64)
+
 	// UnsafeSave must be called holding the lock on the tx.
 	// It saves consistentIndex to the underlying stable storage.
 	UnsafeSave(tx backend.BatchTx)
@@ -56,6 +65,12 @@ type consistentIndex struct {
 	// The value is being persisted in the backend since v3.5.
 	term uint64

+	// applyingIndex and applyingTerm are just temporary cache of the raftpb.Entry.Index
+	// and raftpb.Entry.Term, and they are not ready to be persisted yet. They will be
+	// saved to consistentIndex and term above in the txPostLockInsideApplyHook.
+	applyingIndex uint64
+	applyingTerm uint64
+
 	// be is used for initial read consistentIndex
 	be Backend
 	// mutex is protecting be.
@@ -75,7 +90,17 @@ func (ci *consistentIndex) ConsistentIndex() uint64 {
 	ci.mutex.Lock()
 	defer ci.mutex.Unlock()

-	v, term := ReadConsistentIndex(ci.be.BatchTx())
+	v, term := ReadConsistentIndex(ci.be.ReadTx())
 	ci.SetConsistentIndex(v, term)
 	return v
 }
+
+func (ci *consistentIndex) UnsafeConsistentIndex() uint64 {
+	if index := atomic.LoadUint64(&ci.consistentIndex); index > 0 {
+		return index
+	}
+
+	v, term := unsafeReadConsistentIndex(ci.be.ReadTx())
+	ci.SetConsistentIndex(v, term)
+	return v
+}
@@ -99,6 +124,15 @@ func (ci *consistentIndex) SetBackend(be Backend) {
 	ci.SetConsistentIndex(0, 0)
 }

+func (ci *consistentIndex) ConsistentApplyingIndex() (uint64, uint64) {
+	return atomic.LoadUint64(&ci.applyingIndex), atomic.LoadUint64(&ci.applyingTerm)
+}
+
+func (ci *consistentIndex) SetConsistentApplyingIndex(v uint64, term uint64) {
+	atomic.StoreUint64(&ci.applyingIndex, v)
+	atomic.StoreUint64(&ci.applyingTerm, term)
+}
+
 func NewFakeConsistentIndex(index uint64) ConsistentIndexer {
 	return &fakeConsistentIndex{index: index}
 }
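Note that the new ConsistentApplyingIndex/SetConsistentApplyingIndex pair above only touches in-memory atomics; nothing is persisted until the post-lock hook later promotes the cached values into the real consistent index. A standalone illustration of that contract follows (the applyingIndex type and its field names are hypothetical, not the cindex package):

package main

import (
	"fmt"
	"sync/atomic"
)

// applyingIndex mirrors the in-memory cache added to consistentIndex in this
// diff: SetConsistentApplyingIndex only stores to atomics and persists
// nothing, so it is safe to call before the batch-tx lock is taken.
type applyingIndex struct {
	index uint64
	term  uint64
}

func (a *applyingIndex) SetConsistentApplyingIndex(v, term uint64) {
	atomic.StoreUint64(&a.index, v)
	atomic.StoreUint64(&a.term, term)
}

func (a *applyingIndex) ConsistentApplyingIndex() (uint64, uint64) {
	return atomic.LoadUint64(&a.index), atomic.LoadUint64(&a.term)
}

func main() {
	var ci applyingIndex
	ci.SetConsistentApplyingIndex(7, 3) // cached only; durable state untouched
	idx, term := ci.ConsistentApplyingIndex()
	fmt.Println(idx, term) // 7 3
}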
@@ -108,12 +142,24 @@ type fakeConsistentIndex struct {
 	term uint64
 }

-func (f *fakeConsistentIndex) ConsistentIndex() uint64 { return f.index }
+func (f *fakeConsistentIndex) ConsistentIndex() uint64 {
+	return atomic.LoadUint64(&f.index)
+}
+func (f *fakeConsistentIndex) ConsistentApplyingIndex() (uint64, uint64) {
+	return atomic.LoadUint64(&f.index), atomic.LoadUint64(&f.term)
+}
+func (f *fakeConsistentIndex) UnsafeConsistentIndex() uint64 {
+	return atomic.LoadUint64(&f.index)
+}

 func (f *fakeConsistentIndex) SetConsistentIndex(index uint64, term uint64) {
 	atomic.StoreUint64(&f.index, index)
 	atomic.StoreUint64(&f.term, term)
 }
+func (f *fakeConsistentIndex) SetConsistentApplyingIndex(index uint64, term uint64) {
+	atomic.StoreUint64(&f.index, index)
+	atomic.StoreUint64(&f.term, term)
+}

 func (f *fakeConsistentIndex) UnsafeSave(_ backend.BatchTx) {}
 func (f *fakeConsistentIndex) SetBackend(_ Backend) {}
@@ -125,7 +171,7 @@ func UnsafeCreateMetaBucket(tx backend.BatchTx) {

 // CreateMetaBucket creates the `meta` bucket (if it does not exists yet).
 func CreateMetaBucket(tx backend.BatchTx) {
-	tx.Lock()
+	tx.LockOutsideApply()
 	defer tx.Unlock()
 	tx.UnsafeCreateBucket(buckets.Meta)
 }
@@ -174,7 +220,7 @@ func UnsafeUpdateConsistentIndex(tx backend.BatchTx, index uint64, term uint64)
 }

 func UpdateConsistentIndex(tx backend.BatchTx, index uint64, term uint64) {
-	tx.Lock()
+	tx.LockOutsideApply()
 	defer tx.Unlock()
 	UnsafeUpdateConsistentIndex(tx, index, term)
 }
@@ -661,6 +661,10 @@ func NewServer(cfg config.ServerConfig) (srv *EtcdServer, err error) {
 		})
 	}

+	// Set the hook after EtcdServer finishes the initialization to avoid
+	// the hook being called during the initialization process.
+	srv.be.SetTxPostLockInsideApplyHook(srv.getTxPostLockInsideApplyHook())
+
 	// TODO: move transport initialization near the definition of remote
 	tr := &rafthttp.Transport{
 		Logger: cfg.Logger,
@@ -1260,6 +1264,7 @@ func (s *EtcdServer) applySnapshot(ep *etcdProgress, apply *apply) {
 	}

 	s.consistIndex.SetBackend(newbe)
+	newbe.SetTxPostLockInsideApplyHook(s.getTxPostLockInsideApplyHook())
 	lg.Info("restored mvcc store", zap.Uint64("consistent-index", s.consistIndex.ConsistentIndex()))

 	// Closing old backend might block until all the txns
@@ -2128,7 +2133,7 @@ func (s *EtcdServer) apply(

 		// set the consistent index of current executing entry
 		if e.Index > s.consistIndex.ConsistentIndex() {
-			s.consistIndex.SetConsistentIndex(e.Index, e.Term)
+			s.consistIndex.SetConsistentApplyingIndex(e.Index, e.Term)
 			shouldApplyV3 = membership.ApplyBoth
 		}

@@ -2155,10 +2160,18 @@
 // applyEntryNormal apples an EntryNormal type raftpb request to the EtcdServer
 func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
 	shouldApplyV3 := membership.ApplyV2storeOnly
+	applyV3Performed := false
+	defer func() {
+		// The txPostLock callback will not get called in this case,
+		// so we should set the consistent index directly.
+		if s.consistIndex != nil && !applyV3Performed && membership.ApplyBoth == shouldApplyV3 {
+			s.consistIndex.SetConsistentIndex(e.Index, e.Term)
+		}
+	}()
 	index := s.consistIndex.ConsistentIndex()
 	if e.Index > index {
 		// set the consistent index of current executing entry
-		s.consistIndex.SetConsistentIndex(e.Index, e.Term)
+		s.consistIndex.SetConsistentApplyingIndex(e.Index, e.Term)
 		shouldApplyV3 = membership.ApplyBoth
 	}
 	s.lg.Debug("apply entry normal",
@@ -2207,6 +2220,7 @@ func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
 		if !needResult && raftReq.Txn != nil {
 			removeNeedlessRangeReqs(raftReq.Txn)
 		}
+		applyV3Performed = true
 		ar = s.applyV3.Apply(&raftReq, shouldApplyV3)
 	}

@@ -2258,6 +2272,13 @@ func (s *EtcdServer) applyConfChange(cc raftpb.ConfChange, confState *raftpb.Con
 	if err := s.cluster.ValidateConfigurationChange(cc); err != nil {
 		cc.NodeID = raft.None
 		s.r.ApplyConfChange(cc)
+
+		// The txPostLock callback will not get called in this case,
+		// so we should set the consistent index directly.
+		if s.consistIndex != nil && membership.ApplyBoth == shouldApplyV3 {
+			applyingIndex, applyingTerm := s.consistIndex.ConsistentApplyingIndex()
+			s.consistIndex.SetConsistentIndex(applyingIndex, applyingTerm)
+		}
 		return false, err
 	}

@@ -2683,6 +2704,15 @@ func (s *EtcdServer) raftStatus() raft.Status {
 	return s.r.Node.Status()
 }

+func (s *EtcdServer) getTxPostLockInsideApplyHook() func() {
+	return func() {
+		applyingIdx, applyingTerm := s.consistIndex.ConsistentApplyingIndex()
+		if applyingIdx > s.consistIndex.UnsafeConsistentIndex() {
+			s.consistIndex.SetConsistentIndex(applyingIdx, applyingTerm)
+		}
+	}
+}
+
 func maybeDefragBackend(cfg config.ServerConfig, be backend.Backend) error {
 	size := be.Size()
 	sizeInUse := be.SizeInUse()
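The getTxPostLockInsideApplyHook added above only promotes the cached applying index when it is strictly ahead of the currently known consistent index, so re-applied or stale entries never move the persisted index backwards. A tiny standalone sketch of that guard (the promote helper is hypothetical, not etcd code):

package main

import "fmt"

// promote mirrors the guard in the post-lock hook: the cached applying
// index replaces the consistent index only when it has moved ahead.
func promote(consistent, consistentTerm, applying, applyingTerm uint64) (uint64, uint64) {
	if applying > consistent {
		return applying, applyingTerm
	}
	return consistent, consistentTerm
}

func main() {
	idx, term := promote(10, 2, 11, 2) // normal forward progress
	fmt.Println(idx, term)             // 11 2

	idx, term = promote(11, 2, 11, 2) // same entry seen again: no change
	fmt.Println(idx, term)            // 11 2

	idx, term = promote(11, 2, 9, 2) // stale cache: never moves backwards
	fmt.Println(idx, term)           // 11 2
}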
@@ -686,9 +686,7 @@ func TestApplyConfigChangeUpdatesConsistIndex(t *testing.T) {

 	_, appliedi, _ := srv.apply(ents, &raftpb.ConfState{})
 	consistIndex := srv.consistIndex.ConsistentIndex()
-	if consistIndex != appliedi {
-		t.Fatalf("consistIndex = %v, want %v", consistIndex, appliedi)
-	}
 	assert.Equal(t, uint64(2), appliedi)

 	t.Run("verify-backend", func(t *testing.T) {
 		tx := be.BatchTx()
@@ -697,9 +695,8 @@ func TestApplyConfigChangeUpdatesConsistIndex(t *testing.T) {
 		srv.beHooks.OnPreCommitUnsafe(tx)
 		assert.Equal(t, raftpb.ConfState{Voters: []uint64{2}}, *membership.UnsafeConfStateFromBackend(lg, tx))
 	})
-	rindex, rterm := cindex.ReadConsistentIndex(be.BatchTx())
+	rindex, _ := cindex.ReadConsistentIndex(be.ReadTx())
 	assert.Equal(t, consistIndex, rindex)
-	assert.Equal(t, uint64(4), rterm)
 }

 func realisticRaftNode(lg *zap.Logger) *raftNode {