etcdserver: resolve review comments in PR 14828

Signed-off-by: Benjamin Wang <wachao@vmware.com>
This commit is contained in:
Benjamin Wang 2022-11-23 15:43:46 +08:00
parent 7b19ee6396
commit 85fc09d09b
2 changed files with 34 additions and 21 deletions

View File

@ -279,7 +279,7 @@ func (cm *corruptionChecker) CompactHashCheck() {
// false: skipped some members, so need to check next hash // false: skipped some members, so need to check next hash
func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers []*peerHashKVResp) bool { func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers []*peerHashKVResp) bool {
leaderId := cm.hasher.MemberId() leaderId := cm.hasher.MemberId()
hashMap := map[uint32]types.IDSlice{leaderHash.Hash: {leaderId}} hash2members := map[uint32]types.IDSlice{leaderHash.Hash: {leaderId}}
peersChecked := 0 peersChecked := 0
// group all peers by hash // group all peers by hash
@ -304,16 +304,16 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
} }
peersChecked++ peersChecked++
if ids, ok := hashMap[peer.resp.Hash]; !ok { if ids, ok := hash2members[peer.resp.Hash]; !ok {
hashMap[peer.resp.Hash] = []types.ID{peer.id} hash2members[peer.resp.Hash] = []types.ID{peer.id}
} else { } else {
ids = append(ids, peer.id) ids = append(ids, peer.id)
hashMap[peer.resp.Hash] = ids hash2members[peer.resp.Hash] = ids
} }
} }
// All members have the same CompactRevision and Hash. // All members have the same CompactRevision and Hash.
if len(hashMap) == 1 { if len(hash2members) == 1 {
if peersChecked == len(peers) { if peersChecked == len(peers) {
cm.lg.Info("successfully checked hash on whole cluster", cm.lg.Info("successfully checked hash on whole cluster",
zap.Int("number-of-peers-checked", peersChecked), zap.Int("number-of-peers-checked", peersChecked),
@ -342,32 +342,25 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
memberCnt := len(peers) + 1 memberCnt := len(peers) + 1
quorum := memberCnt/2 + 1 quorum := memberCnt/2 + 1
quorumExist := false quorumExist := false
for k, v := range hashMap { for k, v := range hash2members {
if len(v) >= quorum { if len(v) >= quorum {
quorumExist = true quorumExist = true
// remove the majority, and we might raise alarms for the left members. // remove the majority, and we might raise alarms for the left members.
delete(hashMap, k) delete(hash2members, k)
break break
} }
} }
if !quorumExist { if !quorumExist {
// If quorumExist doesn't exist, then only raise alarm for the least minority // If quorumExist doesn't exist, then only raise alarm for the least minority.
cm.lg.Error("Detected compaction hash mismatch but can't identify the corrupted members, so only raise alarm for the least minority", // The first step is to find out the members which are the least minority.
zap.String("leader-id", leaderId.String()),
zap.Int64("leader-revision", leaderHash.Revision),
zap.Int64("leader-compact-revision", leaderHash.CompactRevision),
zap.Uint32("leader-hash", leaderHash.Hash),
)
// find out the members which are the least minority
var ( var (
minCnt int = math.MaxInt minCnt int = math.MaxInt
hashVal uint32 hashVal uint32
memberIDs types.IDSlice memberIDs types.IDSlice
) )
for k, v := range hashMap { for k, v := range hash2members {
if v.Len() < minCnt { if v.Len() < minCnt {
minCnt = v.Len() minCnt = v.Len()
hashVal = k hashVal = k
@ -375,10 +368,11 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
} }
} }
// raise alarms
for _, pid := range memberIDs { for _, pid := range memberIDs {
cm.hasher.TriggerCorruptAlarm(pid) cm.hasher.TriggerCorruptAlarm(pid)
} }
delete(hashMap, hashVal) delete(hash2members, hashVal)
cm.lg.Error("Detected compaction hash mismatch but can't identify the corrupted members, so only raise alarm for the least minority", cm.lg.Error("Detected compaction hash mismatch but can't identify the corrupted members, so only raise alarm for the least minority",
zap.String("leader-id", leaderId.String()), zap.String("leader-id", leaderId.String()),
@ -392,12 +386,13 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
// Raise alarm for the left members if the quorum is present. // Raise alarm for the left members if the quorum is present.
// But we should always generate error log for debugging. // But we should always generate error log for debugging.
for k, v := range hashMap { for k, v := range hash2members {
for _, pid := range v {
if quorumExist { if quorumExist {
for _, pid := range v {
cm.hasher.TriggerCorruptAlarm(pid) cm.hasher.TriggerCorruptAlarm(pid)
} }
} }
cm.lg.Error("Detected compaction hash mismatch", cm.lg.Error("Detected compaction hash mismatch",
zap.String("leader-id", leaderId.String()), zap.String("leader-id", leaderId.String()),
zap.Int64("leader-revision", leaderHash.Revision), zap.Int64("leader-revision", leaderHash.Revision),

View File

@ -289,6 +289,24 @@ func TestCompactHashCheck(t *testing.T) {
expectActions: []string{"MemberId()", "ReqTimeout()", "Hashes()", "PeerHashByRev(2)", "MemberId()", "TriggerCorruptAlarm(46)"}, expectActions: []string{"MemberId()", "ReqTimeout()", "Hashes()", "PeerHashByRev(2)", "MemberId()", "TriggerCorruptAlarm(46)"},
expectCorrupt: true, expectCorrupt: true,
}, },
{
name: "Peer returned same compaction revision but all members have different hash",
hasher: fakeHasher{
hashes: []mvcc.KeyValueHash{{Revision: 1, CompactRevision: 1, Hash: 1}, {Revision: 2, CompactRevision: 1, Hash: 2}},
// Each member is the least minority, and etcd may trigger alarm for any one.
// So intentionally set the same member id for all members.
peerHashes: []*peerHashKVResp{
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 3}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 4}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 5}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 6}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 7}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 8}},
},
},
expectActions: []string{"MemberId()", "ReqTimeout()", "Hashes()", "PeerHashByRev(2)", "MemberId()", "TriggerCorruptAlarm(42)"},
expectCorrupt: true,
},
{ {
name: "Peer returned same hash bumps last revision checked", name: "Peer returned same hash bumps last revision checked",
hasher: fakeHasher{ hasher: fakeHasher{