etcdserver: resolve review comments in PR 14828

Signed-off-by: Benjamin Wang <wachao@vmware.com>
This commit is contained in:
Benjamin Wang 2022-11-23 15:43:46 +08:00
parent 7b19ee6396
commit 85fc09d09b
2 changed files with 34 additions and 21 deletions

View File

@ -279,7 +279,7 @@ func (cm *corruptionChecker) CompactHashCheck() {
// false: skipped some members, so need to check next hash
func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers []*peerHashKVResp) bool {
leaderId := cm.hasher.MemberId()
hashMap := map[uint32]types.IDSlice{leaderHash.Hash: {leaderId}}
hash2members := map[uint32]types.IDSlice{leaderHash.Hash: {leaderId}}
peersChecked := 0
// group all peers by hash
@ -304,16 +304,16 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
}
peersChecked++
if ids, ok := hashMap[peer.resp.Hash]; !ok {
hashMap[peer.resp.Hash] = []types.ID{peer.id}
if ids, ok := hash2members[peer.resp.Hash]; !ok {
hash2members[peer.resp.Hash] = []types.ID{peer.id}
} else {
ids = append(ids, peer.id)
hashMap[peer.resp.Hash] = ids
hash2members[peer.resp.Hash] = ids
}
}
// All members have the same CompactRevision and Hash.
if len(hashMap) == 1 {
if len(hash2members) == 1 {
if peersChecked == len(peers) {
cm.lg.Info("successfully checked hash on whole cluster",
zap.Int("number-of-peers-checked", peersChecked),
@ -342,32 +342,25 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
memberCnt := len(peers) + 1
quorum := memberCnt/2 + 1
quorumExist := false
for k, v := range hashMap {
for k, v := range hash2members {
if len(v) >= quorum {
quorumExist = true
// remove the majority, and we might raise alarms for the remaining members.
delete(hashMap, k)
delete(hash2members, k)
break
}
}
if !quorumExist {
// If quorumExist doesn't exist, then only raise alarm for the least minority
cm.lg.Error("Detected compaction hash mismatch but can't identify the corrupted members, so only raise alarm for the least minority",
zap.String("leader-id", leaderId.String()),
zap.Int64("leader-revision", leaderHash.Revision),
zap.Int64("leader-compact-revision", leaderHash.CompactRevision),
zap.Uint32("leader-hash", leaderHash.Hash),
)
// find out the members which are the least minority
// If no quorum exists, then only raise an alarm for the least minority.
// The first step is to find out the members which form the least minority.
var (
minCnt int = math.MaxInt
hashVal uint32
memberIDs types.IDSlice
)
for k, v := range hashMap {
for k, v := range hash2members {
if v.Len() < minCnt {
minCnt = v.Len()
hashVal = k
@ -375,10 +368,11 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
}
}
// raise alarms
for _, pid := range memberIDs {
cm.hasher.TriggerCorruptAlarm(pid)
}
delete(hashMap, hashVal)
delete(hash2members, hashVal)
cm.lg.Error("Detected compaction hash mismatch but can't identify the corrupted members, so only raise alarm for the least minority",
zap.String("leader-id", leaderId.String()),
@ -392,12 +386,13 @@ func (cm *corruptionChecker) checkPeerHashes(leaderHash mvcc.KeyValueHash, peers
// Raise alarm for the remaining members if the quorum is present.
// But we should always generate an error log for debugging.
for k, v := range hashMap {
for _, pid := range v {
if quorumExist {
for k, v := range hash2members {
if quorumExist {
for _, pid := range v {
cm.hasher.TriggerCorruptAlarm(pid)
}
}
cm.lg.Error("Detected compaction hash mismatch",
zap.String("leader-id", leaderId.String()),
zap.Int64("leader-revision", leaderHash.Revision),

View File

@ -289,6 +289,24 @@ func TestCompactHashCheck(t *testing.T) {
expectActions: []string{"MemberId()", "ReqTimeout()", "Hashes()", "PeerHashByRev(2)", "MemberId()", "TriggerCorruptAlarm(46)"},
expectCorrupt: true,
},
{
name: "Peer returned same compaction revision but all members have different hash",
hasher: fakeHasher{
hashes: []mvcc.KeyValueHash{{Revision: 1, CompactRevision: 1, Hash: 1}, {Revision: 2, CompactRevision: 1, Hash: 2}},
// Each member is the least minority, and etcd may trigger alarm for any one.
// So intentionally set the same member id for all members.
peerHashes: []*peerHashKVResp{
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 3}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 4}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 5}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 6}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 7}},
{peerInfo: peerInfo{id: 42}, resp: &pb.HashKVResponse{CompactRevision: 1, Hash: 8}},
},
},
expectActions: []string{"MemberId()", "ReqTimeout()", "Hashes()", "PeerHashByRev(2)", "MemberId()", "TriggerCorruptAlarm(42)"},
expectCorrupt: true,
},
{
name: "Peer returned same hash bumps last revision checked",
hasher: fakeHasher{