mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
raft: fix correctness bug in CommittedEntries pagination
In #9982, a mechanism to limit the size of `CommittedEntries` was introduced. The way this mechanism worked was that it would load applicable entries (passing the max size hint) and would emit a `HardState` whose commit index was truncated to match the limitation applied to the entries. Unfortunately, this was subtly incorrect when the user-provided `Entries` implementation didn't exactly match what Raft uses internally. Depending on whether a `Node` or a `RawNode` was used, this would either lead to regressing the HardState's commit index or outright forgetting to apply entries, respectively. Asking implementers to precisely match the Raft size limitation semantics was considered but looks like a bad idea as it puts correctness squarely in the hands of downstream users. Instead, this PR removes the truncation of `HardState` when limiting is active and tracks the applied index separately. This removes the old paradigm (that the previous code tried to work around) that the client will always apply all the way to the commit index, which isn't true when commit entries are paginated. See [1] for more on the discovery of this bug (CockroachDB's implementation of `Entries` returns one more entry than Raft's when the size limit hits). [1]: https://github.com/cockroachdb/cockroach/issues/28918#issuecomment-418174448
This commit is contained in:
@@ -17,6 +17,8 @@ package raft
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -926,3 +928,72 @@ func TestCommitPagination(t *testing.T) {
|
||||
s.Append(rd.Entries)
|
||||
n.Advance()
|
||||
}
|
||||
|
||||
type ignoreSizeHintMemStorage struct {
|
||||
*MemoryStorage
|
||||
}
|
||||
|
||||
func (s *ignoreSizeHintMemStorage) Entries(lo, hi uint64, maxSize uint64) ([]raftpb.Entry, error) {
|
||||
return s.MemoryStorage.Entries(lo, hi, math.MaxUint64)
|
||||
}
|
||||
|
||||
// TestNodeCommitPaginationAfterRestart regression tests a scenario in which the
|
||||
// Storage's Entries size limitation is slightly more permissive than Raft's
|
||||
// internal one. The original bug was the following:
|
||||
//
|
||||
// - node learns that index 11 (or 100, doesn't matter) is committed
|
||||
// - nextEnts returns index 1..10 in CommittedEntries due to size limiting. However,
|
||||
// index 10 already exceeds maxBytes, due to a user-provided impl of Entries.
|
||||
// - Commit index gets bumped to 10
|
||||
// - the node persists the HardState, but crashes before applying the entries
|
||||
// - upon restart, the storage returns the same entries, but `slice` takes a different code path
|
||||
// (since it is now called with an upper bound of 10) and removes the last entry.
|
||||
// - Raft emits a HardState with a regressing commit index.
|
||||
//
|
||||
// A simpler version of this test would have the storage return a lot less entries than dictated
|
||||
// by maxSize (for example, exactly one entry) after the restart, resulting in a larger regression.
|
||||
// This wouldn't need to exploit anything about Raft-internal code paths to fail.
|
||||
func TestNodeCommitPaginationAfterRestart(t *testing.T) {
|
||||
s := &ignoreSizeHintMemStorage{
|
||||
MemoryStorage: NewMemoryStorage(),
|
||||
}
|
||||
persistedHardState := raftpb.HardState{
|
||||
Term: 1,
|
||||
Vote: 1,
|
||||
Commit: 10,
|
||||
}
|
||||
|
||||
s.hardState = persistedHardState
|
||||
s.ents = make([]raftpb.Entry, 10)
|
||||
var size uint64
|
||||
for i := range s.ents {
|
||||
ent := raftpb.Entry{
|
||||
Term: 1,
|
||||
Index: uint64(i + 1),
|
||||
Type: raftpb.EntryNormal,
|
||||
Data: []byte("a"),
|
||||
}
|
||||
|
||||
s.ents[i] = ent
|
||||
size += uint64(ent.Size())
|
||||
}
|
||||
|
||||
cfg := newTestConfig(1, []uint64{1}, 10, 1, s)
|
||||
// Set a MaxSizePerMsg that would suggest to Raft that the last committed entry should
|
||||
// not be included in the initial rd.CommittedEntries. However, our storage will ignore
|
||||
// this and *will* return it (which is how the Commit index ended up being 10 initially).
|
||||
cfg.MaxSizePerMsg = size - uint64(s.ents[len(s.ents)-1].Size()) - 1
|
||||
|
||||
r := newRaft(cfg)
|
||||
n := newNode()
|
||||
go n.run(r)
|
||||
defer n.Stop()
|
||||
|
||||
rd := readyWithTimeout(&n)
|
||||
if !IsEmptyHardState(rd.HardState) && rd.HardState.Commit < persistedHardState.Commit {
|
||||
t.Errorf("HardState regressed: Commit %d -> %d\nCommitting:\n%+v",
|
||||
persistedHardState.Commit, rd.HardState.Commit,
|
||||
DescribeEntries(rd.CommittedEntries, func(data []byte) string { return fmt.Sprintf("%q", data) }),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user