*: keep tombstone if revision == compactAtRev

Before this patch, the tombstone can be deleted if its revision is equal
compacted revision. It causes that the watch subscriber won't get this
DELETE event. Based on Compact API[1], we should keep tombstone revision
if it's not less than the compaction revision.

> CompactionRequest compacts the key-value store up to a given revision.
> All superseded keys with a revision less than the compaction revision
> will be removed.

[1]: https://etcd.io/docs/latest/dev-guide/api_reference_v3/

Signed-off-by: Wei Fu <fuweid89@gmail.com>
(cherry picked from commit bbdc94181a6d67904b575ad936c20d1be10e220c)
Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
Wei Fu 2024-07-04 14:35:51 +08:00
parent 4b7c1f49dd
commit 2828a12ad6
4 changed files with 503 additions and 97 deletions

View File

@ -18,8 +18,9 @@ import (
"reflect"
"testing"
"github.com/google/btree"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"go.uber.org/zap/zaptest"
)
func TestIndexGet(t *testing.T) {
@ -290,98 +291,445 @@ func TestIndexRevision(t *testing.T) {
func TestIndexCompactAndKeep(t *testing.T) {
maxRev := int64(20)
tests := []struct {
key []byte
remove bool
rev revision
created revision
ver int64
}{
{[]byte("foo"), false, revision{main: 1}, revision{main: 1}, 1},
{[]byte("foo1"), false, revision{main: 2}, revision{main: 2}, 1},
{[]byte("foo2"), false, revision{main: 3}, revision{main: 3}, 1},
{[]byte("foo2"), false, revision{main: 4}, revision{main: 3}, 2},
{[]byte("foo"), false, revision{main: 5}, revision{main: 1}, 2},
{[]byte("foo1"), false, revision{main: 6}, revision{main: 2}, 2},
{[]byte("foo1"), true, revision{main: 7}, revision{}, 0},
{[]byte("foo2"), true, revision{main: 8}, revision{}, 0},
{[]byte("foo"), true, revision{main: 9}, revision{}, 0},
{[]byte("foo"), false, revision{10, 0}, revision{10, 0}, 1},
{[]byte("foo1"), false, revision{10, 1}, revision{10, 1}, 1},
// key: "foo"
// modified: 10
// generations:
// {{10, 0}}
// {{1, 0}, {5, 0}, {9, 0}(t)}
//
// key: "foo1"
// modified: 10, 1
// generations:
// {{10, 1}}
// {{2, 0}, {6, 0}, {7, 0}(t)}
//
// key: "foo2"
// modified: 8
// generations:
// {empty}
// {{3, 0}, {4, 0}, {8, 0}(t)}
//
buildTreeIndex := func() index {
ti := newTreeIndex(zaptest.NewLogger(t))
ti.Put([]byte("foo"), revision{main: 1})
ti.Put([]byte("foo1"), revision{main: 2})
ti.Put([]byte("foo2"), revision{main: 3})
ti.Put([]byte("foo2"), revision{main: 4})
ti.Put([]byte("foo"), revision{main: 5})
ti.Put([]byte("foo1"), revision{main: 6})
require.NoError(t, ti.Tombstone([]byte("foo1"), revision{main: 7}))
require.NoError(t, ti.Tombstone([]byte("foo2"), revision{main: 8}))
require.NoError(t, ti.Tombstone([]byte("foo"), revision{main: 9}))
ti.Put([]byte("foo"), revision{main: 10})
ti.Put([]byte("foo1"), revision{main: 10, sub: 1})
return ti
}
// Continuous Compact and Keep
ti := newTreeIndex(zap.NewExample())
for _, tt := range tests {
if tt.remove {
ti.Tombstone(tt.key, tt.rev)
} else {
ti.Put(tt.key, tt.rev)
}
afterCompacts := []struct {
atRev int
keyIndexes []keyIndex
keep map[revision]struct{}
compacted map[revision]struct{}
}{
{
atRev: 1,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 1}, {main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 3, created: revision{main: 2}, revs: []revision{{main: 2}, {main: 6}, {main: 7}}},
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 3}, {main: 4}, {main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 1}: {},
},
compacted: map[revision]struct{}{
{main: 1}: {},
},
},
{
atRev: 2,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 1}, {main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 3, created: revision{main: 2}, revs: []revision{{main: 2}, {main: 6}, {main: 7}}},
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 3}, {main: 4}, {main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 1}: {},
{main: 2}: {},
},
compacted: map[revision]struct{}{
{main: 1}: {},
{main: 2}: {},
},
},
{
atRev: 3,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 1}, {main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 3, created: revision{main: 2}, revs: []revision{{main: 2}, {main: 6}, {main: 7}}},
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 3}, {main: 4}, {main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 1}: {},
{main: 2}: {},
{main: 3}: {},
},
compacted: map[revision]struct{}{
{main: 1}: {},
{main: 2}: {},
{main: 3}: {},
},
},
{
atRev: 4,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 1}, {main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 3, created: revision{main: 2}, revs: []revision{{main: 2}, {main: 6}, {main: 7}}},
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 4}, {main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 1}: {},
{main: 2}: {},
{main: 4}: {},
},
compacted: map[revision]struct{}{
{main: 1}: {},
{main: 2}: {},
{main: 4}: {},
},
},
{
atRev: 5,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 3, created: revision{main: 2}, revs: []revision{{main: 2}, {main: 6}, {main: 7}}},
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 4}, {main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 2}: {},
{main: 4}: {},
{main: 5}: {},
},
compacted: map[revision]struct{}{
{main: 2}: {},
{main: 4}: {},
{main: 5}: {},
},
},
{
atRev: 6,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 3, created: revision{main: 2}, revs: []revision{{main: 6}, {main: 7}}},
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 4}, {main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 6}: {},
{main: 4}: {},
{main: 5}: {},
},
compacted: map[revision]struct{}{
{main: 6}: {},
{main: 4}: {},
{main: 5}: {},
},
},
{
atRev: 7,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 3, created: revision{main: 2}, revs: []revision{{main: 7}}},
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 4}, {main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 4}: {},
{main: 5}: {},
},
compacted: map[revision]struct{}{
{main: 7}: {},
{main: 4}: {},
{main: 5}: {},
},
},
{
atRev: 8,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 5}, {main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
{
key: []byte("foo2"),
modified: revision{main: 8},
generations: []generation{
{ver: 3, created: revision{main: 3}, revs: []revision{{main: 8}}},
{},
},
},
},
keep: map[revision]struct{}{
{main: 5}: {},
},
compacted: map[revision]struct{}{
{main: 8}: {},
{main: 5}: {},
},
},
{
atRev: 9,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 3, created: revision{main: 1}, revs: []revision{{main: 9}}},
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
},
keep: map[revision]struct{}{},
compacted: map[revision]struct{}{
{main: 9}: {},
},
},
{
atRev: 10,
keyIndexes: []keyIndex{
{
key: []byte("foo"),
modified: revision{main: 10},
generations: []generation{
{ver: 1, created: revision{main: 10}, revs: []revision{{main: 10}}},
},
},
{
key: []byte("foo1"),
modified: revision{main: 10, sub: 1},
generations: []generation{
{ver: 1, created: revision{main: 10, sub: 1}, revs: []revision{{main: 10, sub: 1}}},
},
},
},
keep: map[revision]struct{}{
{main: 10}: {},
{main: 10, sub: 1}: {},
},
compacted: map[revision]struct{}{
{main: 10}: {},
{main: 10, sub: 1}: {},
},
},
}
ti := buildTreeIndex()
// Continuous Compact and Keep
for i := int64(1); i < maxRev; i++ {
j := i - 1
if i >= int64(len(afterCompacts)) {
j = int64(len(afterCompacts)) - 1
}
am := ti.Compact(i)
require.Equal(t, afterCompacts[j].compacted, am, "#%d: compact(%d) != expected", i, i)
keep := ti.Keep(i)
if !(reflect.DeepEqual(am, keep)) {
t.Errorf("#%d: compact keep %v != Keep keep %v", i, am, keep)
}
wti := &treeIndex{tree: btree.New(32)}
for _, tt := range tests {
if _, ok := am[tt.rev]; ok || tt.rev.GreaterThan(revision{main: i}) {
if tt.remove {
wti.Tombstone(tt.key, tt.rev)
} else {
restore(wti, tt.key, tt.created, tt.rev, tt.ver)
}
}
}
if !ti.Equal(wti) {
t.Errorf("#%d: not equal ti", i)
require.Equal(t, afterCompacts[j].keep, keep, "#%d: keep(%d) != expected", i, i)
nti := newTreeIndex(zaptest.NewLogger(t)).(*treeIndex)
for k := range afterCompacts[j].keyIndexes {
ki := afterCompacts[j].keyIndexes[k]
nti.tree.ReplaceOrInsert(&ki)
}
require.True(t, ti.Equal(nti), "#%d: not equal ti", i)
}
// Once Compact and Keep
for i := int64(1); i < maxRev; i++ {
ti := newTreeIndex(zap.NewExample())
for _, tt := range tests {
if tt.remove {
ti.Tombstone(tt.key, tt.rev)
} else {
ti.Put(tt.key, tt.rev)
}
ti := buildTreeIndex()
j := i - 1
if i >= int64(len(afterCompacts)) {
j = int64(len(afterCompacts)) - 1
}
am := ti.Compact(i)
require.Equal(t, afterCompacts[j].compacted, am, "#%d: compact(%d) != expected", i, i)
keep := ti.Keep(i)
if !(reflect.DeepEqual(am, keep)) {
t.Errorf("#%d: compact keep %v != Keep keep %v", i, am, keep)
}
wti := &treeIndex{tree: btree.New(32)}
for _, tt := range tests {
if _, ok := am[tt.rev]; ok || tt.rev.GreaterThan(revision{main: i}) {
if tt.remove {
wti.Tombstone(tt.key, tt.rev)
} else {
restore(wti, tt.key, tt.created, tt.rev, tt.ver)
}
}
}
if !ti.Equal(wti) {
t.Errorf("#%d: not equal ti", i)
require.Equal(t, afterCompacts[j].keep, keep, "#%d: keep(%d) != expected", i, i)
nti := newTreeIndex(zaptest.NewLogger(t)).(*treeIndex)
for k := range afterCompacts[j].keyIndexes {
ki := afterCompacts[j].keyIndexes[k]
nti.tree.ReplaceOrInsert(&ki)
}
require.True(t, ti.Equal(nti), "#%d: not equal ti", i)
}
}
func restore(ti *treeIndex, key []byte, created, modified revision, ver int64) {
keyi := &keyIndex{key: key}
ti.Lock()
defer ti.Unlock()
item := ti.tree.Get(keyi)
if item == nil {
keyi.restore(ti.lg, created, modified, ver)
ti.tree.ReplaceOrInsert(keyi)
return
}
okeyi := item.(*keyIndex)
okeyi.put(ti.lg, modified.main, modified.sub)
}

View File

@ -66,7 +66,8 @@ var (
// compact(5):
// generations:
//
// {empty} -> key SHOULD be removed.
// {empty}
// {5.0(t)}
//
// compact(6):
// generations:
@ -223,8 +224,7 @@ func (ki *keyIndex) since(lg *zap.Logger, rev int64) []revision {
}
// compact compacts a keyIndex by removing the versions with smaller or equal
// revision than the given atRev except the largest one (If the largest one is
// a tombstone, it will not be kept).
// revision than the given atRev except the largest one.
// If a generation becomes empty during compaction, it will be removed.
func (ki *keyIndex) compact(lg *zap.Logger, atRev int64, available map[revision]struct{}) {
if ki.isEmpty() {
@ -246,11 +246,6 @@ func (ki *keyIndex) compact(lg *zap.Logger, atRev int64, available map[revision]
if revIndex != -1 {
g.revs = g.revs[revIndex:]
}
// remove any tombstone
if len(g.revs) == 1 && genIdx != len(ki.generations)-1 {
delete(available, g.revs[0])
genIdx++
}
}
// remove the previous generations.
@ -266,7 +261,12 @@ func (ki *keyIndex) keep(atRev int64, available map[revision]struct{}) {
genIdx, revIndex := ki.doCompact(atRev, available)
g := &ki.generations[genIdx]
if !g.isEmpty() {
// remove any tombstone
// If the given `atRev` is a tombstone, we need to skip it.
//
// Note that this s different from the `compact` function which
// keeps tombstone in such case. We need to stay consistent with
// existing versions, ensuring they always generate the same hash
// values.
if revIndex == len(g.revs)-1 && genIdx != len(ki.generations)-1 {
delete(available, g.revs[revIndex])
}
@ -287,7 +287,7 @@ func (ki *keyIndex) doCompact(atRev int64, available map[revision]struct{}) (gen
genIdx, g := 0, &ki.generations[0]
// find first generation includes atRev or created after atRev
for genIdx < len(ki.generations)-1 {
if tomb := g.revs[len(g.revs)-1].main; tomb > atRev {
if tomb := g.revs[len(g.revs)-1].main; tomb >= atRev {
break
}
genIdx++

View File

@ -18,6 +18,7 @@ import (
"reflect"
"testing"
"github.com/stretchr/testify/assert"
"go.uber.org/zap"
)
@ -298,12 +299,15 @@ func TestKeyIndexCompactAndKeep(t *testing.T) {
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{main: 2}, ver: 3, revs: []revision{{main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 15, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
map[revision]struct{}{
{main: 6}: {},
},
},
{
7,
@ -384,11 +388,14 @@ func TestKeyIndexCompactAndKeep(t *testing.T) {
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{main: 8}, ver: 3, revs: []revision{{main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 15, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
map[revision]struct{}{
{main: 12}: {},
},
},
{
13,
@ -434,7 +441,21 @@ func TestKeyIndexCompactAndKeep(t *testing.T) {
16,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
modified: revision{main: 16},
generations: []generation{
{created: revision{main: 14}, ver: 3, revs: []revision{{main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 16}: {},
},
},
{
17,
&keyIndex{
key: []byte("foo"),
modified: revision{main: 16},
generations: []generation{
{},
},
@ -443,18 +464,36 @@ func TestKeyIndexCompactAndKeep(t *testing.T) {
},
}
isTombstoneRevFn := func(ki *keyIndex, rev int64) bool {
for i := 0; i < len(ki.generations)-1; i++ {
g := ki.generations[i]
if l := len(g.revs); l > 0 && g.revs[l-1].main == rev {
return true
}
}
return false
}
// Continuous Compaction and finding Keep
ki := newTestKeyIndex()
for i, tt := range tests {
isTombstone := isTombstoneRevFn(ki, tt.compact)
am := make(map[revision]struct{})
kiclone := cloneKeyIndex(ki)
ki.keep(tt.compact, am)
if !reflect.DeepEqual(ki, kiclone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiclone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
if isTombstone {
assert.Equal(t, 0, len(am), "#%d: ki = %d, keep result wants empty because tombstone", i, ki)
} else {
assert.Equal(t, tt.wam, am,
"#%d: ki = %d, compact keep should be equal to keep keep if it's not tombstone", i, ki)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {
@ -468,7 +507,7 @@ func TestKeyIndexCompactAndKeep(t *testing.T) {
// Jump Compaction and finding Keep
ki = newTestKeyIndex()
for i, tt := range tests {
if (i%2 == 0 && i < 6) || (i%2 == 1 && i > 6) {
if !isTombstoneRevFn(ki, tt.compact) {
am := make(map[revision]struct{})
kiclone := cloneKeyIndex(ki)
ki.keep(tt.compact, am)
@ -498,9 +537,14 @@ func TestKeyIndexCompactAndKeep(t *testing.T) {
if !reflect.DeepEqual(ki, kiClone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiClone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
if isTombstoneRevFn(ki, tt.compact) {
assert.Equal(t, 0, len(am), "#%d: ki = %d, keep result wants empty because tombstone", i, ki)
} else {
assert.Equal(t, tt.wam, am,
"#%d: ki = %d, compact keep should be equal to keep keep if it's not tombstone", i, ki)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {

View File

@ -228,6 +228,9 @@ func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev
if !upper.GreaterThan(kr) {
return nil
}
isTombstoneRev := isTombstone(k)
// skip revisions that are scheduled for deletion
// due to compacting; don't skip if there isn't one.
if lower.GreaterThan(kr) && len(keep) > 0 {
@ -235,6 +238,17 @@ func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev
return nil
}
}
// When performing compaction, if the compacted revision is a
// tombstone, older versions (<= 3.5.15 or <= 3.4.33) will delete
// the tombstone. But newer versions (> 3.5.15 or > 3.4.33) won't
// delete it. So we should skip the tombstone in such cases when
// computing the hash to ensure that both older and newer versions
// can always generate the same hash values.
if kr.main == compactRev && isTombstoneRev {
return nil
}
h.Write(k)
h.Write(v)
return nil