mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
Merge pull request #17864 from fuweid/backport-17815-34
[3.4] fix revision loss issue caused by compaction - 17780
This commit is contained in:
commit
1d02c16e2a
@ -450,6 +450,17 @@ func (s *store) restore() error {
|
|||||||
s.currentRev = s.compactMainRev
|
s.currentRev = s.compactMainRev
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the latest revision was a tombstone revision and etcd just compacted
|
||||||
|
// it, but crashed right before persisting the FinishedCompactRevision,
|
||||||
|
// then it would lead to revision decreasing in bbolt db file. In such
|
||||||
|
// a scenario, we should adjust the current revision using the scheduled
|
||||||
|
// compact revision on bootstrap when etcd gets started again.
|
||||||
|
//
|
||||||
|
// See https://github.com/etcd-io/etcd/issues/17780#issuecomment-2061900231
|
||||||
|
if s.currentRev < scheduledCompact {
|
||||||
|
s.currentRev = scheduledCompact
|
||||||
|
}
|
||||||
|
|
||||||
if scheduledCompact <= s.compactMainRev {
|
if scheduledCompact <= s.compactMainRev {
|
||||||
scheduledCompact = 0
|
scheduledCompact = 0
|
||||||
}
|
}
|
||||||
|
@ -49,6 +49,7 @@ func (s *store) scheduleCompaction(compactMainRev int64, keep map[revision]struc
|
|||||||
}
|
}
|
||||||
|
|
||||||
if len(keys) < s.cfg.CompactionBatchLimit {
|
if len(keys) < s.cfg.CompactionBatchLimit {
|
||||||
|
// gofail: var compactBeforeSetFinishedCompact struct{}
|
||||||
rbytes := make([]byte, 8+1+8)
|
rbytes := make([]byte, 8+1+8)
|
||||||
revToBytes(revision{main: compactMainRev}, rbytes)
|
revToBytes(revision{main: compactMainRev}, rbytes)
|
||||||
tx.UnsafePut(metaBucketName, finishedCompactKeyName, rbytes)
|
tx.UnsafePut(metaBucketName, finishedCompactKeyName, rbytes)
|
||||||
|
@ -140,6 +140,7 @@ type etcdProcessClusterConfig struct {
|
|||||||
|
|
||||||
MaxConcurrentStreams uint32 // default is math.MaxUint32
|
MaxConcurrentStreams uint32 // default is math.MaxUint32
|
||||||
WatchProcessNotifyInterval time.Duration
|
WatchProcessNotifyInterval time.Duration
|
||||||
|
CompactionBatchLimit int
|
||||||
|
|
||||||
debug bool
|
debug bool
|
||||||
|
|
||||||
@ -333,6 +334,9 @@ func (cfg *etcdProcessClusterConfig) etcdServerProcessConfigs() []*etcdServerPro
|
|||||||
if cfg.WatchProcessNotifyInterval != 0 {
|
if cfg.WatchProcessNotifyInterval != 0 {
|
||||||
args = append(args, "--experimental-watch-progress-notify-interval", cfg.WatchProcessNotifyInterval.String())
|
args = append(args, "--experimental-watch-progress-notify-interval", cfg.WatchProcessNotifyInterval.String())
|
||||||
}
|
}
|
||||||
|
if cfg.CompactionBatchLimit != 0 {
|
||||||
|
args = append(args, "--experimental-compaction-batch-limit", fmt.Sprintf("%d", cfg.CompactionBatchLimit))
|
||||||
|
}
|
||||||
|
|
||||||
if cfg.debug {
|
if cfg.debug {
|
||||||
args = append(args, "--debug")
|
args = append(args, "--debug")
|
||||||
|
114
tests/e2e/reproduce_17780_test.go
Normal file
114
tests/e2e/reproduce_17780_test.go
Normal file
@ -0,0 +1,114 @@
|
|||||||
|
// Copyright 2024 The etcd Authors
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package e2e
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/assert"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"go.etcd.io/etcd/clientv3"
|
||||||
|
"go.etcd.io/etcd/pkg/stringutil"
|
||||||
|
"go.etcd.io/etcd/pkg/testutil"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestReproduce17780 reproduces the issue: https://github.com/etcd-io/etcd/issues/17780.
|
||||||
|
func TestReproduce17780(t *testing.T) {
|
||||||
|
defer testutil.AfterTest(t)
|
||||||
|
|
||||||
|
compactionBatchLimit := 10
|
||||||
|
|
||||||
|
ctx := context.TODO()
|
||||||
|
clus, cerr := newEtcdProcessCluster(t, &etcdProcessClusterConfig{
|
||||||
|
clusterSize: 3,
|
||||||
|
goFailEnabled: true,
|
||||||
|
goFailClientTimeout: 40 * time.Second,
|
||||||
|
snapshotCount: 1000,
|
||||||
|
CompactionBatchLimit: compactionBatchLimit,
|
||||||
|
WatchProcessNotifyInterval: 100 * time.Millisecond,
|
||||||
|
})
|
||||||
|
require.NoError(t, cerr)
|
||||||
|
|
||||||
|
t.Cleanup(func() { require.NoError(t, clus.Stop()) })
|
||||||
|
|
||||||
|
leaderIdx := clus.WaitLeader(t)
|
||||||
|
targetIdx := (leaderIdx + 1) % clus.cfg.clusterSize
|
||||||
|
|
||||||
|
cli := newClient(t, clus.procs[targetIdx].EndpointsGRPC(), clientNonTLS, false)
|
||||||
|
|
||||||
|
// Revision: 2 -> 8 for new keys
|
||||||
|
n := compactionBatchLimit - 2
|
||||||
|
valueSize := 16
|
||||||
|
for i := 2; i <= n; i++ {
|
||||||
|
_, err := cli.Put(ctx, fmt.Sprintf("%d", i), stringutil.RandString(uint(valueSize)))
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Revision: 9 -> 11 for delete keys with compared revision
|
||||||
|
//
|
||||||
|
// We need last compaction batch is no-op and all the tombstones should
|
||||||
|
// be deleted in previous compaction batch. So that we just lost the
|
||||||
|
// finishedCompactRev after panic.
|
||||||
|
for i := 9; i <= compactionBatchLimit+1; i++ {
|
||||||
|
rev := i - 5
|
||||||
|
key := fmt.Sprintf("%d", rev)
|
||||||
|
|
||||||
|
_, err := cli.Delete(ctx, key)
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
require.NoError(t, clus.procs[targetIdx].Failpoints().SetupHTTP(ctx, "compactBeforeSetFinishedCompact", `panic`))
|
||||||
|
|
||||||
|
_, err := cli.Compact(ctx, 11, clientv3.WithCompactPhysical())
|
||||||
|
require.Error(t, err)
|
||||||
|
|
||||||
|
require.Error(t, clus.procs[targetIdx].Stop())
|
||||||
|
// NOTE: The proc panics and exit code is 2. It's impossible to restart
|
||||||
|
// that etcd proc because last exit code is 2 and Restart() refuses to
|
||||||
|
// start new one. Using IsRunning() function is to cleanup status.
|
||||||
|
require.False(t, clus.procs[targetIdx].IsRunning())
|
||||||
|
require.NoError(t, clus.procs[targetIdx].Restart())
|
||||||
|
|
||||||
|
// NOTE: We should not decrease the revision if there is no record
|
||||||
|
// about finished compact operation.
|
||||||
|
resp, err := cli.Get(ctx, fmt.Sprintf("%d", n))
|
||||||
|
require.NoError(t, err)
|
||||||
|
assert.GreaterOrEqual(t, resp.Header.Revision, int64(11))
|
||||||
|
|
||||||
|
// Revision 4 should be deleted by compaction.
|
||||||
|
resp, err = cli.Get(ctx, fmt.Sprintf("%d", 4))
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.True(t, resp.Count == 0)
|
||||||
|
|
||||||
|
next := 20
|
||||||
|
for i := 12; i <= next; i++ {
|
||||||
|
_, err := cli.Put(ctx, fmt.Sprintf("%d", i), stringutil.RandString(uint(valueSize)))
|
||||||
|
require.NoError(t, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedRevision := next
|
||||||
|
for procIdx, proc := range clus.procs {
|
||||||
|
cli = newClient(t, proc.EndpointsGRPC(), clientNonTLS, false)
|
||||||
|
resp, err := cli.Get(ctx, fmt.Sprintf("%d", next))
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
assert.GreaterOrEqual(t, resp.Header.Revision, int64(expectedRevision),
|
||||||
|
fmt.Sprintf("LeaderIdx: %d, Current: %d", leaderIdx, procIdx))
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user