server: Move server files to 'server' directory.

git mv mvcc wal auth etcdserver etcdmain proxy embed/ lease/ server
git mv go.mod go.sum server
Piotr Tabor committed 2020-10-20 23:05:17 +02:00
parent eee8dec0c3
commit 4a5e9d1261
316 changed files with 0 additions and 0 deletions


@@ -0,0 +1,572 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
"sync/atomic"
"time"
humanize "github.com/dustin/go-humanize"
bolt "go.etcd.io/bbolt"
"go.uber.org/zap"
)
var (
defaultBatchLimit = 10000
defaultBatchInterval = 100 * time.Millisecond
defragLimit = 10000
// initialMmapSize is the initial size of the mmapped region. Setting this larger than
// the potential max db size can prevent the writer from blocking readers.
// This only works on linux.
initialMmapSize = uint64(10 * 1024 * 1024 * 1024)
// minSnapshotWarningTimeout is the minimum threshold to trigger a long running snapshot warning.
minSnapshotWarningTimeout = 30 * time.Second
)
type Backend interface {
// ReadTx returns a read transaction. It is replaced by ConcurrentReadTx in the main data path, see #10523.
ReadTx() ReadTx
BatchTx() BatchTx
// ConcurrentReadTx returns a non-blocking read transaction.
ConcurrentReadTx() ReadTx
Snapshot() Snapshot
Hash(ignores map[IgnoreKey]struct{}) (uint32, error)
// Size returns the current size of the backend physically allocated.
// The backend can hold DB space that is not utilized at the moment,
// since it can conduct pre-allocation or spare unused space for recycling.
// Use SizeInUse() instead for the actual DB size.
Size() int64
// SizeInUse returns the current size of the backend logically in use.
// Since the backend can manage free space in a non-byte unit such as
// number of pages, the returned value may not be exactly accurate in bytes.
SizeInUse() int64
// OpenReadTxN returns the number of currently open read transactions in the backend.
OpenReadTxN() int64
Defrag() error
ForceCommit()
Close() error
}
type Snapshot interface {
// Size gets the size of the snapshot.
Size() int64
// WriteTo writes the snapshot into the given writer.
WriteTo(w io.Writer) (n int64, err error)
// Close closes the snapshot.
Close() error
}
type backend struct {
// size and commits are used with atomic operations so they must be
// 64-bit aligned, otherwise 32-bit tests will crash
// size is the number of bytes allocated in the backend
size int64
// sizeInUse is the number of bytes actually used in the backend
sizeInUse int64
// commits counts number of commits since start
commits int64
// openReadTxN is the number of currently open read transactions in the backend
openReadTxN int64
mu sync.RWMutex
db *bolt.DB
batchInterval time.Duration
batchLimit int
batchTx *batchTxBuffered
readTx *readTx
stopc chan struct{}
donec chan struct{}
lg *zap.Logger
}
type BackendConfig struct {
// Path is the file path to the backend file.
Path string
// BatchInterval is the maximum time before flushing the BatchTx.
BatchInterval time.Duration
// BatchLimit is the maximum puts before flushing the BatchTx.
BatchLimit int
// BackendFreelistType is the backend boltdb's freelist type.
BackendFreelistType bolt.FreelistType
// MmapSize is the number of bytes to mmap for the backend.
MmapSize uint64
// Logger logs backend-side operations.
Logger *zap.Logger
// UnsafeNoFsync disables all uses of fsync.
UnsafeNoFsync bool `json:"unsafe-no-fsync"`
}
func DefaultBackendConfig() BackendConfig {
return BackendConfig{
BatchInterval: defaultBatchInterval,
BatchLimit: defaultBatchLimit,
MmapSize: initialMmapSize,
}
}
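// A minimal usage sketch of the config above (illustrative only; the path is a
// placeholder): tune the flush policy, then open the backend.
//
//	bcfg := DefaultBackendConfig()
//	bcfg.Path = "/var/lib/etcd/member/snap/db" // placeholder path
//	bcfg.BatchInterval = 50 * time.Millisecond // flush pending writes at least every 50ms
//	bcfg.BatchLimit = 5000                     // ...or as soon as 5000 puts are pending
//	b := New(bcfg)
//	defer b.Close()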
func New(bcfg BackendConfig) Backend {
return newBackend(bcfg)
}
func NewDefaultBackend(path string) Backend {
bcfg := DefaultBackendConfig()
bcfg.Path = path
return newBackend(bcfg)
}
func newBackend(bcfg BackendConfig) *backend {
if bcfg.Logger == nil {
bcfg.Logger = zap.NewNop()
}
bopts := &bolt.Options{}
if boltOpenOptions != nil {
*bopts = *boltOpenOptions
}
bopts.InitialMmapSize = bcfg.mmapSize()
bopts.FreelistType = bcfg.BackendFreelistType
bopts.NoSync = bcfg.UnsafeNoFsync
bopts.NoGrowSync = bcfg.UnsafeNoFsync
db, err := bolt.Open(bcfg.Path, 0600, bopts)
if err != nil {
bcfg.Logger.Panic("failed to open database", zap.String("path", bcfg.Path), zap.Error(err))
}
// In the future, we may want to make buffering optional for low-concurrency systems
// or dynamically swap between buffered/non-buffered depending on workload.
b := &backend{
db: db,
batchInterval: bcfg.BatchInterval,
batchLimit: bcfg.BatchLimit,
readTx: &readTx{
baseReadTx: baseReadTx{
buf: txReadBuffer{
txBuffer: txBuffer{make(map[string]*bucketBuffer)},
},
buckets: make(map[string]*bolt.Bucket),
txWg: new(sync.WaitGroup),
txMu: new(sync.RWMutex),
},
},
stopc: make(chan struct{}),
donec: make(chan struct{}),
lg: bcfg.Logger,
}
b.batchTx = newBatchTxBuffered(b)
go b.run()
return b
}
// BatchTx returns the current batch tx in coalescer. The tx can be used for read and
// write operations. The write result can be retrieved within the same tx immediately.
// The write result is isolated from other txs until the current one gets committed.
func (b *backend) BatchTx() BatchTx {
return b.batchTx
}
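// A minimal write-path sketch (bucket and key names are placeholders): all
// Unsafe* calls must happen while holding the tx lock.
//
//	tx := b.BatchTx()
//	tx.Lock()
//	tx.UnsafeCreateBucket([]byte("test"))
//	tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
//	tx.Unlock()     // may auto-commit once batchLimit pending writes accumulate
//	b.ForceCommit() // or force the batch into boltdb immediately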
func (b *backend) ReadTx() ReadTx { return b.readTx }
// ConcurrentReadTx creates and returns a new ReadTx, which:
// A) creates and keeps a copy of backend.readTx.txReadBuffer,
// B) references the boltdb read Tx (and its bucket cache) of current batch interval.
func (b *backend) ConcurrentReadTx() ReadTx {
b.readTx.RLock()
defer b.readTx.RUnlock()
// prevent boltdb read Tx from being rolled back until store read Tx is done. Needs to be called when holding readTx.RLock().
b.readTx.txWg.Add(1)
// TODO: might want to copy the read buffer lazily - create copy when A) end of a write transaction B) end of a batch interval.
return &concurrentReadTx{
baseReadTx: baseReadTx{
buf: b.readTx.buf.unsafeCopy(),
txMu: b.readTx.txMu,
tx: b.readTx.tx,
buckets: b.readTx.buckets,
txWg: b.readTx.txWg,
},
}
}
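// A minimal read-path sketch (names are placeholders): the returned tx pins the
// current boltdb read tx via txWg, so RUnlock must be called to release it.
//
//	rtx := b.ConcurrentReadTx()
//	rtx.RLock() // no-op; kept for ReadTx interface symmetry
//	keys, vals := rtx.UnsafeRange([]byte("key"), []byte("foo"), nil, 0)
//	rtx.RUnlock() // signals txWg so the boltdb tx can eventually be rolled back
//	_, _ = keys, vals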
// ForceCommit forces the current batching tx to commit.
func (b *backend) ForceCommit() {
b.batchTx.Commit()
}
func (b *backend) Snapshot() Snapshot {
b.batchTx.Commit()
b.mu.RLock()
defer b.mu.RUnlock()
tx, err := b.db.Begin(false)
if err != nil {
b.lg.Fatal("failed to begin tx", zap.Error(err))
}
stopc, donec := make(chan struct{}), make(chan struct{})
dbBytes := tx.Size()
go func() {
defer close(donec)
// sendRateBytes is based on transferring snapshot data over a 1 gigabit/s connection
// assuming a min tcp throughput of 100MB/s.
var sendRateBytes int64 = 100 * 1024 * 1024
warningTimeout := time.Duration(int64((float64(dbBytes) / float64(sendRateBytes)) * float64(time.Second)))
if warningTimeout < minSnapshotWarningTimeout {
warningTimeout = minSnapshotWarningTimeout
}
start := time.Now()
ticker := time.NewTicker(warningTimeout)
defer ticker.Stop()
for {
select {
case <-ticker.C:
b.lg.Warn(
"snapshotting taking too long to transfer",
zap.Duration("taking", time.Since(start)),
zap.Int64("bytes", dbBytes),
zap.String("size", humanize.Bytes(uint64(dbBytes))),
)
case <-stopc:
snapshotTransferSec.Observe(time.Since(start).Seconds())
return
}
}
}()
return &snapshot{tx, stopc, donec}
}
type IgnoreKey struct {
Bucket string
Key string
}
func (b *backend) Hash(ignores map[IgnoreKey]struct{}) (uint32, error) {
h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
b.mu.RLock()
defer b.mu.RUnlock()
err := b.db.View(func(tx *bolt.Tx) error {
c := tx.Cursor()
for next, _ := c.First(); next != nil; next, _ = c.Next() {
b := tx.Bucket(next)
if b == nil {
return fmt.Errorf("cannot get hash of bucket %s", string(next))
}
h.Write(next)
b.ForEach(func(k, v []byte) error {
bk := IgnoreKey{Bucket: string(next), Key: string(k)}
if _, ok := ignores[bk]; !ok {
h.Write(k)
h.Write(v)
}
return nil
})
}
return nil
})
if err != nil {
return 0, err
}
return h.Sum32(), nil
}
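// A sketch of skipping a key while hashing; the bucket/key names below are
// illustrative assumptions, not prescribed by this file. Passing a nil map
// hashes every key.
//
//	ignores := map[IgnoreKey]struct{}{
//		{Bucket: "meta", Key: "consistent_index"}: {},
//	}
//	h, err := b.Hash(ignores)
//	_, _ = h, err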
func (b *backend) Size() int64 {
return atomic.LoadInt64(&b.size)
}
func (b *backend) SizeInUse() int64 {
return atomic.LoadInt64(&b.sizeInUse)
}
func (b *backend) run() {
defer close(b.donec)
t := time.NewTimer(b.batchInterval)
defer t.Stop()
for {
select {
case <-t.C:
case <-b.stopc:
b.batchTx.CommitAndStop()
return
}
if b.batchTx.safePending() != 0 {
b.batchTx.Commit()
}
t.Reset(b.batchInterval)
}
}
func (b *backend) Close() error {
close(b.stopc)
<-b.donec
return b.db.Close()
}
// Commits returns total number of commits since start
func (b *backend) Commits() int64 {
return atomic.LoadInt64(&b.commits)
}
func (b *backend) Defrag() error {
return b.defrag()
}
func (b *backend) defrag() error {
now := time.Now()
// TODO: make this non-blocking?
// lock batchTx to ensure nobody is using previous tx, and then
// close previous ongoing tx.
b.batchTx.Lock()
defer b.batchTx.Unlock()
// lock database after lock tx to avoid deadlock.
b.mu.Lock()
defer b.mu.Unlock()
// block concurrent read requests while resetting tx
b.readTx.Lock()
defer b.readTx.Unlock()
b.batchTx.unsafeCommit(true)
b.batchTx.tx = nil
// Create a temporary file to ensure we start with a clean slate.
// Snapshotter.cleanupSnapdir cleans up any of these that are found during startup.
dir := filepath.Dir(b.db.Path())
temp, err := ioutil.TempFile(dir, "db.tmp.*")
if err != nil {
return err
}
options := bolt.Options{}
if boltOpenOptions != nil {
options = *boltOpenOptions
}
options.OpenFile = func(path string, i int, mode os.FileMode) (file *os.File, err error) {
return temp, nil
}
tdbp := temp.Name()
tmpdb, err := bolt.Open(tdbp, 0600, &options)
if err != nil {
return err
}
dbp := b.db.Path()
size1, sizeInUse1 := b.Size(), b.SizeInUse()
if b.lg != nil {
b.lg.Info(
"defragmenting",
zap.String("path", dbp),
zap.Int64("current-db-size-bytes", size1),
zap.String("current-db-size", humanize.Bytes(uint64(size1))),
zap.Int64("current-db-size-in-use-bytes", sizeInUse1),
zap.String("current-db-size-in-use", humanize.Bytes(uint64(sizeInUse1))),
)
}
// gofail: var defragBeforeCopy struct{}
err = defragdb(b.db, tmpdb, defragLimit)
if err != nil {
tmpdb.Close()
if rmErr := os.RemoveAll(tmpdb.Path()); rmErr != nil {
b.lg.Error("failed to remove db.tmp after defragmentation completed", zap.Error(rmErr))
}
return err
}
err = b.db.Close()
if err != nil {
b.lg.Fatal("failed to close database", zap.Error(err))
}
err = tmpdb.Close()
if err != nil {
b.lg.Fatal("failed to close tmp database", zap.Error(err))
}
// gofail: var defragBeforeRename struct{}
err = os.Rename(tdbp, dbp)
if err != nil {
b.lg.Fatal("failed to rename tmp database", zap.Error(err))
}
b.db, err = bolt.Open(dbp, 0600, boltOpenOptions)
if err != nil {
b.lg.Fatal("failed to open database", zap.String("path", dbp), zap.Error(err))
}
b.batchTx.tx = b.unsafeBegin(true)
b.readTx.reset()
b.readTx.tx = b.unsafeBegin(false)
size := b.readTx.tx.Size()
db := b.readTx.tx.DB()
atomic.StoreInt64(&b.size, size)
atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize)))
took := time.Since(now)
defragSec.Observe(took.Seconds())
size2, sizeInUse2 := b.Size(), b.SizeInUse()
if b.lg != nil {
b.lg.Info(
"defragmented",
zap.String("path", dbp),
zap.Int64("current-db-size-bytes-diff", size2-size1),
zap.Int64("current-db-size-bytes", size2),
zap.String("current-db-size", humanize.Bytes(uint64(size2))),
zap.Int64("current-db-size-in-use-bytes-diff", sizeInUse2-sizeInUse1),
zap.Int64("current-db-size-in-use-bytes", sizeInUse2),
zap.String("current-db-size-in-use", humanize.Bytes(uint64(sizeInUse2))),
zap.Duration("took", took),
)
}
return nil
}
func defragdb(odb, tmpdb *bolt.DB, limit int) error {
// open a tx on tmpdb for writes
tmptx, err := tmpdb.Begin(true)
if err != nil {
return err
}
defer func() {
if err != nil {
tmptx.Rollback()
}
}()
// open a tx on old db for read
tx, err := odb.Begin(false)
if err != nil {
return err
}
defer tx.Rollback()
c := tx.Cursor()
count := 0
for next, _ := c.First(); next != nil; next, _ = c.Next() {
b := tx.Bucket(next)
if b == nil {
return fmt.Errorf("backend: cannot defrag bucket %s", string(next))
}
tmpb, berr := tmptx.CreateBucketIfNotExists(next)
if berr != nil {
return berr
}
tmpb.FillPercent = 0.9 // for sequential write in the ForEach below
if err = b.ForEach(func(k, v []byte) error {
count++
if count > limit {
err = tmptx.Commit()
if err != nil {
return err
}
tmptx, err = tmpdb.Begin(true)
if err != nil {
return err
}
tmpb = tmptx.Bucket(next)
tmpb.FillPercent = 0.9 // for sequential write in the ForEach
count = 0
}
return tmpb.Put(k, v)
}); err != nil {
return err
}
}
return tmptx.Commit()
}
func (b *backend) begin(write bool) *bolt.Tx {
b.mu.RLock()
tx := b.unsafeBegin(write)
b.mu.RUnlock()
size := tx.Size()
db := tx.DB()
stats := db.Stats()
atomic.StoreInt64(&b.size, size)
atomic.StoreInt64(&b.sizeInUse, size-(int64(stats.FreePageN)*int64(db.Info().PageSize)))
atomic.StoreInt64(&b.openReadTxN, int64(stats.OpenTxN))
return tx
}
func (b *backend) unsafeBegin(write bool) *bolt.Tx {
tx, err := b.db.Begin(write)
if err != nil {
b.lg.Fatal("failed to begin tx", zap.Error(err))
}
return tx
}
func (b *backend) OpenReadTxN() int64 {
return atomic.LoadInt64(&b.openReadTxN)
}
// NewTmpBackend creates a backend implementation for testing.
func NewTmpBackend(batchInterval time.Duration, batchLimit int) (*backend, string) {
dir, err := ioutil.TempDir(os.TempDir(), "etcd_backend_test")
if err != nil {
panic(err)
}
tmpPath := filepath.Join(dir, "database")
bcfg := DefaultBackendConfig()
bcfg.Path, bcfg.BatchInterval, bcfg.BatchLimit = tmpPath, batchInterval, batchLimit
return newBackend(bcfg), tmpPath
}
func NewDefaultTmpBackend() (*backend, string) {
return NewTmpBackend(defaultBatchInterval, defaultBatchLimit)
}
type snapshot struct {
*bolt.Tx
stopc chan struct{}
donec chan struct{}
}
func (s *snapshot) Close() error {
close(s.stopc)
<-s.donec
return s.Tx.Rollback()
}


@@ -0,0 +1,50 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"crypto/rand"
"os"
"testing"
"time"
)
func BenchmarkBackendPut(b *testing.B) {
backend, tmppath := NewTmpBackend(100*time.Millisecond, 10000)
defer backend.Close()
defer os.Remove(tmppath)
// prepare keys
keys := make([][]byte, b.N)
for i := 0; i < b.N; i++ {
keys[i] = make([]byte, 64)
rand.Read(keys[i])
}
value := make([]byte, 128)
rand.Read(value)
batchTx := backend.BatchTx()
batchTx.Lock()
batchTx.UnsafeCreateBucket([]byte("test"))
batchTx.Unlock()
b.ResetTimer()
for i := 0; i < b.N; i++ {
batchTx.Lock()
batchTx.UnsafePut([]byte("test"), keys[i], value)
batchTx.Unlock()
}
}


@@ -0,0 +1,335 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"fmt"
"io/ioutil"
"os"
"reflect"
"testing"
"time"
bolt "go.etcd.io/bbolt"
)
func TestBackendClose(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer os.Remove(tmpPath)
// check close could work
done := make(chan struct{})
go func() {
err := b.Close()
if err != nil {
t.Errorf("close error = %v, want nil", err)
}
done <- struct{}{}
}()
select {
case <-done:
case <-time.After(10 * time.Second):
t.Errorf("failed to close database in 10s")
}
}
func TestBackendSnapshot(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
b.ForceCommit()
// write snapshot to a new file
f, err := ioutil.TempFile(os.TempDir(), "etcd_backend_test")
if err != nil {
t.Fatal(err)
}
snap := b.Snapshot()
defer snap.Close()
if _, err := snap.WriteTo(f); err != nil {
t.Fatal(err)
}
f.Close()
// bootstrap new backend from the snapshot
bcfg := DefaultBackendConfig()
bcfg.Path, bcfg.BatchInterval, bcfg.BatchLimit = f.Name(), time.Hour, 10000
nb := New(bcfg)
defer cleanup(nb, f.Name())
newTx := nb.BatchTx()
newTx.Lock()
ks, _ := newTx.UnsafeRange([]byte("test"), []byte("foo"), []byte("goo"), 0)
if len(ks) != 1 {
t.Errorf("len(kvs) = %d, want 1", len(ks))
}
newTx.Unlock()
}
func TestBackendBatchIntervalCommit(t *testing.T) {
// start the backend with a super short batch interval so
// we do not need to wait long for the commit to happen.
b, tmpPath := NewTmpBackend(time.Nanosecond, 10000)
defer cleanup(b, tmpPath)
pc := b.Commits()
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
for i := 0; i < 10; i++ {
if b.Commits() >= pc+1 {
break
}
time.Sleep(time.Duration(i*100) * time.Millisecond)
}
// check whether put happens via db view
b.db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket([]byte("test"))
if bucket == nil {
t.Errorf("bucket test does not exit")
return nil
}
v := bucket.Get([]byte("foo"))
if v == nil {
t.Errorf("foo key failed to written in backend")
}
return nil
})
}
func TestBackendDefrag(t *testing.T) {
b, tmpPath := NewDefaultTmpBackend()
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
for i := 0; i < defragLimit+100; i++ {
tx.UnsafePut([]byte("test"), []byte(fmt.Sprintf("foo_%d", i)), []byte("bar"))
}
tx.Unlock()
b.ForceCommit()
// remove some keys to ensure the disk space will be reclaimed after defrag
tx = b.BatchTx()
tx.Lock()
for i := 0; i < 50; i++ {
tx.UnsafeDelete([]byte("test"), []byte(fmt.Sprintf("foo_%d", i)))
}
tx.Unlock()
b.ForceCommit()
size := b.Size()
// shrink and check hash
oh, err := b.Hash(nil)
if err != nil {
t.Fatal(err)
}
err = b.Defrag()
if err != nil {
t.Fatal(err)
}
nh, err := b.Hash(nil)
if err != nil {
t.Fatal(err)
}
if oh != nh {
t.Errorf("hash = %v, want %v", nh, oh)
}
nsize := b.Size()
if nsize >= size {
t.Errorf("new size = %v, want < %d", nsize, size)
}
// try putting more keys after the shrink.
tx = b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("more"), []byte("bar"))
tx.Unlock()
b.ForceCommit()
}
// TestBackendWriteback ensures writes are stored to the read txn on write txn unlock.
func TestBackendWriteback(t *testing.T) {
b, tmpPath := NewDefaultTmpBackend()
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
tx.UnsafePut([]byte("key"), []byte("abc"), []byte("bar"))
tx.UnsafePut([]byte("key"), []byte("def"), []byte("baz"))
tx.UnsafePut([]byte("key"), []byte("overwrite"), []byte("1"))
tx.Unlock()
// overwrites should be propagated too
tx.Lock()
tx.UnsafePut([]byte("key"), []byte("overwrite"), []byte("2"))
tx.Unlock()
keys := []struct {
key []byte
end []byte
limit int64
wkey [][]byte
wval [][]byte
}{
{
key: []byte("abc"),
end: nil,
wkey: [][]byte{[]byte("abc")},
wval: [][]byte{[]byte("bar")},
},
{
key: []byte("abc"),
end: []byte("def"),
wkey: [][]byte{[]byte("abc")},
wval: [][]byte{[]byte("bar")},
},
{
key: []byte("abc"),
end: []byte("deg"),
wkey: [][]byte{[]byte("abc"), []byte("def")},
wval: [][]byte{[]byte("bar"), []byte("baz")},
},
{
key: []byte("abc"),
end: []byte("\xff"),
limit: 1,
wkey: [][]byte{[]byte("abc")},
wval: [][]byte{[]byte("bar")},
},
{
key: []byte("abc"),
end: []byte("\xff"),
wkey: [][]byte{[]byte("abc"), []byte("def"), []byte("overwrite")},
wval: [][]byte{[]byte("bar"), []byte("baz"), []byte("2")},
},
}
rtx := b.ReadTx()
for i, tt := range keys {
rtx.RLock()
k, v := rtx.UnsafeRange([]byte("key"), tt.key, tt.end, tt.limit)
rtx.RUnlock()
if !reflect.DeepEqual(tt.wkey, k) || !reflect.DeepEqual(tt.wval, v) {
t.Errorf("#%d: want k=%+v, v=%+v; got k=%+v, v=%+v", i, tt.wkey, tt.wval, k, v)
}
}
}
// TestConcurrentReadTx ensures that the current read transaction can see all prior writes stored in the read buffer
func TestConcurrentReadTx(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
wtx1 := b.BatchTx()
wtx1.Lock()
wtx1.UnsafeCreateBucket([]byte("key"))
wtx1.UnsafePut([]byte("key"), []byte("abc"), []byte("ABC"))
wtx1.UnsafePut([]byte("key"), []byte("overwrite"), []byte("1"))
wtx1.Unlock()
wtx2 := b.BatchTx()
wtx2.Lock()
wtx2.UnsafePut([]byte("key"), []byte("def"), []byte("DEF"))
wtx2.UnsafePut([]byte("key"), []byte("overwrite"), []byte("2"))
wtx2.Unlock()
rtx := b.ConcurrentReadTx()
rtx.RLock() // no-op
k, v := rtx.UnsafeRange([]byte("key"), []byte("abc"), []byte("\xff"), 0)
rtx.RUnlock()
wKey := [][]byte{[]byte("abc"), []byte("def"), []byte("overwrite")}
wVal := [][]byte{[]byte("ABC"), []byte("DEF"), []byte("2")}
if !reflect.DeepEqual(wKey, k) || !reflect.DeepEqual(wVal, v) {
t.Errorf("want k=%+v, v=%+v; got k=%+v, v=%+v", wKey, wVal, k, v)
}
}
// TestBackendWritebackForEach checks that partially written / buffered
// data is visited in the same order as fully committed data.
func TestBackendWritebackForEach(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
for i := 0; i < 5; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()
// writeback
b.ForceCommit()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
for i := 5; i < 20; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()
seq := ""
getSeq := func(k, v []byte) error {
seq += string(k)
return nil
}
rtx := b.ReadTx()
rtx.RLock()
rtx.UnsafeForEach([]byte("key"), getSeq)
rtx.RUnlock()
partialSeq := seq
seq = ""
b.ForceCommit()
tx.Lock()
tx.UnsafeForEach([]byte("key"), getSeq)
tx.Unlock()
if seq != partialSeq {
t.Fatalf("expected %q, got %q", seq, partialSeq)
}
}
func cleanup(b Backend, path string) {
b.Close()
os.Remove(path)
}


@@ -0,0 +1,307 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"bytes"
"math"
"sync"
"sync/atomic"
"time"
bolt "go.etcd.io/bbolt"
"go.uber.org/zap"
)
type BatchTx interface {
ReadTx
UnsafeCreateBucket(name []byte)
UnsafePut(bucketName []byte, key []byte, value []byte)
UnsafeSeqPut(bucketName []byte, key []byte, value []byte)
UnsafeDelete(bucketName []byte, key []byte)
// Commit commits a previous tx and begins a new writable one.
Commit()
// CommitAndStop commits the previous tx and does not create a new one.
CommitAndStop()
}
type batchTx struct {
sync.Mutex
tx *bolt.Tx
backend *backend
pending int
}
func (t *batchTx) Lock() {
t.Mutex.Lock()
}
func (t *batchTx) Unlock() {
if t.pending >= t.backend.batchLimit {
t.commit(false)
}
t.Mutex.Unlock()
}
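// Sketch of the coalescing rule above: with batchLimit == 1, a single put is
// already committed by the time Unlock returns (see TestBatchTxBatchLimitCommit
// in the tests).
//
//	tx.Lock()
//	tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar")) // pending == 1
//	tx.Unlock() // pending >= batchLimit, so commit(false) runs here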
// BatchTx interface embeds ReadTx interface. But RLock() and RUnlock() do not
// have appropriate semantics in the BatchTx interface, and therefore should not be called.
// TODO: might want to decouple ReadTx and BatchTx
func (t *batchTx) RLock() {
panic("unexpected RLock")
}
func (t *batchTx) RUnlock() {
panic("unexpected RUnlock")
}
func (t *batchTx) UnsafeCreateBucket(name []byte) {
_, err := t.tx.CreateBucket(name)
if err != nil && err != bolt.ErrBucketExists {
t.backend.lg.Fatal(
"failed to create a bucket",
zap.String("bucket-name", string(name)),
zap.Error(err),
)
}
t.pending++
}
// UnsafePut must be called holding the lock on the tx.
func (t *batchTx) UnsafePut(bucketName []byte, key []byte, value []byte) {
t.unsafePut(bucketName, key, value, false)
}
// UnsafeSeqPut must be called holding the lock on the tx.
func (t *batchTx) UnsafeSeqPut(bucketName []byte, key []byte, value []byte) {
t.unsafePut(bucketName, key, value, true)
}
func (t *batchTx) unsafePut(bucketName []byte, key []byte, value []byte, seq bool) {
bucket := t.tx.Bucket(bucketName)
if bucket == nil {
t.backend.lg.Fatal(
"failed to find a bucket",
zap.String("bucket-name", string(bucketName)),
)
}
if seq {
// it is useful to increase fill percent when the workloads are mostly append-only.
// this can delay the page split and reduce space usage.
bucket.FillPercent = 0.9
}
if err := bucket.Put(key, value); err != nil {
t.backend.lg.Fatal(
"failed to write to a bucket",
zap.String("bucket-name", string(bucketName)),
zap.Error(err),
)
}
t.pending++
}
// UnsafeRange must be called holding the lock on the tx.
func (t *batchTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
bucket := t.tx.Bucket(bucketName)
if bucket == nil {
t.backend.lg.Fatal(
"failed to find a bucket",
zap.String("bucket-name", string(bucketName)),
)
}
return unsafeRange(bucket.Cursor(), key, endKey, limit)
}
func unsafeRange(c *bolt.Cursor, key, endKey []byte, limit int64) (keys [][]byte, vs [][]byte) {
if limit <= 0 {
limit = math.MaxInt64
}
var isMatch func(b []byte) bool
if len(endKey) > 0 {
isMatch = func(b []byte) bool { return bytes.Compare(b, endKey) < 0 }
} else {
isMatch = func(b []byte) bool { return bytes.Equal(b, key) }
limit = 1
}
for ck, cv := c.Seek(key); ck != nil && isMatch(ck); ck, cv = c.Next() {
vs = append(vs, cv)
keys = append(keys, ck)
if limit == int64(len(keys)) {
break
}
}
return keys, vs
}
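// Worked example of the semantics above, assuming the cursor's bucket holds
// keys a, b, c: an empty endKey means exact match only, so (key=a, endKey=nil)
// yields just a; a non-empty endKey is half-open, so (key=a, endKey=c) yields
// a and b; a positive limit truncates the scan after that many keys.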
// UnsafeDelete must be called holding the lock on the tx.
func (t *batchTx) UnsafeDelete(bucketName []byte, key []byte) {
bucket := t.tx.Bucket(bucketName)
if bucket == nil {
t.backend.lg.Fatal(
"failed to find a bucket",
zap.String("bucket-name", string(bucketName)),
)
}
err := bucket.Delete(key)
if err != nil {
t.backend.lg.Fatal(
"failed to delete a key",
zap.String("bucket-name", string(bucketName)),
zap.Error(err),
)
}
t.pending++
}
// UnsafeForEach must be called holding the lock on the tx.
func (t *batchTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
return unsafeForEach(t.tx, bucketName, visitor)
}
func unsafeForEach(tx *bolt.Tx, bucket []byte, visitor func(k, v []byte) error) error {
if b := tx.Bucket(bucket); b != nil {
return b.ForEach(visitor)
}
return nil
}
// Commit commits a previous tx and begins a new writable one.
func (t *batchTx) Commit() {
t.Lock()
t.commit(false)
t.Unlock()
}
// CommitAndStop commits the previous tx and does not create a new one.
func (t *batchTx) CommitAndStop() {
t.Lock()
t.commit(true)
t.Unlock()
}
func (t *batchTx) safePending() int {
t.Mutex.Lock()
defer t.Mutex.Unlock()
return t.pending
}
func (t *batchTx) commit(stop bool) {
// commit the last tx
if t.tx != nil {
if t.pending == 0 && !stop {
return
}
start := time.Now()
// gofail: var beforeCommit struct{}
err := t.tx.Commit()
// gofail: var afterCommit struct{}
rebalanceSec.Observe(t.tx.Stats().RebalanceTime.Seconds())
spillSec.Observe(t.tx.Stats().SpillTime.Seconds())
writeSec.Observe(t.tx.Stats().WriteTime.Seconds())
commitSec.Observe(time.Since(start).Seconds())
atomic.AddInt64(&t.backend.commits, 1)
t.pending = 0
if err != nil {
t.backend.lg.Fatal("failed to commit tx", zap.Error(err))
}
}
if !stop {
t.tx = t.backend.begin(true)
}
}
type batchTxBuffered struct {
batchTx
buf txWriteBuffer
}
func newBatchTxBuffered(backend *backend) *batchTxBuffered {
tx := &batchTxBuffered{
batchTx: batchTx{backend: backend},
buf: txWriteBuffer{
txBuffer: txBuffer{make(map[string]*bucketBuffer)},
seq: true,
},
}
tx.Commit()
return tx
}
func (t *batchTxBuffered) Unlock() {
if t.pending != 0 {
t.backend.readTx.Lock() // blocks txReadBuffer for writing.
t.buf.writeback(&t.backend.readTx.buf)
t.backend.readTx.Unlock()
if t.pending >= t.backend.batchLimit {
t.commit(false)
}
}
t.batchTx.Unlock()
}
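// Sketch of the visibility guarantee above: a put becomes visible to ReadTx
// readers as soon as the write lock is released, even if the boltdb commit has
// not happened yet (see TestBackendWriteback in the tests).
//
//	tx.Lock()
//	tx.UnsafePut([]byte("key"), []byte("abc"), []byte("bar"))
//	tx.Unlock() // writeback: readers now observe abc=bar from the read buffer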
func (t *batchTxBuffered) Commit() {
t.Lock()
t.commit(false)
t.Unlock()
}
func (t *batchTxBuffered) CommitAndStop() {
t.Lock()
t.commit(true)
t.Unlock()
}
func (t *batchTxBuffered) commit(stop bool) {
// all read txs must be closed to acquire boltdb commit rwlock
t.backend.readTx.Lock()
t.unsafeCommit(stop)
t.backend.readTx.Unlock()
}
func (t *batchTxBuffered) unsafeCommit(stop bool) {
if t.backend.readTx.tx != nil {
// wait all store read transactions using the current boltdb tx to finish,
// then close the boltdb tx
go func(tx *bolt.Tx, wg *sync.WaitGroup) {
wg.Wait()
if err := tx.Rollback(); err != nil {
t.backend.lg.Fatal("failed to rollback tx", zap.Error(err))
}
}(t.backend.readTx.tx, t.backend.readTx.txWg)
t.backend.readTx.reset()
}
t.batchTx.commit(stop)
if !stop {
t.backend.readTx.tx = t.backend.begin(false)
}
}
func (t *batchTxBuffered) UnsafePut(bucketName []byte, key []byte, value []byte) {
t.batchTx.UnsafePut(bucketName, key, value)
t.buf.put(bucketName, key, value)
}
func (t *batchTxBuffered) UnsafeSeqPut(bucketName []byte, key []byte, value []byte) {
t.batchTx.UnsafeSeqPut(bucketName, key, value)
t.buf.putSeq(bucketName, key, value)
}


@@ -0,0 +1,197 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"reflect"
"testing"
"time"
bolt "go.etcd.io/bbolt"
)
func TestBatchTxPut(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
defer tx.Unlock()
// create bucket
tx.UnsafeCreateBucket([]byte("test"))
// put
v := []byte("bar")
tx.UnsafePut([]byte("test"), []byte("foo"), v)
// check put result before and after tx is committed
for k := 0; k < 2; k++ {
_, gv := tx.UnsafeRange([]byte("test"), []byte("foo"), nil, 0)
if !reflect.DeepEqual(gv[0], v) {
t.Errorf("v = %s, want %s", string(gv[0]), string(v))
}
tx.commit(false)
}
}
func TestBatchTxRange(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
defer tx.Unlock()
tx.UnsafeCreateBucket([]byte("test"))
// put keys
allKeys := [][]byte{[]byte("foo"), []byte("foo1"), []byte("foo2")}
allVals := [][]byte{[]byte("bar"), []byte("bar1"), []byte("bar2")}
for i := range allKeys {
tx.UnsafePut([]byte("test"), allKeys[i], allVals[i])
}
tests := []struct {
key []byte
endKey []byte
limit int64
wkeys [][]byte
wvals [][]byte
}{
// single key
{
[]byte("foo"), nil, 0,
allKeys[:1], allVals[:1],
},
// single key, bad
{
[]byte("doo"), nil, 0,
nil, nil,
},
// key range
{
[]byte("foo"), []byte("foo1"), 0,
allKeys[:1], allVals[:1],
},
// key range, get all keys
{
[]byte("foo"), []byte("foo3"), 0,
allKeys, allVals,
},
// key range, bad
{
[]byte("goo"), []byte("goo3"), 0,
nil, nil,
},
// key range with effective limit
{
[]byte("foo"), []byte("foo3"), 1,
allKeys[:1], allVals[:1],
},
// key range with limit
{
[]byte("foo"), []byte("foo3"), 4,
allKeys, allVals,
},
}
for i, tt := range tests {
keys, vals := tx.UnsafeRange([]byte("test"), tt.key, tt.endKey, tt.limit)
if !reflect.DeepEqual(keys, tt.wkeys) {
t.Errorf("#%d: keys = %+v, want %+v", i, keys, tt.wkeys)
}
if !reflect.DeepEqual(vals, tt.wvals) {
t.Errorf("#%d: vals = %+v, want %+v", i, vals, tt.wvals)
}
}
}
func TestBatchTxDelete(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
defer tx.Unlock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.UnsafeDelete([]byte("test"), []byte("foo"))
// check delete result before and after tx is committed
for k := 0; k < 2; k++ {
ks, _ := tx.UnsafeRange([]byte("test"), []byte("foo"), nil, 0)
if len(ks) != 0 {
t.Errorf("keys on foo = %v, want nil", ks)
}
tx.commit(false)
}
}
func TestBatchTxCommit(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
tx.Commit()
// check whether put happens via db view
b.db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket([]byte("test"))
if bucket == nil {
t.Errorf("bucket test does not exit")
return nil
}
v := bucket.Get([]byte("foo"))
if v == nil {
t.Errorf("foo key failed to written in backend")
}
return nil
})
}
func TestBatchTxBatchLimitCommit(t *testing.T) {
// start backend with batch limit 1 so one write can
// trigger a commit
b, tmpPath := NewTmpBackend(time.Hour, 1)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
// batch limit commit should have been triggered
// check whether put happens via db view
b.db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket([]byte("test"))
if bucket == nil {
t.Errorf("bucket test does not exit")
return nil
}
v := bucket.Get([]byte("foo"))
if v == nil {
t.Errorf("foo key failed to written in backend")
}
return nil
})
}


@@ -0,0 +1,23 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !linux,!windows
package backend
import bolt "go.etcd.io/bbolt"
var boltOpenOptions *bolt.Options
func (bcfg *BackendConfig) mmapSize() int { return int(bcfg.MmapSize) }


@@ -0,0 +1,34 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"syscall"
bolt "go.etcd.io/bbolt"
)
// syscall.MAP_POPULATE on linux 2.6.23+ does sequential read-ahead
// which can speed up entire-database read with boltdb. We want to
// enable MAP_POPULATE for faster key-value store recovery in storage
// package. If your kernel version is lower than 2.6.23
// (https://github.com/torvalds/linux/releases/tag/v2.6.23), mmap might
// silently ignore this flag. Please update your kernel to prevent this.
var boltOpenOptions = &bolt.Options{
MmapFlags: syscall.MAP_POPULATE,
NoFreelistSync: true,
}
func (bcfg *BackendConfig) mmapSize() int { return int(bcfg.MmapSize) }


@@ -0,0 +1,26 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build windows
package backend
import bolt "go.etcd.io/bbolt"
var boltOpenOptions *bolt.Options = nil
// setting mmap size != 0 on windows will allocate the entire
// mmap size for the file, instead of growing it. So, force 0.
func (bcfg *BackendConfig) mmapSize() int { return 0 }


@@ -0,0 +1,16 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package backend defines a standard interface for etcd's backend MVCC storage.
package backend


@@ -0,0 +1,95 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import "github.com/prometheus/client_golang/prometheus"
var (
commitSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "disk",
Name: "backend_commit_duration_seconds",
Help: "The latency distributions of commit called by backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
rebalanceSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "disk",
Name: "backend_commit_rebalance_duration_seconds",
Help: "The latency distributions of commit.rebalance called by bboltdb backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
spillSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "disk",
Name: "backend_commit_spill_duration_seconds",
Help: "The latency distributions of commit.spill called by bboltdb backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
writeSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "disk",
Name: "backend_commit_write_duration_seconds",
Help: "The latency distributions of commit.write called by bboltdb backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
defragSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "disk",
Name: "backend_defrag_duration_seconds",
Help: "The latency distribution of backend defragmentation.",
// 100 MB usually takes 1 sec, so start with 10 MB of 100 ms
// lowest bucket start of upper bound 0.1 sec (100 ms) with factor 2
// highest bucket start of 0.1 sec * 2^12 == 409.6 sec
Buckets: prometheus.ExponentialBuckets(.1, 2, 13),
})
snapshotTransferSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "disk",
Name: "backend_snapshot_duration_seconds",
Help: "The latency distribution of backend snapshots.",
// lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
// highest bucket start of 0.01 sec * 2^16 == 655.36 sec
Buckets: prometheus.ExponentialBuckets(.01, 2, 17),
})
)
func init() {
prometheus.MustRegister(commitSec)
prometheus.MustRegister(rebalanceSec)
prometheus.MustRegister(spillSec)
prometheus.MustRegister(writeSec)
prometheus.MustRegister(defragSec)
prometheus.MustRegister(snapshotTransferSec)
}


@@ -0,0 +1,153 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"bytes"
"math"
"sync"
bolt "go.etcd.io/bbolt"
)
// safeRangeBucket is a hack to avoid inadvertently reading duplicate keys;
// overwrites on a bucket should only fetch with limit=1, but safeRangeBucket
// is known to never overwrite any key so range is safe.
var safeRangeBucket = []byte("key")
type ReadTx interface {
Lock()
Unlock()
RLock()
RUnlock()
UnsafeRange(bucketName []byte, key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte)
UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error
}
// Base type for readTx and concurrentReadTx to eliminate duplicate functions between them
type baseReadTx struct {
// mu protects accesses to the txReadBuffer
mu sync.RWMutex
buf txReadBuffer
// TODO: group and encapsulate {txMu, tx, buckets, txWg}, as they share the same lifecycle.
// txMu protects accesses to buckets and tx on Range requests.
txMu *sync.RWMutex
tx *bolt.Tx
buckets map[string]*bolt.Bucket
// txWg protects tx from being rolled back at the end of a batch interval until all reads using this tx are done.
txWg *sync.WaitGroup
}
func (baseReadTx *baseReadTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
dups := make(map[string]struct{})
getDups := func(k, v []byte) error {
dups[string(k)] = struct{}{}
return nil
}
visitNoDup := func(k, v []byte) error {
if _, ok := dups[string(k)]; ok {
return nil
}
return visitor(k, v)
}
if err := baseReadTx.buf.ForEach(bucketName, getDups); err != nil {
return err
}
baseReadTx.txMu.Lock()
err := unsafeForEach(baseReadTx.tx, bucketName, visitNoDup)
baseReadTx.txMu.Unlock()
if err != nil {
return err
}
return baseReadTx.buf.ForEach(bucketName, visitor)
}
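// In the method above, buffered (pending) keys are collected first so the
// boltdb pass can skip them, and the buffer pass then visits them last; this
// keeps buffered and committed data in a single consistent visit order
// (see TestBackendWritebackForEach).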
func (baseReadTx *baseReadTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
if endKey == nil {
// forbid duplicates for single keys
limit = 1
}
if limit <= 0 {
limit = math.MaxInt64
}
if limit > 1 && !bytes.Equal(bucketName, safeRangeBucket) {
panic("do not use unsafeRange on non-keys bucket")
}
keys, vals := baseReadTx.buf.Range(bucketName, key, endKey, limit)
if int64(len(keys)) == limit {
return keys, vals
}
// find/cache bucket
bn := string(bucketName)
baseReadTx.txMu.RLock()
bucket, ok := baseReadTx.buckets[bn]
baseReadTx.txMu.RUnlock()
lockHeld := false
if !ok {
baseReadTx.txMu.Lock()
lockHeld = true
bucket = baseReadTx.tx.Bucket(bucketName)
baseReadTx.buckets[bn] = bucket
}
// ignore a missing bucket since it may have been created in this batch
if bucket == nil {
if lockHeld {
baseReadTx.txMu.Unlock()
}
return keys, vals
}
if !lockHeld {
baseReadTx.txMu.Lock()
lockHeld = true
}
c := bucket.Cursor()
baseReadTx.txMu.Unlock()
k2, v2 := unsafeRange(c, key, endKey, limit-int64(len(keys)))
return append(k2, keys...), append(v2, vals...)
}
type readTx struct {
baseReadTx
}
func (rt *readTx) Lock() { rt.mu.Lock() }
func (rt *readTx) Unlock() { rt.mu.Unlock() }
func (rt *readTx) RLock() { rt.mu.RLock() }
func (rt *readTx) RUnlock() { rt.mu.RUnlock() }
func (rt *readTx) reset() {
rt.buf.reset()
rt.buckets = make(map[string]*bolt.Bucket)
rt.tx = nil
rt.txWg = new(sync.WaitGroup)
}
type concurrentReadTx struct {
baseReadTx
}
func (rt *concurrentReadTx) Lock() {}
func (rt *concurrentReadTx) Unlock() {}
// RLock is no-op. concurrentReadTx does not need to be locked after it is created.
func (rt *concurrentReadTx) RLock() {}
// RUnlock signals the end of concurrentReadTx.
func (rt *concurrentReadTx) RUnlock() { rt.txWg.Done() }


@@ -0,0 +1,203 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"bytes"
"sort"
)
// txBuffer handles functionality shared between txWriteBuffer and txReadBuffer.
type txBuffer struct {
buckets map[string]*bucketBuffer
}
func (txb *txBuffer) reset() {
for k, v := range txb.buckets {
if v.used == 0 {
// demote
delete(txb.buckets, k)
}
v.used = 0
}
}
// txWriteBuffer buffers writes of pending updates that have not yet committed.
type txWriteBuffer struct {
txBuffer
seq bool
}
func (txw *txWriteBuffer) put(bucket, k, v []byte) {
txw.seq = false
txw.putSeq(bucket, k, v)
}
func (txw *txWriteBuffer) putSeq(bucket, k, v []byte) {
b, ok := txw.buckets[string(bucket)]
if !ok {
b = newBucketBuffer()
txw.buckets[string(bucket)] = b
}
b.add(k, v)
}
func (txw *txWriteBuffer) writeback(txr *txReadBuffer) {
for k, wb := range txw.buckets {
rb, ok := txr.buckets[k]
if !ok {
delete(txw.buckets, k)
txr.buckets[k] = wb
continue
}
if !txw.seq && wb.used > 1 {
// assume no duplicate keys
sort.Sort(wb)
}
rb.merge(wb)
}
txw.reset()
}
// txReadBuffer accesses buffered updates.
type txReadBuffer struct{ txBuffer }
func (txr *txReadBuffer) Range(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
if b := txr.buckets[string(bucketName)]; b != nil {
return b.Range(key, endKey, limit)
}
return nil, nil
}
func (txr *txReadBuffer) ForEach(bucketName []byte, visitor func(k, v []byte) error) error {
if b := txr.buckets[string(bucketName)]; b != nil {
return b.ForEach(visitor)
}
return nil
}
// unsafeCopy returns a copy of txReadBuffer; the caller should hold backend.readTx.RLock()
func (txr *txReadBuffer) unsafeCopy() txReadBuffer {
txrCopy := txReadBuffer{
txBuffer: txBuffer{
buckets: make(map[string]*bucketBuffer, len(txr.txBuffer.buckets)),
},
}
for bucketName, bucket := range txr.txBuffer.buckets {
txrCopy.txBuffer.buckets[bucketName] = bucket.Copy()
}
return txrCopy
}
type kv struct {
key []byte
val []byte
}
// bucketBuffer buffers key-value pairs that are pending commit.
type bucketBuffer struct {
buf []kv
// used tracks number of elements in use so buf can be reused without reallocation.
used int
}
func newBucketBuffer() *bucketBuffer {
return &bucketBuffer{buf: make([]kv, 512), used: 0}
}
func (bb *bucketBuffer) Range(key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte) {
f := func(i int) bool { return bytes.Compare(bb.buf[i].key, key) >= 0 }
idx := sort.Search(bb.used, f)
if idx < 0 {
return nil, nil
}
if len(endKey) == 0 {
if bytes.Equal(key, bb.buf[idx].key) {
keys = append(keys, bb.buf[idx].key)
vals = append(vals, bb.buf[idx].val)
}
return keys, vals
}
if bytes.Compare(endKey, bb.buf[idx].key) <= 0 {
return nil, nil
}
for i := idx; i < bb.used && int64(len(keys)) < limit; i++ {
if bytes.Compare(endKey, bb.buf[i].key) <= 0 {
break
}
keys = append(keys, bb.buf[i].key)
vals = append(vals, bb.buf[i].val)
}
return keys, vals
}
func (bb *bucketBuffer) ForEach(visitor func(k, v []byte) error) error {
for i := 0; i < bb.used; i++ {
if err := visitor(bb.buf[i].key, bb.buf[i].val); err != nil {
return err
}
}
return nil
}
func (bb *bucketBuffer) add(k, v []byte) {
bb.buf[bb.used].key, bb.buf[bb.used].val = k, v
bb.used++
if bb.used == len(bb.buf) {
buf := make([]kv, (3*len(bb.buf))/2)
copy(buf, bb.buf)
bb.buf = buf
}
}
// merge merges data from bbsrc into bb.
func (bb *bucketBuffer) merge(bbsrc *bucketBuffer) {
for i := 0; i < bbsrc.used; i++ {
bb.add(bbsrc.buf[i].key, bbsrc.buf[i].val)
}
if bb.used == bbsrc.used {
return
}
if bytes.Compare(bb.buf[(bb.used-bbsrc.used)-1].key, bbsrc.buf[0].key) < 0 {
return
}
sort.Stable(bb)
// remove duplicates, using only newest update
widx := 0
for ridx := 1; ridx < bb.used; ridx++ {
if !bytes.Equal(bb.buf[ridx].key, bb.buf[widx].key) {
widx++
}
bb.buf[widx] = bb.buf[ridx]
}
bb.used = widx + 1
}
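// Worked example of the dedup rule above: if bb holds [(a,1), (b,1)] and bbsrc
// holds [(a,2)], the adds yield [(a,1), (b,1), (a,2)], the stable sort yields
// [(a,1), (a,2), (b,1)], and the rewrite pass keeps only the newest update per
// key, leaving [(a,2), (b,1)] with used == 2.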
func (bb *bucketBuffer) Len() int { return bb.used }
func (bb *bucketBuffer) Less(i, j int) bool {
return bytes.Compare(bb.buf[i].key, bb.buf[j].key) < 0
}
func (bb *bucketBuffer) Swap(i, j int) { bb.buf[i], bb.buf[j] = bb.buf[j], bb.buf[i] }
func (bb *bucketBuffer) Copy() *bucketBuffer {
bbCopy := bucketBuffer{
buf: make([]kv, len(bb.buf)),
used: bb.used,
}
copy(bbCopy.buf, bb.buf)
return &bbCopy
}

server/mvcc/doc.go

@@ -0,0 +1,16 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package mvcc defines etcd's stable MVCC storage.
package mvcc

server/mvcc/index.go

@@ -0,0 +1,279 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"sort"
"sync"
"github.com/google/btree"
"go.uber.org/zap"
)
type index interface {
Get(key []byte, atRev int64) (rev, created revision, ver int64, err error)
Range(key, end []byte, atRev int64) ([][]byte, []revision)
Revisions(key, end []byte, atRev int64, limit int) []revision
CountRevisions(key, end []byte, atRev int64, limit int) int
Put(key []byte, rev revision)
Tombstone(key []byte, rev revision) error
RangeSince(key, end []byte, rev int64) []revision
Compact(rev int64) map[revision]struct{}
Keep(rev int64) map[revision]struct{}
Equal(b index) bool
Insert(ki *keyIndex)
KeyIndex(ki *keyIndex) *keyIndex
}
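// A minimal lifecycle sketch, mirroring TestIndexGet later in this commit:
// each Put records a new revision of the key, and Tombstone ends the current
// generation, after which Get at a later rev reports ErrRevisionNotFound.
//
//	ti := newTreeIndex(zap.NewNop())
//	ti.Put([]byte("foo"), revision{main: 2})
//	ti.Put([]byte("foo"), revision{main: 4})
//	ti.Tombstone([]byte("foo"), revision{main: 6})
//	rev, created, ver, err := ti.Get([]byte("foo"), 5)
//	// rev == revision{main: 4}, created == revision{main: 2}, ver == 2, err == nil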
type treeIndex struct {
sync.RWMutex
tree *btree.BTree
lg *zap.Logger
}
func newTreeIndex(lg *zap.Logger) index {
return &treeIndex{
tree: btree.New(32),
lg: lg,
}
}
func (ti *treeIndex) Put(key []byte, rev revision) {
keyi := &keyIndex{key: key}
ti.Lock()
defer ti.Unlock()
item := ti.tree.Get(keyi)
if item == nil {
keyi.put(ti.lg, rev.main, rev.sub)
ti.tree.ReplaceOrInsert(keyi)
return
}
okeyi := item.(*keyIndex)
okeyi.put(ti.lg, rev.main, rev.sub)
}
func (ti *treeIndex) Get(key []byte, atRev int64) (modified, created revision, ver int64, err error) {
keyi := &keyIndex{key: key}
ti.RLock()
defer ti.RUnlock()
if keyi = ti.keyIndex(keyi); keyi == nil {
return revision{}, revision{}, 0, ErrRevisionNotFound
}
return keyi.get(ti.lg, atRev)
}
func (ti *treeIndex) KeyIndex(keyi *keyIndex) *keyIndex {
ti.RLock()
defer ti.RUnlock()
return ti.keyIndex(keyi)
}
func (ti *treeIndex) keyIndex(keyi *keyIndex) *keyIndex {
if item := ti.tree.Get(keyi); item != nil {
return item.(*keyIndex)
}
return nil
}
func (ti *treeIndex) visit(key, end []byte, f func(ki *keyIndex) bool) {
keyi, endi := &keyIndex{key: key}, &keyIndex{key: end}
ti.RLock()
defer ti.RUnlock()
ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
if len(endi.key) > 0 && !item.Less(endi) {
return false
}
if !f(item.(*keyIndex)) {
return false
}
return true
})
}
func (ti *treeIndex) Revisions(key, end []byte, atRev int64, limit int) (revs []revision) {
if end == nil {
rev, _, _, err := ti.Get(key, atRev)
if err != nil {
return nil
}
return []revision{rev}
}
ti.visit(key, end, func(ki *keyIndex) bool {
if rev, _, _, err := ki.get(ti.lg, atRev); err == nil {
revs = append(revs, rev)
if len(revs) == limit {
return false
}
}
return true
})
return revs
}
func (ti *treeIndex) CountRevisions(key, end []byte, atRev int64, limit int) int {
if end == nil {
_, _, _, err := ti.Get(key, atRev)
if err != nil {
return 0
}
return 1
}
total := 0
ti.visit(key, end, func(ki *keyIndex) bool {
if _, _, _, err := ki.get(ti.lg, atRev); err == nil {
total++
if total == limit {
return false
}
}
return true
})
return total
}
func (ti *treeIndex) Range(key, end []byte, atRev int64) (keys [][]byte, revs []revision) {
if end == nil {
rev, _, _, err := ti.Get(key, atRev)
if err != nil {
return nil, nil
}
return [][]byte{key}, []revision{rev}
}
ti.visit(key, end, func(ki *keyIndex) bool {
if rev, _, _, err := ki.get(ti.lg, atRev); err == nil {
revs = append(revs, rev)
keys = append(keys, ki.key)
}
return true
})
return keys, revs
}
func (ti *treeIndex) Tombstone(key []byte, rev revision) error {
keyi := &keyIndex{key: key}
ti.Lock()
defer ti.Unlock()
item := ti.tree.Get(keyi)
if item == nil {
return ErrRevisionNotFound
}
ki := item.(*keyIndex)
return ki.tombstone(ti.lg, rev.main, rev.sub)
}
// RangeSince returns all revisions from key (inclusive) to end (exclusive)
// at or after the given rev. The returned slice is sorted in the order
// of revision.
func (ti *treeIndex) RangeSince(key, end []byte, rev int64) []revision {
keyi := &keyIndex{key: key}
ti.RLock()
defer ti.RUnlock()
if end == nil {
item := ti.tree.Get(keyi)
if item == nil {
return nil
}
keyi = item.(*keyIndex)
return keyi.since(ti.lg, rev)
}
endi := &keyIndex{key: end}
var revs []revision
ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
if len(endi.key) > 0 && !item.Less(endi) {
return false
}
curKeyi := item.(*keyIndex)
revs = append(revs, curKeyi.since(ti.lg, rev)...)
return true
})
sort.Sort(revisions(revs))
return revs
}
func (ti *treeIndex) Compact(rev int64) map[revision]struct{} {
available := make(map[revision]struct{})
ti.lg.Info("compact tree index", zap.Int64("revision", rev))
ti.Lock()
clone := ti.tree.Clone()
ti.Unlock()
clone.Ascend(func(item btree.Item) bool {
keyi := item.(*keyIndex)
// Lock is needed here to prevent modification to the keyIndex while
// compaction is going on, or a revision being added to an empty keyIndex before deletion.
ti.Lock()
keyi.compact(ti.lg, rev, available)
if keyi.isEmpty() {
item := ti.tree.Delete(keyi)
if item == nil {
ti.lg.Panic("failed to delete during compaction")
}
}
ti.Unlock()
return true
})
return available
}
// Keep finds all revisions to be kept for a Compaction at the given rev.
func (ti *treeIndex) Keep(rev int64) map[revision]struct{} {
available := make(map[revision]struct{})
ti.RLock()
defer ti.RUnlock()
ti.tree.Ascend(func(i btree.Item) bool {
keyi := i.(*keyIndex)
keyi.keep(rev, available)
return true
})
return available
}
func (ti *treeIndex) Equal(bi index) bool {
b := bi.(*treeIndex)
if ti.tree.Len() != b.tree.Len() {
return false
}
equal := true
ti.tree.Ascend(func(item btree.Item) bool {
aki := item.(*keyIndex)
bki := b.tree.Get(item).(*keyIndex)
if !aki.equal(bki) {
equal = false
return false
}
return true
})
return equal
}
func (ti *treeIndex) Insert(ki *keyIndex) {
ti.Lock()
defer ti.Unlock()
ti.tree.ReplaceOrInsert(ki)
}
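// exampleTreeIndexUsage is an illustrative sketch added for this write-up and
// not part of the original commit: it shows the typical Put/Get/Tombstone
// flow against a treeIndex, assuming only the package-level types above.
func exampleTreeIndexUsage(lg *zap.Logger) {
ti := newTreeIndex(lg)
ti.Put([]byte("foo"), revision{main: 2})
ti.Put([]byte("foo"), revision{main: 4})
if rev, created, ver, err := ti.Get([]byte("foo"), 4); err == nil {
// rev == {4, 0}, created == {2, 0}, ver == 2
lg.Info("get", zap.Int64("rev", rev.main), zap.Int64("created", created.main), zap.Int64("ver", ver))
}
// Tombstone closes the current generation; a Get at a later rev returns
// ErrRevisionNotFound until the key is put again.
_ = ti.Tombstone([]byte("foo"), revision{main: 6})
}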

42
server/mvcc/index_bench_test.go Normal file
View File

@@ -0,0 +1,42 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"testing"
"go.uber.org/zap"
)
func BenchmarkIndexCompact1(b *testing.B) { benchmarkIndexCompact(b, 1) }
func BenchmarkIndexCompact100(b *testing.B) { benchmarkIndexCompact(b, 100) }
func BenchmarkIndexCompact10000(b *testing.B) { benchmarkIndexCompact(b, 10000) }
func BenchmarkIndexCompact100000(b *testing.B) { benchmarkIndexCompact(b, 100000) }
func BenchmarkIndexCompact1000000(b *testing.B) { benchmarkIndexCompact(b, 1000000) }
func benchmarkIndexCompact(b *testing.B, size int) {
log := zap.NewNop()
kvindex := newTreeIndex(log)
bytesN := 64
keys := createBytesSlice(bytesN, size)
for i := 1; i < size; i++ {
kvindex.Put(keys[i], revision{main: int64(i), sub: int64(i)})
}
b.ResetTimer()
for i := 1; i < b.N; i++ {
kvindex.Compact(int64(i))
}
}
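// A usage note added for this write-up (not part of the original commit):
// the benchmarks above can be run in isolation with the standard Go tooling,
// for example (the package path assumes the post-move layout):
//
// go test -run '^$' -bench 'BenchmarkIndexCompact' ./server/mvcc/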

293
server/mvcc/index_test.go Normal file
View File

@@ -0,0 +1,293 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"reflect"
"testing"
"github.com/google/btree"
"go.uber.org/zap"
)
func TestIndexGet(t *testing.T) {
ti := newTreeIndex(zap.NewExample())
ti.Put([]byte("foo"), revision{main: 2})
ti.Put([]byte("foo"), revision{main: 4})
ti.Tombstone([]byte("foo"), revision{main: 6})
tests := []struct {
rev int64
wrev revision
wcreated revision
wver int64
werr error
}{
{0, revision{}, revision{}, 0, ErrRevisionNotFound},
{1, revision{}, revision{}, 0, ErrRevisionNotFound},
{2, revision{main: 2}, revision{main: 2}, 1, nil},
{3, revision{main: 2}, revision{main: 2}, 1, nil},
{4, revision{main: 4}, revision{main: 2}, 2, nil},
{5, revision{main: 4}, revision{main: 2}, 2, nil},
{6, revision{}, revision{}, 0, ErrRevisionNotFound},
}
for i, tt := range tests {
rev, created, ver, err := ti.Get([]byte("foo"), tt.rev)
if err != tt.werr {
t.Errorf("#%d: err = %v, want %v", i, err, tt.werr)
}
if rev != tt.wrev {
t.Errorf("#%d: rev = %+v, want %+v", i, rev, tt.wrev)
}
if created != tt.wcreated {
t.Errorf("#%d: created = %+v, want %+v", i, created, tt.wcreated)
}
if ver != tt.wver {
t.Errorf("#%d: ver = %d, want %d", i, ver, tt.wver)
}
}
}
func TestIndexRange(t *testing.T) {
allKeys := [][]byte{[]byte("foo"), []byte("foo1"), []byte("foo2")}
allRevs := []revision{{main: 1}, {main: 2}, {main: 3}}
ti := newTreeIndex(zap.NewExample())
for i := range allKeys {
ti.Put(allKeys[i], allRevs[i])
}
atRev := int64(3)
tests := []struct {
key, end []byte
wkeys [][]byte
wrevs []revision
}{
// single key that is not found
{
[]byte("bar"), nil, nil, nil,
},
// single key that is found
{
[]byte("foo"), nil, allKeys[:1], allRevs[:1],
},
// range keys, return first member
{
[]byte("foo"), []byte("foo1"), allKeys[:1], allRevs[:1],
},
// range keys, return first two members
{
[]byte("foo"), []byte("foo2"), allKeys[:2], allRevs[:2],
},
// range keys, return all members
{
[]byte("foo"), []byte("fop"), allKeys, allRevs,
},
// range keys, return last two members
{
[]byte("foo1"), []byte("fop"), allKeys[1:], allRevs[1:],
},
// range keys, return last member
{
[]byte("foo2"), []byte("fop"), allKeys[2:], allRevs[2:],
},
// range keys, return nothing
{
[]byte("foo3"), []byte("fop"), nil, nil,
},
}
for i, tt := range tests {
keys, revs := ti.Range(tt.key, tt.end, atRev)
if !reflect.DeepEqual(keys, tt.wkeys) {
t.Errorf("#%d: keys = %+v, want %+v", i, keys, tt.wkeys)
}
if !reflect.DeepEqual(revs, tt.wrevs) {
t.Errorf("#%d: revs = %+v, want %+v", i, revs, tt.wrevs)
}
}
}
func TestIndexTombstone(t *testing.T) {
ti := newTreeIndex(zap.NewExample())
ti.Put([]byte("foo"), revision{main: 1})
err := ti.Tombstone([]byte("foo"), revision{main: 2})
if err != nil {
t.Errorf("tombstone error = %v, want nil", err)
}
_, _, _, err = ti.Get([]byte("foo"), 2)
if err != ErrRevisionNotFound {
t.Errorf("get error = %v, want ErrRevisionNotFound", err)
}
err = ti.Tombstone([]byte("foo"), revision{main: 3})
if err != ErrRevisionNotFound {
t.Errorf("tombstone error = %v, want %v", err, ErrRevisionNotFound)
}
}
func TestIndexRangeSince(t *testing.T) {
allKeys := [][]byte{[]byte("foo"), []byte("foo1"), []byte("foo2"), []byte("foo2"), []byte("foo1"), []byte("foo")}
allRevs := []revision{{main: 1}, {main: 2}, {main: 3}, {main: 4}, {main: 5}, {main: 6}}
ti := newTreeIndex(zap.NewExample())
for i := range allKeys {
ti.Put(allKeys[i], allRevs[i])
}
atRev := int64(1)
tests := []struct {
key, end []byte
wrevs []revision
}{
// single key that is not found
{
[]byte("bar"), nil, nil,
},
// single key that is found
{
[]byte("foo"), nil, []revision{{main: 1}, {main: 6}},
},
// range keys, return first member
{
[]byte("foo"), []byte("foo1"), []revision{{main: 1}, {main: 6}},
},
// range keys, return first two members
{
[]byte("foo"), []byte("foo2"), []revision{{main: 1}, {main: 2}, {main: 5}, {main: 6}},
},
// range keys, return all members
{
[]byte("foo"), []byte("fop"), allRevs,
},
// range keys, return last two members
{
[]byte("foo1"), []byte("fop"), []revision{{main: 2}, {main: 3}, {main: 4}, {main: 5}},
},
// range keys, return last member
{
[]byte("foo2"), []byte("fop"), []revision{{main: 3}, {main: 4}},
},
// range keys, return nothing
{
[]byte("foo3"), []byte("fop"), nil,
},
}
for i, tt := range tests {
revs := ti.RangeSince(tt.key, tt.end, atRev)
if !reflect.DeepEqual(revs, tt.wrevs) {
t.Errorf("#%d: revs = %+v, want %+v", i, revs, tt.wrevs)
}
}
}
func TestIndexCompactAndKeep(t *testing.T) {
maxRev := int64(20)
tests := []struct {
key []byte
remove bool
rev revision
created revision
ver int64
}{
{[]byte("foo"), false, revision{main: 1}, revision{main: 1}, 1},
{[]byte("foo1"), false, revision{main: 2}, revision{main: 2}, 1},
{[]byte("foo2"), false, revision{main: 3}, revision{main: 3}, 1},
{[]byte("foo2"), false, revision{main: 4}, revision{main: 3}, 2},
{[]byte("foo"), false, revision{main: 5}, revision{main: 1}, 2},
{[]byte("foo1"), false, revision{main: 6}, revision{main: 2}, 2},
{[]byte("foo1"), true, revision{main: 7}, revision{}, 0},
{[]byte("foo2"), true, revision{main: 8}, revision{}, 0},
{[]byte("foo"), true, revision{main: 9}, revision{}, 0},
{[]byte("foo"), false, revision{10, 0}, revision{10, 0}, 1},
{[]byte("foo1"), false, revision{10, 1}, revision{10, 1}, 1},
}
// Continuous Compact and Keep
ti := newTreeIndex(zap.NewExample())
for _, tt := range tests {
if tt.remove {
ti.Tombstone(tt.key, tt.rev)
} else {
ti.Put(tt.key, tt.rev)
}
}
for i := int64(1); i < maxRev; i++ {
am := ti.Compact(i)
keep := ti.Keep(i)
if !reflect.DeepEqual(am, keep) {
t.Errorf("#%d: compact keep %v != Keep keep %v", i, am, keep)
}
wti := &treeIndex{tree: btree.New(32)}
for _, tt := range tests {
if _, ok := am[tt.rev]; ok || tt.rev.GreaterThan(revision{main: i}) {
if tt.remove {
wti.Tombstone(tt.key, tt.rev)
} else {
restore(wti, tt.key, tt.created, tt.rev, tt.ver)
}
}
}
if !ti.Equal(wti) {
t.Errorf("#%d: not equal ti", i)
}
}
// Once Compact and Keep
for i := int64(1); i < maxRev; i++ {
ti := newTreeIndex(zap.NewExample())
for _, tt := range tests {
if tt.remove {
ti.Tombstone(tt.key, tt.rev)
} else {
ti.Put(tt.key, tt.rev)
}
}
am := ti.Compact(i)
keep := ti.Keep(i)
if !reflect.DeepEqual(am, keep) {
t.Errorf("#%d: compact keep %v != Keep keep %v", i, am, keep)
}
wti := &treeIndex{tree: btree.New(32)}
for _, tt := range tests {
if _, ok := am[tt.rev]; ok || tt.rev.GreaterThan(revision{main: i}) {
if tt.remove {
wti.Tombstone(tt.key, tt.rev)
} else {
restore(wti, tt.key, tt.created, tt.rev, tt.ver)
}
}
}
if !ti.Equal(wti) {
t.Errorf("#%d: not equal ti", i)
}
}
}
func restore(ti *treeIndex, key []byte, created, modified revision, ver int64) {
keyi := &keyIndex{key: key}
ti.Lock()
defer ti.Unlock()
item := ti.tree.Get(keyi)
if item == nil {
keyi.restore(ti.lg, created, modified, ver)
ti.tree.ReplaceOrInsert(keyi)
return
}
okeyi := item.(*keyIndex)
okeyi.put(ti.lg, modified.main, modified.sub)
}

378
server/mvcc/key_index.go Normal file
View File

@@ -0,0 +1,378 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"errors"
"fmt"
"github.com/google/btree"
"go.uber.org/zap"
)
var (
ErrRevisionNotFound = errors.New("mvcc: revision not found")
)
// keyIndex stores the revisions of a key in the backend.
// Each keyIndex has at least one key generation.
// Each generation might have several key versions.
// Tombstone on a key appends a tombstone version at the end
// of the current generation and creates a new empty generation.
// Each version of a key has an index pointing to the backend.
//
// For example: put(1.0);put(2.0);tombstone(3.0);put(4.0);tombstone(5.0) on key "foo"
// generate a keyIndex:
// key: "foo"
// rev: 5
// generations:
// {empty}
// {4.0, 5.0(t)}
// {1.0, 2.0, 3.0(t)}
//
// Compacting a keyIndex removes the versions with revisions smaller than
// or equal to rev, except the largest one. If a generation becomes empty
// during compaction, it will be removed. If all the generations get
// removed, the keyIndex should be removed. A runnable sketch of the
// example above follows the struct definition below.
//
// For example:
// compact(2) on the previous example
// generations:
// {empty}
// {4.0, 5.0(t)}
// {2.0, 3.0(t)}
//
// compact(4)
// generations:
// {empty}
// {4.0, 5.0(t)}
//
// compact(5):
// generations:
// {empty} -> key SHOULD be removed.
//
// compact(6):
// generations:
// {empty} -> key SHOULD be removed.
type keyIndex struct {
key []byte
modified revision // the main rev of the last modification
generations []generation
}
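// exampleKeyIndexLifecycle is an illustrative sketch added for this write-up
// and not part of the original commit: it reproduces the generations from the
// doc comment above for key "foo" using only put and tombstone.
func exampleKeyIndexLifecycle(lg *zap.Logger) *keyIndex {
ki := &keyIndex{key: []byte("foo")}
ki.put(lg, 1, 0)
ki.put(lg, 2, 0)
ki.tombstone(lg, 3, 0) // closes generation {1.0, 2.0, 3.0(t)}
ki.put(lg, 4, 0)
ki.tombstone(lg, 5, 0) // closes generation {4.0, 5.0(t)}; an empty generation remains
return ki
}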
// put puts a revision to the keyIndex.
func (ki *keyIndex) put(lg *zap.Logger, main int64, sub int64) {
rev := revision{main: main, sub: sub}
if !rev.GreaterThan(ki.modified) {
lg.Panic(
"'put' with an unexpected smaller revision",
zap.Int64("given-revision-main", rev.main),
zap.Int64("given-revision-sub", rev.sub),
zap.Int64("modified-revision-main", ki.modified.main),
zap.Int64("modified-revision-sub", ki.modified.sub),
)
}
if len(ki.generations) == 0 {
ki.generations = append(ki.generations, generation{})
}
g := &ki.generations[len(ki.generations)-1]
if len(g.revs) == 0 { // create a new key
keysGauge.Inc()
g.created = rev
}
g.revs = append(g.revs, rev)
g.ver++
ki.modified = rev
}
func (ki *keyIndex) restore(lg *zap.Logger, created, modified revision, ver int64) {
if len(ki.generations) != 0 {
lg.Panic(
"'restore' got an unexpected non-empty generations",
zap.Int("generations-size", len(ki.generations)),
)
}
ki.modified = modified
g := generation{created: created, ver: ver, revs: []revision{modified}}
ki.generations = append(ki.generations, g)
keysGauge.Inc()
}
// tombstone puts a revision, pointing to a tombstone, to the keyIndex.
// It also creates a new empty generation in the keyIndex.
// It returns ErrRevisionNotFound when tombstoning an empty generation.
func (ki *keyIndex) tombstone(lg *zap.Logger, main int64, sub int64) error {
if ki.isEmpty() {
lg.Panic(
"'tombstone' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
if ki.generations[len(ki.generations)-1].isEmpty() {
return ErrRevisionNotFound
}
ki.put(lg, main, sub)
ki.generations = append(ki.generations, generation{})
keysGauge.Dec()
return nil
}
// get gets the modified, created revision and version of the key that satisfies the given atRev.
// Rev must be higher than or equal to the given atRev.
func (ki *keyIndex) get(lg *zap.Logger, atRev int64) (modified, created revision, ver int64, err error) {
if ki.isEmpty() {
lg.Panic(
"'get' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
g := ki.findGeneration(atRev)
if g.isEmpty() {
return revision{}, revision{}, 0, ErrRevisionNotFound
}
n := g.walk(func(rev revision) bool { return rev.main > atRev })
if n != -1 {
return g.revs[n], g.created, g.ver - int64(len(g.revs)-n-1), nil
}
return revision{}, revision{}, 0, ErrRevisionNotFound
}
// since returns revisions since the given rev. Only the revision with the
// largest sub revision will be returned if multiple revisions have the same
// main revision.
func (ki *keyIndex) since(lg *zap.Logger, rev int64) []revision {
if ki.isEmpty() {
lg.Panic(
"'since' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
since := revision{rev, 0}
var gi int
// find the generations to start checking
for gi = len(ki.generations) - 1; gi > 0; gi-- {
g := ki.generations[gi]
if g.isEmpty() {
continue
}
if since.GreaterThan(g.created) {
break
}
}
var revs []revision
var last int64
for ; gi < len(ki.generations); gi++ {
for _, r := range ki.generations[gi].revs {
if since.GreaterThan(r) {
continue
}
if r.main == last {
// replace the revision with a new one that has higher sub value,
// because the original one should not be seen externally
revs[len(revs)-1] = r
continue
}
revs = append(revs, r)
last = r.main
}
}
return revs
}
// compact compacts a keyIndex by removing the versions with revisions
// smaller than or equal to the given atRev, except the largest one (if the
// largest one is a tombstone, it will not be kept).
// If a generation becomes empty during compaction, it will be removed.
func (ki *keyIndex) compact(lg *zap.Logger, atRev int64, available map[revision]struct{}) {
if ki.isEmpty() {
lg.Panic(
"'compact' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
genIdx, revIndex := ki.doCompact(atRev, available)
g := &ki.generations[genIdx]
if !g.isEmpty() {
// remove the previous contents.
if revIndex != -1 {
g.revs = g.revs[revIndex:]
}
// remove any tombstone
if len(g.revs) == 1 && genIdx != len(ki.generations)-1 {
delete(available, g.revs[0])
genIdx++
}
}
// remove the previous generations.
ki.generations = ki.generations[genIdx:]
}
// keep finds the revision to be kept if compact is called at the given atRev.
func (ki *keyIndex) keep(atRev int64, available map[revision]struct{}) {
if ki.isEmpty() {
return
}
genIdx, revIndex := ki.doCompact(atRev, available)
g := &ki.generations[genIdx]
if !g.isEmpty() {
// remove any tombstone
if revIndex == len(g.revs)-1 && genIdx != len(ki.generations)-1 {
delete(available, g.revs[revIndex])
}
}
}
func (ki *keyIndex) doCompact(atRev int64, available map[revision]struct{}) (genIdx int, revIndex int) {
// walk until reaching the first revision smaller than or equal to "atRev",
// and add that revision to the available map
f := func(rev revision) bool {
if rev.main <= atRev {
available[rev] = struct{}{}
return false
}
return true
}
genIdx, g := 0, &ki.generations[0]
// find the first generation that includes atRev or was created after atRev
for genIdx < len(ki.generations)-1 {
if tomb := g.revs[len(g.revs)-1].main; tomb > atRev {
break
}
genIdx++
g = &ki.generations[genIdx]
}
revIndex = g.walk(f)
return genIdx, revIndex
}
func (ki *keyIndex) isEmpty() bool {
return len(ki.generations) == 1 && ki.generations[0].isEmpty()
}
// findGeneration finds the generation of the keyIndex that the given rev
// belongs to. If the given rev is in the gap between two generations, which
// means that the key does not exist at the given rev, it returns nil.
func (ki *keyIndex) findGeneration(rev int64) *generation {
lastg := len(ki.generations) - 1
cg := lastg
for cg >= 0 {
if len(ki.generations[cg].revs) == 0 {
cg--
continue
}
g := ki.generations[cg]
if cg != lastg {
if tomb := g.revs[len(g.revs)-1].main; tomb <= rev {
return nil
}
}
if g.revs[0].main <= rev {
return &ki.generations[cg]
}
cg--
}
return nil
}
func (ki *keyIndex) Less(b btree.Item) bool {
return bytes.Compare(ki.key, b.(*keyIndex).key) == -1
}
func (ki *keyIndex) equal(b *keyIndex) bool {
if !bytes.Equal(ki.key, b.key) {
return false
}
if ki.modified != b.modified {
return false
}
if len(ki.generations) != len(b.generations) {
return false
}
for i := range ki.generations {
ag, bg := ki.generations[i], b.generations[i]
if !ag.equal(bg) {
return false
}
}
return true
}
func (ki *keyIndex) String() string {
var s string
for _, g := range ki.generations {
s += g.String()
}
return s
}
// generation contains multiple revisions of a key.
type generation struct {
ver int64
created revision // when the generation is created (put in first revision).
revs []revision
}
func (g *generation) isEmpty() bool { return g == nil || len(g.revs) == 0 }
// walk walks through the revisions in the generation in descending order.
// It passes each revision to the given function.
// walk stops when either: 1. it finishes walking all revisions, or 2. the
// function returns false.
// walk returns the position at which it stopped; if it finished walking
// without being stopped, -1 is returned.
func (g *generation) walk(f func(rev revision) bool) int {
l := len(g.revs)
for i := range g.revs {
ok := f(g.revs[l-i-1])
if !ok {
return l - i - 1
}
}
return -1
}
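// An illustrative sketch added for this write-up (not part of the original
// commit): this is how get and doCompact use walk to locate the newest
// revision at or below a given main revision.
func exampleNewestRevisionAtOrBelow(g *generation, atRev int64) (revision, bool) {
n := g.walk(func(rev revision) bool { return rev.main > atRev })
if n == -1 {
return revision{}, false
}
return g.revs[n], true
}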
func (g *generation) String() string {
return fmt.Sprintf("g: created[%d] ver[%d], revs %#v\n", g.created, g.ver, g.revs)
}
func (g generation) equal(b generation) bool {
if g.ver != b.ver {
return false
}
if len(g.revs) != len(b.revs) {
return false
}
for i := range g.revs {
ar, br := g.revs[i], b.revs[i]
if ar != br {
return false
}
}
return true
}

700
server/mvcc/key_index_test.go Normal file
View File

@@ -0,0 +1,700 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"reflect"
"testing"
"go.uber.org/zap"
)
func TestKeyIndexGet(t *testing.T) {
// key: "foo"
// rev: 16
// generations:
// {empty}
// {{14, 0}[1], {14, 1}[2], {16, 0}(t)[3]}
// {{8, 0}[1], {10, 0}[2], {12, 0}(t)[3]}
// {{2, 0}[1], {4, 0}[2], {6, 0}(t)[3]}
ki := newTestKeyIndex()
ki.compact(zap.NewExample(), 4, make(map[revision]struct{}))
tests := []struct {
rev int64
wmod revision
wcreat revision
wver int64
werr error
}{
{17, revision{}, revision{}, 0, ErrRevisionNotFound},
{16, revision{}, revision{}, 0, ErrRevisionNotFound},
// get on generation 3
{15, revision{14, 1}, revision{14, 0}, 2, nil},
{14, revision{14, 1}, revision{14, 0}, 2, nil},
{13, revision{}, revision{}, 0, ErrRevisionNotFound},
{12, revision{}, revision{}, 0, ErrRevisionNotFound},
// get on generation 2
{11, revision{10, 0}, revision{8, 0}, 2, nil},
{10, revision{10, 0}, revision{8, 0}, 2, nil},
{9, revision{8, 0}, revision{8, 0}, 1, nil},
{8, revision{8, 0}, revision{8, 0}, 1, nil},
{7, revision{}, revision{}, 0, ErrRevisionNotFound},
{6, revision{}, revision{}, 0, ErrRevisionNotFound},
// get on generation 1
{5, revision{4, 0}, revision{2, 0}, 2, nil},
{4, revision{4, 0}, revision{2, 0}, 2, nil},
{3, revision{}, revision{}, 0, ErrRevisionNotFound},
{2, revision{}, revision{}, 0, ErrRevisionNotFound},
{1, revision{}, revision{}, 0, ErrRevisionNotFound},
{0, revision{}, revision{}, 0, ErrRevisionNotFound},
}
for i, tt := range tests {
mod, creat, ver, err := ki.get(zap.NewExample(), tt.rev)
if err != tt.werr {
t.Errorf("#%d: err = %v, want %v", i, err, tt.werr)
}
if mod != tt.wmod {
t.Errorf("#%d: modified = %+v, want %+v", i, mod, tt.wmod)
}
if creat != tt.wcreat {
t.Errorf("#%d: created = %+v, want %+v", i, creat, tt.wcreat)
}
if ver != tt.wver {
t.Errorf("#%d: version = %d, want %d", i, ver, tt.wver)
}
}
}
func TestKeyIndexSince(t *testing.T) {
ki := newTestKeyIndex()
ki.compact(zap.NewExample(), 4, make(map[revision]struct{}))
allRevs := []revision{{4, 0}, {6, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 1}, {16, 0}}
tests := []struct {
rev int64
wrevs []revision
}{
{17, nil},
{16, allRevs[6:]},
{15, allRevs[6:]},
{14, allRevs[5:]},
{13, allRevs[5:]},
{12, allRevs[4:]},
{11, allRevs[4:]},
{10, allRevs[3:]},
{9, allRevs[3:]},
{8, allRevs[2:]},
{7, allRevs[2:]},
{6, allRevs[1:]},
{5, allRevs[1:]},
{4, allRevs},
{3, allRevs},
{2, allRevs},
{1, allRevs},
{0, allRevs},
}
for i, tt := range tests {
revs := ki.since(zap.NewExample(), tt.rev)
if !reflect.DeepEqual(revs, tt.wrevs) {
t.Errorf("#%d: revs = %+v, want %+v", i, revs, tt.wrevs)
}
}
}
func TestKeyIndexPut(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 5, 0)
wki := &keyIndex{
key: []byte("foo"),
modified: revision{5, 0},
generations: []generation{{created: revision{5, 0}, ver: 1, revs: []revision{{main: 5}}}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
ki.put(zap.NewExample(), 7, 0)
wki = &keyIndex{
key: []byte("foo"),
modified: revision{7, 0},
generations: []generation{{created: revision{5, 0}, ver: 2, revs: []revision{{main: 5}, {main: 7}}}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
}
func TestKeyIndexRestore(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.restore(zap.NewExample(), revision{5, 0}, revision{7, 0}, 2)
wki := &keyIndex{
key: []byte("foo"),
modified: revision{7, 0},
generations: []generation{{created: revision{5, 0}, ver: 2, revs: []revision{{main: 7}}}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
}
func TestKeyIndexTombstone(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 5, 0)
err := ki.tombstone(zap.NewExample(), 7, 0)
if err != nil {
t.Errorf("unexpected tombstone error: %v", err)
}
wki := &keyIndex{
key: []byte("foo"),
modified: revision{7, 0},
generations: []generation{{created: revision{5, 0}, ver: 2, revs: []revision{{main: 5}, {main: 7}}}, {}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
ki.put(zap.NewExample(), 8, 0)
ki.put(zap.NewExample(), 9, 0)
err = ki.tombstone(zap.NewExample(), 15, 0)
if err != nil {
t.Errorf("unexpected tombstone error: %v", err)
}
wki = &keyIndex{
key: []byte("foo"),
modified: revision{15, 0},
generations: []generation{
{created: revision{5, 0}, ver: 2, revs: []revision{{main: 5}, {main: 7}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 9}, {main: 15}}},
{},
},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
err = ki.tombstone(zap.NewExample(), 16, 0)
if err != ErrRevisionNotFound {
t.Errorf("tombstone error = %v, want %v", err, ErrRevisionNotFound)
}
}
func TestKeyIndexCompactAndKeep(t *testing.T) {
tests := []struct {
compact int64
wki *keyIndex
wam map[revision]struct{}
}{
{
1,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 2}, {main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
2,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 2}, {main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 2}: {},
},
},
{
3,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 2}, {main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 2}: {},
},
},
{
4,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 4}: {},
},
},
{
5,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 4}: {},
},
},
{
6,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
7,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
8,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 8}: {},
},
},
{
9,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 8}: {},
},
},
{
10,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 10}: {},
},
},
{
11,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 10}: {},
},
},
{
12,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
13,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
14,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 14, sub: 1}: {},
},
},
{
15,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 14, sub: 1}: {},
},
},
{
16,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{},
},
},
map[revision]struct{}{},
},
}
// Continuous Compaction and finding Keep
ki := newTestKeyIndex()
for i, tt := range tests {
am := make(map[revision]struct{})
kiclone := cloneKeyIndex(ki)
ki.keep(tt.compact, am)
if !reflect.DeepEqual(ki, kiclone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiclone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
}
// Jump Compaction and finding Keep
ki = newTestKeyIndex()
for i, tt := range tests {
if (i%2 == 0 && i < 6) || (i%2 == 1 && i > 6) {
am := make(map[revision]struct{})
kiclone := cloneKeyIndex(ki)
ki.keep(tt.compact, am)
if !reflect.DeepEqual(ki, kiclone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiclone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
}
}
kiClone := newTestKeyIndex()
// Once Compaction and finding Keep
for i, tt := range tests {
ki := newTestKeyIndex()
am := make(map[revision]struct{})
ki.keep(tt.compact, am)
if !reflect.DeepEqual(ki, kiClone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiClone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
}
}
func cloneKeyIndex(ki *keyIndex) *keyIndex {
generations := make([]generation, len(ki.generations))
for i, gen := range ki.generations {
generations[i] = *cloneGeneration(&gen)
}
return &keyIndex{ki.key, ki.modified, generations}
}
func cloneGeneration(g *generation) *generation {
if g.revs == nil {
return &generation{g.ver, g.created, nil}
}
tmp := make([]revision, len(g.revs))
copy(tmp, g.revs)
return &generation{g.ver, g.created, tmp}
}
// test that compacting at a revision higher than the last modified revision works well
func TestKeyIndexCompactOnFurtherRev(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 1, 0)
ki.put(zap.NewExample(), 2, 0)
am := make(map[revision]struct{})
ki.compact(zap.NewExample(), 3, am)
wki := &keyIndex{
key: []byte("foo"),
modified: revision{2, 0},
generations: []generation{
{created: revision{1, 0}, ver: 2, revs: []revision{{main: 2}}},
},
}
wam := map[revision]struct{}{
{main: 2}: {},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
if !reflect.DeepEqual(am, wam) {
t.Errorf("am = %+v, want %+v", am, wam)
}
}
func TestKeyIndexIsEmpty(t *testing.T) {
tests := []struct {
ki *keyIndex
w bool
}{
{
&keyIndex{
key: []byte("foo"),
generations: []generation{{}},
},
true,
},
{
&keyIndex{
key: []byte("foo"),
modified: revision{2, 0},
generations: []generation{
{created: revision{1, 0}, ver: 2, revs: []revision{{main: 2}}},
},
},
false,
},
}
for i, tt := range tests {
g := tt.ki.isEmpty()
if g != tt.w {
t.Errorf("#%d: isEmpty = %v, want %v", i, g, tt.w)
}
}
}
func TestKeyIndexFindGeneration(t *testing.T) {
ki := newTestKeyIndex()
tests := []struct {
rev int64
wg *generation
}{
{0, nil},
{1, nil},
{2, &ki.generations[0]},
{3, &ki.generations[0]},
{4, &ki.generations[0]},
{5, &ki.generations[0]},
{6, nil},
{7, nil},
{8, &ki.generations[1]},
{9, &ki.generations[1]},
{10, &ki.generations[1]},
{11, &ki.generations[1]},
{12, nil},
{13, nil},
}
for i, tt := range tests {
g := ki.findGeneration(tt.rev)
if g != tt.wg {
t.Errorf("#%d: generation = %+v, want %+v", i, g, tt.wg)
}
}
}
func TestKeyIndexLess(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
tests := []struct {
ki *keyIndex
w bool
}{
{&keyIndex{key: []byte("doo")}, false},
{&keyIndex{key: []byte("foo")}, false},
{&keyIndex{key: []byte("goo")}, true},
}
for i, tt := range tests {
g := ki.Less(tt.ki)
if g != tt.w {
t.Errorf("#%d: Less = %v, want %v", i, g, tt.w)
}
}
}
func TestGenerationIsEmpty(t *testing.T) {
tests := []struct {
g *generation
w bool
}{
{nil, true},
{&generation{}, true},
{&generation{revs: []revision{{main: 1}}}, false},
}
for i, tt := range tests {
g := tt.g.isEmpty()
if g != tt.w {
t.Errorf("#%d: isEmpty = %v, want %v", i, g, tt.w)
}
}
}
func TestGenerationWalk(t *testing.T) {
g := &generation{
ver: 3,
created: revision{2, 0},
revs: []revision{{main: 2}, {main: 4}, {main: 6}},
}
tests := []struct {
f func(rev revision) bool
wi int
}{
{func(rev revision) bool { return rev.main >= 7 }, 2},
{func(rev revision) bool { return rev.main >= 6 }, 1},
{func(rev revision) bool { return rev.main >= 5 }, 1},
{func(rev revision) bool { return rev.main >= 4 }, 0},
{func(rev revision) bool { return rev.main >= 3 }, 0},
{func(rev revision) bool { return rev.main >= 2 }, -1},
}
for i, tt := range tests {
idx := g.walk(tt.f)
if idx != tt.wi {
t.Errorf("#%d: index = %d, want %d", i, idx, tt.wi)
}
}
}
func newTestKeyIndex() *keyIndex {
// key: "foo"
// rev: 16
// generations:
// {empty}
// {{14, 0}[1], {14, 1}[2], {16, 0}(t)[3]}
// {{8, 0}[1], {10, 0}[2], {12, 0}(t)[3]}
// {{2, 0}[1], {4, 0}[2], {6, 0}(t)[3]}
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 2, 0)
ki.put(zap.NewExample(), 4, 0)
ki.tombstone(zap.NewExample(), 6, 0)
ki.put(zap.NewExample(), 8, 0)
ki.put(zap.NewExample(), 10, 0)
ki.tombstone(zap.NewExample(), 12, 0)
ki.put(zap.NewExample(), 14, 0)
ki.put(zap.NewExample(), 14, 1)
ki.tombstone(zap.NewExample(), 16, 0)
return ki
}

150
server/mvcc/kv.go Normal file
View File

@@ -0,0 +1,150 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
)
type RangeOptions struct {
Limit int64
Rev int64
Count bool
}
type RangeResult struct {
KVs []mvccpb.KeyValue
Rev int64
Count int
}
type ReadView interface {
// FirstRev returns the first KV revision at the time of opening the txn.
// After a compaction, the first revision increases to the compaction
// revision.
FirstRev() int64
// Rev returns the revision of the KV at the time of opening the txn.
Rev() int64
// Range gets the keys in the range at rangeRev.
// The returned rev is the current revision of the KV when the operation is executed.
// If rangeRev <= 0, range gets the keys at currentRev.
// If `end` is nil, the request returns the key.
// If `end` is not nil and not empty, it gets the keys in range [key, range_end).
// If `end` is not nil and empty, it gets the keys greater than or equal to key.
// Limit limits the number of keys returned.
// If the required rev is compacted, ErrCompacted will be returned.
Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error)
}
// TxnRead represents a read-only transaction with operations that will not
// block other read transactions.
type TxnRead interface {
ReadView
// End marks the transaction as complete and ready to commit.
End()
}
type WriteView interface {
// DeleteRange deletes the given range from the store.
// A deleteRange increases the rev of the store if any key in the range exists.
// The number of keys deleted will be returned.
// The returned rev is the current revision of the KV when the operation is executed.
// It also generates one event for each key deleted in the event history.
// If the `end` is nil, deleteRange deletes the key.
// If the `end` is not nil, deleteRange deletes the keys in range [key, range_end).
DeleteRange(key, end []byte) (n, rev int64)
// Put puts the given key, value into the store. Put also takes an additional
// argument lease to attach a lease to the key-value pair as metadata. The KV
// implementation does not validate the lease id.
// A put also increases the rev of the store, and generates one event in the event history.
// The returned rev is the current revision of the KV when the operation is executed.
Put(key, value []byte, lease lease.LeaseID) (rev int64)
}
// TxnWrite represents a transaction that can modify the store.
type TxnWrite interface {
TxnRead
WriteView
// Changes gets the changes made since opening the write txn.
Changes() []mvccpb.KeyValue
}
// txnReadWrite coerces a read txn to a write, panicking on any write operation.
type txnReadWrite struct{ TxnRead }
func (trw *txnReadWrite) DeleteRange(key, end []byte) (n, rev int64) { panic("unexpected DeleteRange") }
func (trw *txnReadWrite) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
panic("unexpected Put")
}
func (trw *txnReadWrite) Changes() []mvccpb.KeyValue { return nil }
func NewReadOnlyTxnWrite(txn TxnRead) TxnWrite { return &txnReadWrite{txn} }
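// A usage note added for this write-up (not part of the original commit):
// NewReadOnlyTxnWrite lets read-only code paths satisfy interfaces that
// require a TxnWrite while panicking loudly if a write sneaks in, e.g.
//
// tw := NewReadOnlyTxnWrite(kv.Read(traceutil.TODO()))
// defer tw.End()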
type KV interface {
ReadView
WriteView
// Read creates a read transaction.
Read(trace *traceutil.Trace) TxnRead
// Write creates a write transaction.
Write(trace *traceutil.Trace) TxnWrite
// Hash computes the hash of the KV's backend.
Hash() (hash uint32, revision int64, err error)
// HashByRev computes the hash of all MVCC revisions up to a given revision.
HashByRev(rev int64) (hash uint32, revision int64, compactRev int64, err error)
// Compact frees all superseded keys with revisions less than rev.
Compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error)
// Commit commits outstanding txns into the underlying backend.
Commit()
// Restore restores the KV store from a backend.
Restore(b backend.Backend) error
Close() error
}
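// exampleReadTxnRange is an illustrative sketch added for this write-up (not
// part of the original commit): it drives ReadView.Range through a read txn,
// which is how callers batch reads at a single consistent revision.
func exampleReadTxnRange(kv KV) (*RangeResult, error) {
txn := kv.Read(traceutil.TODO())
defer txn.End()
// Rev <= 0 reads at the current revision; Limit caps the number of keys.
return txn.Range([]byte("foo"), []byte("fop"), RangeOptions{Limit: 10})
}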
// WatchableKV is a KV that can be watched.
type WatchableKV interface {
KV
Watchable
}
// Watchable is the interface that wraps the NewWatchStream function.
type Watchable interface {
// NewWatchStream returns a WatchStream that can be used to
// watch events that happened or are happening on the KV.
NewWatchStream() WatchStream
}
// ConsistentWatchableKV is a WatchableKV that understands the consistency
// algorithm and consistent index.
// If the consistent index of the executing entry is not larger than the
// consistent index of the ConsistentWatchableKV, all operations in
// that entry are skipped and return an empty response.
type ConsistentWatchableKV interface {
WatchableKV
// ConsistentIndex returns the current consistent index of the KV.
ConsistentIndex() uint64
}

838
server/mvcc/kv_test.go Normal file
View File

@@ -0,0 +1,838 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"fmt"
"os"
"reflect"
"testing"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/testutil"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"go.uber.org/zap"
)
// Functional tests for features implemented in the v3 store. They treat the
// v3 store as a black box, and test it by feeding input and validating the
// output.
// TODO: add similar tests on operations in one txn/rev
type (
rangeFunc func(kv KV, key, end []byte, ro RangeOptions) (*RangeResult, error)
putFunc func(kv KV, key, value []byte, lease lease.LeaseID) int64
deleteRangeFunc func(kv KV, key, end []byte) (n, rev int64)
)
var (
normalRangeFunc = func(kv KV, key, end []byte, ro RangeOptions) (*RangeResult, error) {
return kv.Range(key, end, ro)
}
txnRangeFunc = func(kv KV, key, end []byte, ro RangeOptions) (*RangeResult, error) {
txn := kv.Read(traceutil.TODO())
defer txn.End()
return txn.Range(key, end, ro)
}
normalPutFunc = func(kv KV, key, value []byte, lease lease.LeaseID) int64 {
return kv.Put(key, value, lease)
}
txnPutFunc = func(kv KV, key, value []byte, lease lease.LeaseID) int64 {
txn := kv.Write(traceutil.TODO())
defer txn.End()
return txn.Put(key, value, lease)
}
normalDeleteRangeFunc = func(kv KV, key, end []byte) (n, rev int64) {
return kv.DeleteRange(key, end)
}
txnDeleteRangeFunc = func(kv KV, key, end []byte) (n, rev int64) {
txn := kv.Write(traceutil.TODO())
defer txn.End()
return txn.DeleteRange(key, end)
}
)
func TestKVRange(t *testing.T) { testKVRange(t, normalRangeFunc) }
func TestKVTxnRange(t *testing.T) { testKVRange(t, txnRangeFunc) }
func testKVRange(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
kvs := put3TestKVs(s)
wrev := int64(4)
tests := []struct {
key, end []byte
wkvs []mvccpb.KeyValue
}{
// get no keys
{
[]byte("doo"), []byte("foo"),
nil,
},
// get no keys when key == end
{
[]byte("foo"), []byte("foo"),
nil,
},
// get no keys when ranging single key
{
[]byte("doo"), nil,
nil,
},
// get all keys
{
[]byte("foo"), []byte("foo3"),
kvs,
},
// get partial keys
{
[]byte("foo"), []byte("foo1"),
kvs[:1],
},
// get single key
{
[]byte("foo"), nil,
kvs[:1],
},
// get entire keyspace
{
[]byte(""), []byte(""),
kvs,
},
}
for i, tt := range tests {
r, err := f(s, tt.key, tt.end, RangeOptions{})
if err != nil {
t.Fatal(err)
}
if r.Rev != wrev {
t.Errorf("#%d: rev = %d, want %d", i, r.Rev, wrev)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
}
}
func TestKVRangeRev(t *testing.T) { testKVRangeRev(t, normalRangeFunc) }
func TestKVTxnRangeRev(t *testing.T) { testKVRangeRev(t, txnRangeFunc) }
func testKVRangeRev(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
kvs := put3TestKVs(s)
tests := []struct {
rev int64
wrev int64
wkvs []mvccpb.KeyValue
}{
{-1, 4, kvs},
{0, 4, kvs},
{2, 4, kvs[:1]},
{3, 4, kvs[:2]},
{4, 4, kvs},
}
for i, tt := range tests {
r, err := f(s, []byte("foo"), []byte("foo3"), RangeOptions{Rev: tt.rev})
if err != nil {
t.Fatal(err)
}
if r.Rev != tt.wrev {
t.Errorf("#%d: rev = %d, want %d", i, r.Rev, tt.wrev)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
}
}
func TestKVRangeBadRev(t *testing.T) { testKVRangeBadRev(t, normalRangeFunc) }
func TestKVTxnRangeBadRev(t *testing.T) { testKVRangeBadRev(t, txnRangeFunc) }
func testKVRangeBadRev(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
put3TestKVs(s)
if _, err := s.Compact(traceutil.TODO(), 4); err != nil {
t.Fatalf("compact error (%v)", err)
}
tests := []struct {
rev int64
werr error
}{
{-1, nil}, // rev <= 0 means the most recent revision
{0, nil},
{1, ErrCompacted},
{2, ErrCompacted},
{4, nil},
{5, ErrFutureRev},
{100, ErrFutureRev},
}
for i, tt := range tests {
_, err := f(s, []byte("foo"), []byte("foo3"), RangeOptions{Rev: tt.rev})
if err != tt.werr {
t.Errorf("#%d: error = %v, want %v", i, err, tt.werr)
}
}
}
func TestKVRangeLimit(t *testing.T) { testKVRangeLimit(t, normalRangeFunc) }
func TestKVTxnRangeLimit(t *testing.T) { testKVRangeLimit(t, txnRangeFunc) }
func testKVRangeLimit(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
kvs := put3TestKVs(s)
wrev := int64(4)
tests := []struct {
limit int64
wkvs []mvccpb.KeyValue
}{
// no limit
{-1, kvs},
// no limit
{0, kvs},
{1, kvs[:1]},
{2, kvs[:2]},
{3, kvs},
{100, kvs},
}
for i, tt := range tests {
r, err := f(s, []byte("foo"), []byte("foo3"), RangeOptions{Limit: tt.limit})
if err != nil {
t.Fatalf("#%d: range error (%v)", i, err)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
if r.Rev != wrev {
t.Errorf("#%d: rev = %d, want %d", i, r.Rev, wrev)
}
if tt.limit <= 0 || int(tt.limit) > len(kvs) {
if r.Count != len(kvs) {
t.Errorf("#%d: count = %d, want %d", i, r.Count, len(kvs))
}
} else if r.Count != int(tt.limit) {
t.Errorf("#%d: count = %d, want %d", i, r.Count, tt.limit)
}
}
}
func TestKVPutMultipleTimes(t *testing.T) { testKVPutMultipleTimes(t, normalPutFunc) }
func TestKVTxnPutMultipleTimes(t *testing.T) { testKVPutMultipleTimes(t, txnPutFunc) }
func testKVPutMultipleTimes(t *testing.T, f putFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
for i := 0; i < 10; i++ {
base := int64(i + 1)
rev := f(s, []byte("foo"), []byte("bar"), lease.LeaseID(base))
if rev != base+1 {
t.Errorf("#%d: rev = %d, want %d", i, rev, base+1)
}
r, err := s.Range([]byte("foo"), nil, RangeOptions{})
if err != nil {
t.Fatal(err)
}
wkvs := []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: 2, ModRevision: base + 1, Version: base, Lease: base},
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, wkvs)
}
}
}
func TestKVDeleteRange(t *testing.T) { testKVDeleteRange(t, normalDeleteRangeFunc) }
func TestKVTxnDeleteRange(t *testing.T) { testKVDeleteRange(t, txnDeleteRangeFunc) }
func testKVDeleteRange(t *testing.T, f deleteRangeFunc) {
tests := []struct {
key, end []byte
wrev int64
wN int64
}{
{
[]byte("foo"), nil,
5, 1,
},
{
[]byte("foo"), []byte("foo1"),
5, 1,
},
{
[]byte("foo"), []byte("foo2"),
5, 2,
},
{
[]byte("foo"), []byte("foo3"),
5, 3,
},
{
[]byte("foo3"), []byte("foo8"),
4, 0,
},
{
[]byte("foo3"), nil,
4, 0,
},
}
for i, tt := range tests {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
s.Put([]byte("foo1"), []byte("bar1"), lease.NoLease)
s.Put([]byte("foo2"), []byte("bar2"), lease.NoLease)
n, rev := f(s, tt.key, tt.end)
if n != tt.wN || rev != tt.wrev {
t.Errorf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, tt.wN, tt.wrev)
}
cleanup(s, b, tmpPath)
}
}
func TestKVDeleteMultipleTimes(t *testing.T) { testKVDeleteMultipleTimes(t, normalDeleteRangeFunc) }
func TestKVTxnDeleteMultipleTimes(t *testing.T) { testKVDeleteMultipleTimes(t, txnDeleteRangeFunc) }
func testKVDeleteMultipleTimes(t *testing.T, f deleteRangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
n, rev := f(s, []byte("foo"), nil)
if n != 1 || rev != 3 {
t.Fatalf("n = %d, rev = %d, want (%d, %d)", n, rev, 1, 3)
}
for i := 0; i < 10; i++ {
n, rev := f(s, []byte("foo"), nil)
if n != 0 || rev != 3 {
t.Fatalf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, 0, 3)
}
}
}
// test that range, put, delete on a single key in sequence repeatedly works correctly.
func TestKVOperationInSequence(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
for i := 0; i < 10; i++ {
base := int64(i*2 + 1)
// put foo
rev := s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
if rev != base+1 {
t.Errorf("#%d: put rev = %d, want %d", i, rev, base+1)
}
r, err := s.Range([]byte("foo"), nil, RangeOptions{Rev: base + 1})
if err != nil {
t.Fatal(err)
}
wkvs := []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: base + 1, ModRevision: base + 1, Version: 1, Lease: int64(lease.NoLease)},
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, wkvs)
}
if r.Rev != base+1 {
t.Errorf("#%d: range rev = %d, want %d", i, rev, base+1)
}
// delete foo
n, rev := s.DeleteRange([]byte("foo"), nil)
if n != 1 || rev != base+2 {
t.Errorf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, 1, base+2)
}
r, err = s.Range([]byte("foo"), nil, RangeOptions{Rev: base + 2})
if err != nil {
t.Fatal(err)
}
if r.KVs != nil {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, nil)
}
if r.Rev != base+2 {
t.Errorf("#%d: range rev = %d, want %d", i, r.Rev, base+2)
}
}
}
func TestKVTxnBlockWriteOperations(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
tests := []func(){
func() { s.Put([]byte("foo"), nil, lease.NoLease) },
func() { s.DeleteRange([]byte("foo"), nil) },
}
for i, tt := range tests {
tf := tt
txn := s.Write(traceutil.TODO())
done := make(chan struct{}, 1)
go func() {
tf()
done <- struct{}{}
}()
select {
case <-done:
t.Fatalf("#%d: operation failed to be blocked", i)
case <-time.After(10 * time.Millisecond):
}
txn.End()
select {
case <-done:
case <-time.After(10 * time.Second):
testutil.FatalStack(t, fmt.Sprintf("#%d: operation failed to be unblocked", i))
}
}
// only close the backend when we know all the txns are finished
cleanup(s, b, tmpPath)
}
func TestKVTxnNonBlockRange(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
txn := s.Write(traceutil.TODO())
defer txn.End()
donec := make(chan struct{})
go func() {
defer close(donec)
s.Range([]byte("foo"), nil, RangeOptions{})
}()
select {
case <-donec:
case <-time.After(100 * time.Millisecond):
t.Fatalf("range operation blocked on write txn")
}
}
// test that txn range, put, delete on a single key in sequence repeatedly works correctly.
func TestKVTxnOperationInSequence(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
for i := 0; i < 10; i++ {
txn := s.Write(traceutil.TODO())
base := int64(i + 1)
// put foo
rev := txn.Put([]byte("foo"), []byte("bar"), lease.NoLease)
if rev != base+1 {
t.Errorf("#%d: put rev = %d, want %d", i, rev, base+1)
}
r, err := txn.Range([]byte("foo"), nil, RangeOptions{Rev: base + 1})
if err != nil {
t.Fatal(err)
}
wkvs := []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: base + 1, ModRevision: base + 1, Version: 1, Lease: int64(lease.NoLease)},
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, wkvs)
}
if r.Rev != base+1 {
t.Errorf("#%d: range rev = %d, want %d", i, r.Rev, base+1)
}
// delete foo
n, rev := txn.DeleteRange([]byte("foo"), nil)
if n != 1 || rev != base+1 {
t.Errorf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, 1, base+1)
}
r, err = txn.Range([]byte("foo"), nil, RangeOptions{Rev: base + 1})
if err != nil {
t.Errorf("#%d: range error (%v)", i, err)
}
if r.KVs != nil {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, nil)
}
if r.Rev != base+1 {
t.Errorf("#%d: range rev = %d, want %d", i, r.Rev, base+1)
}
txn.End()
}
}
func TestKVCompactReserveLastValue(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
s.Put([]byte("foo"), []byte("bar0"), 1)
s.Put([]byte("foo"), []byte("bar1"), 2)
s.DeleteRange([]byte("foo"), nil)
s.Put([]byte("foo"), []byte("bar2"), 3)
// the revs in tests will be passed to Compact() one by one on the same store
tests := []struct {
rev int64
// wanted kvs right after the compacted rev
wkvs []mvccpb.KeyValue
}{
{
1,
[]mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar0"), CreateRevision: 2, ModRevision: 2, Version: 1, Lease: 1},
},
},
{
2,
[]mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar1"), CreateRevision: 2, ModRevision: 3, Version: 2, Lease: 2},
},
},
{
3,
nil,
},
{
4,
[]mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar2"), CreateRevision: 5, ModRevision: 5, Version: 1, Lease: 3},
},
},
}
for i, tt := range tests {
_, err := s.Compact(traceutil.TODO(), tt.rev)
if err != nil {
t.Errorf("#%d: unexpect compact error %v", i, err)
}
r, err := s.Range([]byte("foo"), nil, RangeOptions{Rev: tt.rev + 1})
if err != nil {
t.Errorf("#%d: unexpect range error %v", i, err)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
}
}
func TestKVCompactBad(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
s.Put([]byte("foo"), []byte("bar0"), lease.NoLease)
s.Put([]byte("foo"), []byte("bar1"), lease.NoLease)
s.Put([]byte("foo"), []byte("bar2"), lease.NoLease)
// the revs in tests will be passed to Compact() one by one on the same store
tests := []struct {
rev int64
werr error
}{
{0, nil},
{1, nil},
{1, ErrCompacted},
{4, nil},
{5, ErrFutureRev},
{100, ErrFutureRev},
}
for i, tt := range tests {
_, err := s.Compact(traceutil.TODO(), tt.rev)
if err != tt.werr {
t.Errorf("#%d: compact error = %v, want %v", i, err, tt.werr)
}
}
}
func TestKVHash(t *testing.T) {
hashes := make([]uint32, 3)
for i := 0; i < len(hashes); i++ {
var err error
b, tmpPath := backend.NewDefaultTmpBackend()
kv := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
kv.Put([]byte("foo0"), []byte("bar0"), lease.NoLease)
kv.Put([]byte("foo1"), []byte("bar0"), lease.NoLease)
hashes[i], _, err = kv.Hash()
if err != nil {
t.Fatalf("failed to get hash: %v", err)
}
cleanup(kv, b, tmpPath)
}
for i := 1; i < len(hashes); i++ {
if hashes[i-1] != hashes[i] {
t.Errorf("hash[%d](%d) != hash[%d](%d)", i-1, hashes[i-1], i, hashes[i])
}
}
}
func TestKVRestore(t *testing.T) {
tests := []func(kv KV){
func(kv KV) {
kv.Put([]byte("foo"), []byte("bar0"), 1)
kv.Put([]byte("foo"), []byte("bar1"), 2)
kv.Put([]byte("foo"), []byte("bar2"), 3)
kv.Put([]byte("foo2"), []byte("bar0"), 1)
},
func(kv KV) {
kv.Put([]byte("foo"), []byte("bar0"), 1)
kv.DeleteRange([]byte("foo"), nil)
kv.Put([]byte("foo"), []byte("bar1"), 2)
},
func(kv KV) {
kv.Put([]byte("foo"), []byte("bar0"), 1)
kv.Put([]byte("foo"), []byte("bar1"), 2)
kv.Compact(traceutil.TODO(), 1)
},
}
for i, tt := range tests {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
tt(s)
var kvss [][]mvccpb.KeyValue
for k := int64(0); k < 10; k++ {
r, _ := s.Range([]byte("a"), []byte("z"), RangeOptions{Rev: k})
kvss = append(kvss, r.KVs)
}
keysBefore := readGaugeInt(keysGauge)
s.Close()
// ns should recover the previous state from the backend.
ns := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
if keysRestore := readGaugeInt(keysGauge); keysBefore != keysRestore {
t.Errorf("#%d: got %d key count, expected %d", i, keysRestore, keysBefore)
}
// wait for possible compaction to finish
testutil.WaitSchedule()
var nkvss [][]mvccpb.KeyValue
for k := int64(0); k < 10; k++ {
r, _ := ns.Range([]byte("a"), []byte("z"), RangeOptions{Rev: k})
nkvss = append(nkvss, r.KVs)
}
cleanup(ns, b, tmpPath)
if !reflect.DeepEqual(nkvss, kvss) {
t.Errorf("#%d: kvs history = %+v, want %+v", i, nkvss, kvss)
}
}
}
func readGaugeInt(g prometheus.Gauge) int {
ch := make(chan prometheus.Metric, 1)
g.Collect(ch)
m := <-ch
mm := &dto.Metric{}
m.Write(mm)
return int(mm.GetGauge().GetValue())
}
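readGaugeInt works by draining the single sample a prometheus Gauge emits on Collect and decoding it into the protobuf dto.Metric type. Below is a minimal, self-contained sketch of the same pattern, assuming only the two client libraries this package already imports; the metric name is a placeholder, not part of the diff.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

func main() {
	g := prometheus.NewGauge(prometheus.GaugeOpts{Name: "example_keys_total"})
	g.Set(42)
	ch := make(chan prometheus.Metric, 1)
	g.Collect(ch) // a Gauge emits exactly one sample
	m := &dto.Metric{}
	(<-ch).Write(m)
	fmt.Println(int(m.GetGauge().GetValue())) // 42
}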
func TestKVSnapshot(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
wkvs := put3TestKVs(s)
newPath := "new_test"
f, err := os.Create(newPath)
if err != nil {
t.Fatal(err)
}
defer os.Remove(newPath)
snap := s.b.Snapshot()
defer snap.Close()
_, err = snap.WriteTo(f)
if err != nil {
t.Fatal(err)
}
f.Close()
ns := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer ns.Close()
r, err := ns.Range([]byte("a"), []byte("z"), RangeOptions{})
if err != nil {
t.Errorf("unexpect range error (%v)", err)
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("kvs = %+v, want %+v", r.KVs, wkvs)
}
if r.Rev != 4 {
t.Errorf("rev = %d, want %d", r.Rev, 4)
}
}
func TestWatchableKVWatch(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
wid, _ := w.Watch(0, []byte("foo"), []byte("fop"), 0)
wev := []mvccpb.Event{
{Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{
Key: []byte("foo"),
Value: []byte("bar"),
CreateRevision: 2,
ModRevision: 2,
Version: 1,
Lease: 1,
},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{
Key: []byte("foo1"),
Value: []byte("bar1"),
CreateRevision: 3,
ModRevision: 3,
Version: 1,
Lease: 2,
},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{
Key: []byte("foo1"),
Value: []byte("bar11"),
CreateRevision: 3,
ModRevision: 4,
Version: 2,
Lease: 3,
},
},
}
s.Put([]byte("foo"), []byte("bar"), 1)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[0]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[0])
}
case <-time.After(5 * time.Second):
// the CPU might be too slow for the goroutine to be scheduled in time
testutil.FatalStack(t, "failed to watch the event")
}
s.Put([]byte("foo1"), []byte("bar1"), 2)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[1]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[1])
}
case <-time.After(5 * time.Second):
testutil.FatalStack(t, "failed to watch the event")
}
w = s.NewWatchStream()
wid, _ = w.Watch(0, []byte("foo1"), []byte("foo2"), 3)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[1]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[1])
}
case <-time.After(5 * time.Second):
testutil.FatalStack(t, "failed to watch the event")
}
s.Put([]byte("foo1"), []byte("bar11"), 3)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[2]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[2])
}
case <-time.After(5 * time.Second):
testutil.FatalStack(t, "failed to watch the event")
}
}
func cleanup(s KV, b backend.Backend, path string) {
s.Close()
b.Close()
os.Remove(path)
}
func put3TestKVs(s KV) []mvccpb.KeyValue {
s.Put([]byte("foo"), []byte("bar"), 1)
s.Put([]byte("foo1"), []byte("bar1"), 2)
s.Put([]byte("foo2"), []byte("bar2"), 3)
return []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: 2, ModRevision: 2, Version: 1, Lease: 1},
{Key: []byte("foo1"), Value: []byte("bar1"), CreateRevision: 3, ModRevision: 3, Version: 1, Lease: 2},
{Key: []byte("foo2"), Value: []byte("bar2"), CreateRevision: 4, ModRevision: 4, Version: 1, Lease: 3},
}
}
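A note on the expected revisions in put3TestKVs: CreateRevision and ModRevision start at 2 because a fresh store initializes currentRev to 1 and every committed write bumps it before the change is recorded. A hedged, self-contained sketch of that bookkeeping (illustrative, not part of the diff):

package main

import "fmt"

func main() {
	currentRev := int64(1) // a fresh store starts at revision 1
	for _, val := range []string{"bar", "bar1", "bar2"} {
		currentRev++ // each committed write txn bumps currentRev first
		fmt.Printf("put %q at main revision %d\n", val, currentRev)
	}
}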

54
server/mvcc/kv_view.go Normal file

@@ -0,0 +1,54 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
)
type readView struct{ kv KV }
func (rv *readView) FirstRev() int64 {
tr := rv.kv.Read(traceutil.TODO())
defer tr.End()
return tr.FirstRev()
}
func (rv *readView) Rev() int64 {
tr := rv.kv.Read(traceutil.TODO())
defer tr.End()
return tr.Rev()
}
func (rv *readView) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
tr := rv.kv.Read(traceutil.TODO())
defer tr.End()
return tr.Range(key, end, ro)
}
type writeView struct{ kv KV }
func (wv *writeView) DeleteRange(key, end []byte) (n, rev int64) {
tw := wv.kv.Write(traceutil.TODO())
defer tw.End()
return tw.DeleteRange(key, end)
}
func (wv *writeView) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
tw := wv.kv.Write(traceutil.TODO())
defer tw.End()
return tw.Put(key, value, lease)
}

592
server/mvcc/kvstore.go Normal file

@@ -0,0 +1,592 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"context"
"errors"
"fmt"
"hash/crc32"
"math"
"sync"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/schedule"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
var (
keyBucketName = []byte("key")
metaBucketName = []byte("meta")
consistentIndexKeyName = []byte("consistent_index")
scheduledCompactKeyName = []byte("scheduledCompactRev")
finishedCompactKeyName = []byte("finishedCompactRev")
ErrCompacted = errors.New("mvcc: required revision has been compacted")
ErrFutureRev = errors.New("mvcc: required revision is a future revision")
ErrCanceled = errors.New("mvcc: watcher is canceled")
)
const (
// markedRevBytesLen is the byte length of marked revision.
// The first `revBytesLen` bytes represent a normal revision. The last
// byte is the mark.
markedRevBytesLen = revBytesLen + 1
markBytePosition = markedRevBytesLen - 1
markTombstone byte = 't'
)
var restoreChunkKeys = 10000 // non-const for testing
var defaultCompactBatchLimit = 1000
type StoreConfig struct {
CompactionBatchLimit int
}
type store struct {
ReadView
WriteView
cfg StoreConfig
// mu read locks for txns and write locks for non-txn store changes.
mu sync.RWMutex
ci cindex.ConsistentIndexer
b backend.Backend
kvindex index
le lease.Lessor
// revMuLock protects currentRev and compactMainRev.
// Locked at the end of a write txn and released after the write txn's lock is released.
// Locked before locking a read txn and released after the lock is taken.
revMu sync.RWMutex
// currentRev is the revision of the last completed transaction.
currentRev int64
// compactMainRev is the main revision of the last compaction.
compactMainRev int64
fifoSched schedule.Scheduler
stopc chan struct{}
lg *zap.Logger
}
// NewStore returns a new store. It is useful for creating a store inside
// the mvcc package. Externally it should only be used for testing.
func NewStore(lg *zap.Logger, b backend.Backend, le lease.Lessor, ci cindex.ConsistentIndexer, cfg StoreConfig) *store {
if lg == nil {
lg = zap.NewNop()
}
if cfg.CompactionBatchLimit == 0 {
cfg.CompactionBatchLimit = defaultCompactBatchLimit
}
s := &store{
cfg: cfg,
b: b,
ci: ci,
kvindex: newTreeIndex(lg),
le: le,
currentRev: 1,
compactMainRev: -1,
fifoSched: schedule.NewFIFOScheduler(),
stopc: make(chan struct{}),
lg: lg,
}
s.ReadView = &readView{s}
s.WriteView = &writeView{s}
if s.le != nil {
s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write(traceutil.TODO()) })
}
tx := s.b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket(keyBucketName)
tx.UnsafeCreateBucket(metaBucketName)
tx.Unlock()
s.b.ForceCommit()
s.mu.Lock()
defer s.mu.Unlock()
if err := s.restore(); err != nil {
// TODO: return the error instead of panic here?
panic("failed to recover store from backend")
}
return s
}
func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
if ctx == nil || ctx.Err() != nil {
select {
case <-s.stopc:
default:
// Fix a deadlock in mvcc; for more information, refer to pr 11817.
// s.stopc is only updated in restore operation, which is called by apply
// snapshot call, compaction and apply snapshot requests are serialized by
// raft, and do not happen at the same time.
s.mu.Lock()
f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
s.fifoSched.Schedule(f)
s.mu.Unlock()
}
return
}
close(ch)
}
func (s *store) Hash() (hash uint32, revision int64, err error) {
// TODO: hash and revision could be inconsistent, one possible fix is to add s.revMu.RLock() at the beginning of function, which is costly
start := time.Now()
s.b.ForceCommit()
h, err := s.b.Hash(DefaultIgnores)
hashSec.Observe(time.Since(start).Seconds())
return h, s.currentRev, err
}
func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
start := time.Now()
s.mu.RLock()
s.revMu.RLock()
compactRev, currentRev = s.compactMainRev, s.currentRev
s.revMu.RUnlock()
if rev > 0 && rev <= compactRev {
s.mu.RUnlock()
return 0, 0, compactRev, ErrCompacted
} else if rev > 0 && rev > currentRev {
s.mu.RUnlock()
return 0, currentRev, 0, ErrFutureRev
}
if rev == 0 {
rev = currentRev
}
keep := s.kvindex.Keep(rev)
tx := s.b.ReadTx()
tx.RLock()
defer tx.RUnlock()
s.mu.RUnlock()
upper := revision{main: rev + 1}
lower := revision{main: compactRev + 1}
h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
h.Write(keyBucketName)
err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
kr := bytesToRev(k)
if !upper.GreaterThan(kr) {
return nil
}
// skip revisions that are scheduled for deletion
// due to compaction; if there is no keep set, skip nothing.
if lower.GreaterThan(kr) && len(keep) > 0 {
if _, ok := keep[kr]; !ok {
return nil
}
}
h.Write(k)
h.Write(v)
return nil
})
hash = h.Sum32()
hashRevSec.Observe(time.Since(start).Seconds())
return hash, currentRev, compactRev, err
}
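HashByRev folds the key bucket name and every surviving key/value pair into a Castagnoli CRC32. A self-contained sketch of the same hashing scheme (the sample pairs are placeholders):

package main

import (
	"fmt"
	"hash/crc32"
)

func main() {
	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
	h.Write([]byte("key")) // the bucket name goes in first, as above
	for _, kv := range [][2]string{{"k1", "v1"}, {"k2", "v2"}} {
		h.Write([]byte(kv[0])) // then each surviving key...
		h.Write([]byte(kv[1])) // ...followed by its value
	}
	fmt.Printf("%08x\n", h.Sum32())
}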
func (s *store) updateCompactRev(rev int64) (<-chan struct{}, error) {
s.revMu.Lock()
if rev <= s.compactMainRev {
ch := make(chan struct{})
f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
s.fifoSched.Schedule(f)
s.revMu.Unlock()
return ch, ErrCompacted
}
if rev > s.currentRev {
s.revMu.Unlock()
return nil, ErrFutureRev
}
s.compactMainRev = rev
rbytes := newRevBytes()
revToBytes(revision{main: rev}, rbytes)
tx := s.b.BatchTx()
tx.Lock()
tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
tx.Unlock()
// ensure that desired compaction is persisted
s.b.ForceCommit()
s.revMu.Unlock()
return nil, nil
}
func (s *store) compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error) {
ch := make(chan struct{})
var j = func(ctx context.Context) {
if ctx.Err() != nil {
s.compactBarrier(ctx, ch)
return
}
start := time.Now()
keep := s.kvindex.Compact(rev)
indexCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond))
if !s.scheduleCompaction(rev, keep) {
s.compactBarrier(nil, ch)
return
}
close(ch)
}
s.fifoSched.Schedule(j)
trace.Step("schedule compaction")
return ch, nil
}
func (s *store) compactLockfree(rev int64) (<-chan struct{}, error) {
ch, err := s.updateCompactRev(rev)
if err != nil {
return ch, err
}
return s.compact(traceutil.TODO(), rev)
}
func (s *store) Compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error) {
s.mu.Lock()
ch, err := s.updateCompactRev(rev)
trace.Step("check and update compact revision")
if err != nil {
s.mu.Unlock()
return ch, err
}
s.mu.Unlock()
return s.compact(trace, rev)
}
// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}
func init() {
DefaultIgnores = map[backend.IgnoreKey]struct{}{
// consistent index might be changed due to v2 internal sync, which
// is not controllable by the user.
{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
}
}
func (s *store) Commit() {
s.mu.Lock()
defer s.mu.Unlock()
tx := s.b.BatchTx()
tx.Lock()
s.saveIndex(tx)
tx.Unlock()
s.b.ForceCommit()
}
func (s *store) Restore(b backend.Backend) error {
s.mu.Lock()
defer s.mu.Unlock()
close(s.stopc)
s.fifoSched.Stop()
s.b = b
s.kvindex = newTreeIndex(s.lg)
{
// During restore the metrics might report 'special' values
s.revMu.Lock()
s.currentRev = 1
s.compactMainRev = -1
s.revMu.Unlock()
}
s.fifoSched = schedule.NewFIFOScheduler()
s.stopc = make(chan struct{})
s.ci.SetBatchTx(b.BatchTx())
s.ci.SetConsistentIndex(0)
return s.restore()
}
func (s *store) restore() error {
s.setupMetricsReporter()
min, max := newRevBytes(), newRevBytes()
revToBytes(revision{main: 1}, min)
revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)
keyToLease := make(map[string]lease.LeaseID)
// restore index
tx := s.b.BatchTx()
tx.Lock()
_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
if len(finishedCompactBytes) != 0 {
s.revMu.Lock()
s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
s.lg.Info(
"restored last compact revision",
zap.String("meta-bucket-name", string(metaBucketName)),
zap.String("meta-bucket-name-key", string(finishedCompactKeyName)),
zap.Int64("restored-compact-revision", s.compactMainRev),
)
s.revMu.Unlock()
}
_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
scheduledCompact := int64(0)
if len(scheduledCompactBytes) != 0 {
scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
}
// index keys concurrently as they're loaded in from tx
keysGauge.Set(0)
rkvc, revc := restoreIntoIndex(s.lg, s.kvindex)
for {
keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
if len(keys) == 0 {
break
}
// rkvc blocks if the total pending keys exceeds the restore
// chunk size to keep keys from consuming too much memory.
restoreChunk(s.lg, rkvc, keys, vals, keyToLease)
if len(keys) < restoreChunkKeys {
// partial set implies final set
break
}
// next set begins after where this one ended
newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
newMin.sub++
revToBytes(newMin, min)
}
close(rkvc)
{
s.revMu.Lock()
s.currentRev = <-revc
// keys in the range [compacted revision - N, compacted revision] might all have been deleted due to compaction.
// In that case the correct current revision is the compaction revision, not the largest revision
// we have seen.
if s.currentRev < s.compactMainRev {
s.currentRev = s.compactMainRev
}
s.revMu.Unlock()
}
if scheduledCompact <= s.compactMainRev {
scheduledCompact = 0
}
for key, lid := range keyToLease {
if s.le == nil {
tx.Unlock()
panic("no lessor to attach lease")
}
err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
if err != nil {
s.lg.Error(
"failed to attach a lease",
zap.String("lease-id", fmt.Sprintf("%016x", lid)),
zap.Error(err),
)
}
}
tx.Unlock()
if scheduledCompact != 0 {
if _, err := s.compactLockfree(scheduledCompact); err != nil {
s.lg.Warn("compaction encountered error", zap.Error(err))
}
s.lg.Info(
"resume scheduled compaction",
zap.String("meta-bucket-name", string(metaBucketName)),
zap.String("meta-bucket-name-key", string(scheduledCompactKeyName)),
zap.Int64("scheduled-compact-revision", scheduledCompact),
)
}
return nil
}
type revKeyValue struct {
key []byte
kv mvccpb.KeyValue
kstr string
}
func restoreIntoIndex(lg *zap.Logger, idx index) (chan<- revKeyValue, <-chan int64) {
rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
go func() {
currentRev := int64(1)
defer func() { revc <- currentRev }()
// restore the tree index from streaming the unordered index.
kiCache := make(map[string]*keyIndex, restoreChunkKeys)
for rkv := range rkvc {
ki, ok := kiCache[rkv.kstr]
// purge a few kiCache entries when the cache is full but this key is still missing
if !ok && len(kiCache) >= restoreChunkKeys {
i := 10
for k := range kiCache {
delete(kiCache, k)
if i--; i == 0 {
break
}
}
}
// cache miss, fetch from tree index if there
if !ok {
ki = &keyIndex{key: rkv.kv.Key}
if idxKey := idx.KeyIndex(ki); idxKey != nil {
kiCache[rkv.kstr], ki = idxKey, idxKey
ok = true
}
}
rev := bytesToRev(rkv.key)
currentRev = rev.main
if ok {
if isTombstone(rkv.key) {
if err := ki.tombstone(lg, rev.main, rev.sub); err != nil {
lg.Warn("tombstone encountered error", zap.Error(err))
}
continue
}
ki.put(lg, rev.main, rev.sub)
} else if !isTombstone(rkv.key) {
ki.restore(lg, revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
idx.Insert(ki)
kiCache[rkv.kstr] = ki
}
}
}()
return rkvc, revc
}
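restoreIntoIndex is a small producer/consumer pipeline: callers stream revKeyValues into a buffered work channel, and the goroutine sends the final revision on a 1-buffered result channel once the work channel is closed. A minimal sketch of the same shape (all names illustrative):

package main

import "fmt"

func pipeline() (chan<- int, <-chan int) {
	workc, resc := make(chan int, 4), make(chan int, 1)
	go func() {
		last := 0
		defer func() { resc <- last }() // report the final value exactly once
		for v := range workc {          // drain until the producer closes workc
			last = v
		}
	}()
	return workc, resc
}

func main() {
	workc, resc := pipeline()
	for i := 1; i <= 3; i++ {
		workc <- i
	}
	close(workc)
	fmt.Println(<-resc) // 3
}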
func restoreChunk(lg *zap.Logger, kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
for i, key := range keys {
rkv := revKeyValue{key: key}
if err := rkv.kv.Unmarshal(vals[i]); err != nil {
lg.Fatal("failed to unmarshal mvccpb.KeyValue", zap.Error(err))
}
rkv.kstr = string(rkv.kv.Key)
if isTombstone(key) {
delete(keyToLease, rkv.kstr)
} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
keyToLease[rkv.kstr] = lid
} else {
delete(keyToLease, rkv.kstr)
}
kvc <- rkv
}
}
func (s *store) Close() error {
close(s.stopc)
s.fifoSched.Stop()
return nil
}
func (s *store) saveIndex(tx backend.BatchTx) {
if s.ci != nil {
s.ci.UnsafeSave(tx)
}
}
func (s *store) ConsistentIndex() uint64 {
if s.ci != nil {
return s.ci.ConsistentIndex()
}
return 0
}
func (s *store) setupMetricsReporter() {
b := s.b
reportDbTotalSizeInBytesMu.Lock()
reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
reportDbTotalSizeInBytesMu.Unlock()
reportDbTotalSizeInBytesDebugMu.Lock()
reportDbTotalSizeInBytesDebug = func() float64 { return float64(b.Size()) }
reportDbTotalSizeInBytesDebugMu.Unlock()
reportDbTotalSizeInUseInBytesMu.Lock()
reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
reportDbTotalSizeInUseInBytesMu.Unlock()
reportDbOpenReadTxNMu.Lock()
reportDbOpenReadTxN = func() float64 { return float64(b.OpenReadTxN()) }
reportDbOpenReadTxNMu.Unlock()
reportCurrentRevMu.Lock()
reportCurrentRev = func() float64 {
s.revMu.RLock()
defer s.revMu.RUnlock()
return float64(s.currentRev)
}
reportCurrentRevMu.Unlock()
reportCompactRevMu.Lock()
reportCompactRev = func() float64 {
s.revMu.RLock()
defer s.revMu.RUnlock()
return float64(s.compactMainRev)
}
reportCompactRevMu.Unlock()
}
// appendMarkTombstone appends tombstone mark to normal revision bytes.
func appendMarkTombstone(lg *zap.Logger, b []byte) []byte {
if len(b) != revBytesLen {
lg.Panic(
"cannot append tombstone mark to non-normal revision bytes",
zap.Int("expected-revision-bytes-size", revBytesLen),
zap.Int("given-revision-bytes-size", len(b)),
)
}
return append(b, markTombstone)
}
// isTombstone checks whether the revision bytes represent a tombstone.
func isTombstone(b []byte) bool {
return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}

165
server/mvcc/kvstore_bench_test.go Normal file

@@ -0,0 +1,165 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"testing"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func BenchmarkStorePut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.Put(keys[i], vals[i], lease.NoLease)
}
}
func BenchmarkStoreRangeKey1(b *testing.B) { benchmarkStoreRange(b, 1) }
func BenchmarkStoreRangeKey100(b *testing.B) { benchmarkStoreRange(b, 100) }
func benchmarkStoreRange(b *testing.B, n int) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// 64 byte key/val
keys, val := createBytesSlice(64, n), createBytesSlice(64, 1)
for i := range keys {
s.Put(keys[i], val[0], lease.NoLease)
}
// Force into boltdb tx instead of backend read tx.
s.Commit()
var begin, end []byte
if n == 1 {
begin, end = keys[0], nil
} else {
begin, end = []byte{}, []byte{}
}
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.Range(begin, end, RangeOptions{})
}
}
func BenchmarkConsistentIndex(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
tx := s.b.BatchTx()
tx.Lock()
s.saveIndex(tx)
tx.Unlock()
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.ConsistentIndex()
}
}
// BenchmarkStorePutUpdate is the same as BenchmarkStorePut, but repeatedly updates a single key
func BenchmarkStorePutUpdate(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
keys := createBytesSlice(64, 1)
vals := createBytesSlice(1024, 1)
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.Put(keys[0], vals[0], lease.NoLease)
}
}
// BenchmarkStoreTxnPut benchmarks the Put operation
// with transaction begin and end, where transaction involves
// some synchronization operations, such as mutex locking.
func BenchmarkStoreTxnPut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
txn := s.Write(traceutil.TODO())
txn.Put(keys[i], vals[i], lease.NoLease)
txn.End()
}
}
// benchmarkStoreRestore benchmarks the restore operation
func benchmarkStoreRestore(revsPerKey int, b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
// use closure to capture 's' to pick up the reassignment
defer func() { cleanup(s, be, tmpPath) }()
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
for i := 0; i < b.N; i++ {
for j := 0; j < revsPerKey; j++ {
txn := s.Write(traceutil.TODO())
txn.Put(keys[i], vals[i], lease.NoLease)
txn.End()
}
}
s.Close()
b.ReportAllocs()
b.ResetTimer()
s = NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
}
func BenchmarkStoreRestoreRevs1(b *testing.B) {
benchmarkStoreRestore(1, b)
}
func BenchmarkStoreRestoreRevs10(b *testing.B) {
benchmarkStoreRestore(10, b)
}
func BenchmarkStoreRestoreRevs20(b *testing.B) {
benchmarkStoreRestore(20, b)
}

77
server/mvcc/kvstore_compaction.go Normal file

@@ -0,0 +1,77 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"encoding/binary"
"time"
"go.uber.org/zap"
)
func (s *store) scheduleCompaction(compactMainRev int64, keep map[revision]struct{}) bool {
totalStart := time.Now()
defer func() { dbCompactionTotalMs.Observe(float64(time.Since(totalStart) / time.Millisecond)) }()
keyCompactions := 0
defer func() { dbCompactionKeysCounter.Add(float64(keyCompactions)) }()
defer func() { dbCompactionLast.Set(float64(time.Now().Unix())) }()
end := make([]byte, 8)
binary.BigEndian.PutUint64(end, uint64(compactMainRev+1))
last := make([]byte, 8+1+8)
for {
var rev revision
start := time.Now()
tx := s.b.BatchTx()
tx.Lock()
keys, _ := tx.UnsafeRange(keyBucketName, last, end, int64(s.cfg.CompactionBatchLimit))
for _, key := range keys {
rev = bytesToRev(key)
if _, ok := keep[rev]; !ok {
tx.UnsafeDelete(keyBucketName, key)
keyCompactions++
}
}
if len(keys) < s.cfg.CompactionBatchLimit {
rbytes := make([]byte, 8+1+8)
revToBytes(revision{main: compactMainRev}, rbytes)
tx.UnsafePut(metaBucketName, finishedCompactKeyName, rbytes)
tx.Unlock()
s.lg.Info(
"finished scheduled compaction",
zap.Int64("compact-revision", compactMainRev),
zap.Duration("took", time.Since(totalStart)),
)
return true
}
// update last
revToBytes(revision{main: rev.main, sub: rev.sub + 1}, last)
tx.Unlock()
// Immediately commit the compaction deletes instead of letting them accumulate in the write buffer
s.b.ForceCommit()
dbCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond))
select {
case <-time.After(10 * time.Millisecond):
case <-s.stopc:
return false
}
}
}
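scheduleCompaction deletes keys in batches of CompactionBatchLimit, force-commits each batch, and pauses 10ms between batches so readers and writers can make progress; a partial batch implies the final one. A hedged, self-contained sketch of that batch-and-yield loop (compactInBatches is illustrative, not part of the diff):

package main

import (
	"fmt"
	"time"
)

func compactInBatches(total, limit int, stopc <-chan struct{}) bool {
	for deleted := 0; ; {
		n := limit
		if remaining := total - deleted; remaining < limit {
			n = remaining // last, partial batch
		}
		deleted += n // stand-in for one batched delete + commit
		if n < limit {
			return true // a partial batch implies the final batch
		}
		select {
		case <-time.After(10 * time.Millisecond): // yield to other work
		case <-stopc:
			return false // store is closing; abandon compaction
		}
	}
}

func main() {
	fmt.Println(compactInBatches(25, 10, nil)) // true after 3 batches
}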

137
server/mvcc/kvstore_compaction_test.go Normal file

@@ -0,0 +1,137 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"os"
"reflect"
"testing"
"time"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func TestScheduleCompaction(t *testing.T) {
revs := []revision{{1, 0}, {2, 0}, {3, 0}}
tests := []struct {
rev int64
keep map[revision]struct{}
wrevs []revision
}{
// compact at 1 and discard all history
{
1,
nil,
revs[1:],
},
// compact at 3 and discard all history
{
3,
nil,
nil,
},
// compact at 1 and keep history one step earlier
{
1,
map[revision]struct{}{
{main: 1}: {},
},
revs,
},
// compact at 3 and keep history two steps earlier
{
3,
map[revision]struct{}{
{main: 2}: {},
{main: 3}: {},
},
revs[1:],
},
}
for i, tt := range tests {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
tx := s.b.BatchTx()
tx.Lock()
ibytes := newRevBytes()
for _, rev := range revs {
revToBytes(rev, ibytes)
tx.UnsafePut(keyBucketName, ibytes, []byte("bar"))
}
tx.Unlock()
s.scheduleCompaction(tt.rev, tt.keep)
tx.Lock()
for _, rev := range tt.wrevs {
revToBytes(rev, ibytes)
keys, _ := tx.UnsafeRange(keyBucketName, ibytes, nil, 0)
if len(keys) != 1 {
t.Errorf("#%d: range on %v = %d, want 1", i, rev, len(keys))
}
}
_, vals := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
revToBytes(revision{main: tt.rev}, ibytes)
if w := [][]byte{ibytes}; !reflect.DeepEqual(vals, w) {
t.Errorf("#%d: vals on %v = %+v, want %+v", i, finishedCompactKeyName, vals, w)
}
tx.Unlock()
cleanup(s, b, tmpPath)
}
}
func TestCompactAllAndRestore(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s0 := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer os.Remove(tmpPath)
s0.Put([]byte("foo"), []byte("bar"), lease.NoLease)
s0.Put([]byte("foo"), []byte("bar1"), lease.NoLease)
s0.Put([]byte("foo"), []byte("bar2"), lease.NoLease)
s0.DeleteRange([]byte("foo"), nil)
rev := s0.Rev()
// compact all keys
done, err := s0.Compact(traceutil.TODO(), rev)
if err != nil {
t.Fatal(err)
}
select {
case <-done:
case <-time.After(10 * time.Second):
t.Fatal("timeout waiting for compaction to finish")
}
err = s0.Close()
if err != nil {
t.Fatal(err)
}
s1 := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
if s1.Rev() != rev {
t.Errorf("rev = %v, want %v", s1.Rev(), rev)
}
_, err = s1.Range([]byte("foo"), nil, RangeOptions{})
if err != nil {
t.Errorf("unexpect range error %v", err)
}
}

1000
server/mvcc/kvstore_test.go Normal file

File diff suppressed because it is too large

289
server/mvcc/kvstore_txn.go Normal file

@@ -0,0 +1,289 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
type storeTxnRead struct {
s *store
tx backend.ReadTx
firstRev int64
rev int64
trace *traceutil.Trace
}
func (s *store) Read(trace *traceutil.Trace) TxnRead {
s.mu.RLock()
s.revMu.RLock()
// backend holds b.readTx.RLock() only when creating the concurrentReadTx. After
// ConcurrentReadTx is created, it will not block write transaction.
tx := s.b.ConcurrentReadTx()
tx.RLock() // RLock is no-op. concurrentReadTx does not need to be locked after it is created.
firstRev, rev := s.compactMainRev, s.currentRev
s.revMu.RUnlock()
return newMetricsTxnRead(&storeTxnRead{s, tx, firstRev, rev, trace})
}
func (tr *storeTxnRead) FirstRev() int64 { return tr.firstRev }
func (tr *storeTxnRead) Rev() int64 { return tr.rev }
func (tr *storeTxnRead) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
return tr.rangeKeys(key, end, tr.Rev(), ro)
}
func (tr *storeTxnRead) End() {
tr.tx.RUnlock() // RUnlock signals the end of concurrentReadTx.
tr.s.mu.RUnlock()
}
type storeTxnWrite struct {
storeTxnRead
tx backend.BatchTx
// beginRev is the revision where the txn begins; it will write to the next revision.
beginRev int64
changes []mvccpb.KeyValue
}
func (s *store) Write(trace *traceutil.Trace) TxnWrite {
s.mu.RLock()
tx := s.b.BatchTx()
tx.Lock()
tw := &storeTxnWrite{
storeTxnRead: storeTxnRead{s, tx, 0, 0, trace},
tx: tx,
beginRev: s.currentRev,
changes: make([]mvccpb.KeyValue, 0, 4),
}
return newMetricsTxnWrite(tw)
}
func (tw *storeTxnWrite) Rev() int64 { return tw.beginRev }
func (tw *storeTxnWrite) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
rev := tw.beginRev
if len(tw.changes) > 0 {
rev++
}
return tw.rangeKeys(key, end, rev, ro)
}
func (tw *storeTxnWrite) DeleteRange(key, end []byte) (int64, int64) {
if n := tw.deleteRange(key, end); n != 0 || len(tw.changes) > 0 {
return n, tw.beginRev + 1
}
return 0, tw.beginRev
}
func (tw *storeTxnWrite) Put(key, value []byte, lease lease.LeaseID) int64 {
tw.put(key, value, lease)
return tw.beginRev + 1
}
func (tw *storeTxnWrite) End() {
// only update index if the txn modifies the mvcc state.
if len(tw.changes) != 0 {
tw.s.saveIndex(tw.tx)
// hold revMu lock to prevent new read txns from opening until writeback.
tw.s.revMu.Lock()
tw.s.currentRev++
}
tw.tx.Unlock()
if len(tw.changes) != 0 {
tw.s.revMu.Unlock()
}
tw.s.mu.RUnlock()
}
func (tr *storeTxnRead) rangeKeys(key, end []byte, curRev int64, ro RangeOptions) (*RangeResult, error) {
rev := ro.Rev
if rev > curRev {
return &RangeResult{KVs: nil, Count: -1, Rev: curRev}, ErrFutureRev
}
if rev <= 0 {
rev = curRev
}
if rev < tr.s.compactMainRev {
return &RangeResult{KVs: nil, Count: -1, Rev: 0}, ErrCompacted
}
if ro.Count {
total := tr.s.kvindex.CountRevisions(key, end, rev, int(ro.Limit))
tr.trace.Step("count revisions from in-memory index tree")
return &RangeResult{KVs: nil, Count: total, Rev: curRev}, nil
}
revpairs := tr.s.kvindex.Revisions(key, end, rev, int(ro.Limit))
tr.trace.Step("range keys from in-memory index tree")
if len(revpairs) == 0 {
return &RangeResult{KVs: nil, Count: 0, Rev: curRev}, nil
}
limit := int(ro.Limit)
if limit <= 0 || limit > len(revpairs) {
limit = len(revpairs)
}
kvs := make([]mvccpb.KeyValue, limit)
revBytes := newRevBytes()
for i, revpair := range revpairs[:len(kvs)] {
revToBytes(revpair, revBytes)
_, vs := tr.tx.UnsafeRange(keyBucketName, revBytes, nil, 0)
if len(vs) != 1 {
tr.s.lg.Fatal(
"range failed to find revision pair",
zap.Int64("revision-main", revpair.main),
zap.Int64("revision-sub", revpair.sub),
)
}
if err := kvs[i].Unmarshal(vs[0]); err != nil {
tr.s.lg.Fatal(
"failed to unmarshal mvccpb.KeyValue",
zap.Error(err),
)
}
}
tr.trace.Step("range keys from bolt db")
return &RangeResult{KVs: kvs, Count: len(revpairs), Rev: curRev}, nil
}
func (tw *storeTxnWrite) put(key, value []byte, leaseID lease.LeaseID) {
rev := tw.beginRev + 1
c := rev
oldLease := lease.NoLease
// if the key existed before, reuse its previous create revision and
// fetch its previous leaseID
_, created, ver, err := tw.s.kvindex.Get(key, rev)
if err == nil {
c = created.main
oldLease = tw.s.le.GetLease(lease.LeaseItem{Key: string(key)})
}
tw.trace.Step("get key's previous created_revision and leaseID")
ibytes := newRevBytes()
idxRev := revision{main: rev, sub: int64(len(tw.changes))}
revToBytes(idxRev, ibytes)
ver = ver + 1
kv := mvccpb.KeyValue{
Key: key,
Value: value,
CreateRevision: c,
ModRevision: rev,
Version: ver,
Lease: int64(leaseID),
}
d, err := kv.Marshal()
if err != nil {
tw.storeTxnRead.s.lg.Fatal(
"failed to marshal mvccpb.KeyValue",
zap.Error(err),
)
}
tw.trace.Step("marshal mvccpb.KeyValue")
tw.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
tw.s.kvindex.Put(key, idxRev)
tw.changes = append(tw.changes, kv)
tw.trace.Step("store kv pair into bolt db")
if oldLease != lease.NoLease {
if tw.s.le == nil {
panic("no lessor to detach lease")
}
err = tw.s.le.Detach(oldLease, []lease.LeaseItem{{Key: string(key)}})
if err != nil {
tw.storeTxnRead.s.lg.Error(
"failed to detach old lease from a key",
zap.Error(err),
)
}
}
if leaseID != lease.NoLease {
if tw.s.le == nil {
panic("no lessor to attach lease")
}
err = tw.s.le.Attach(leaseID, []lease.LeaseItem{{Key: string(key)}})
if err != nil {
panic("unexpected error from lease Attach")
}
}
tw.trace.Step("attach lease to kv pair")
}
func (tw *storeTxnWrite) deleteRange(key, end []byte) int64 {
rrev := tw.beginRev
if len(tw.changes) > 0 {
rrev++
}
keys, _ := tw.s.kvindex.Range(key, end, rrev)
if len(keys) == 0 {
return 0
}
for _, key := range keys {
tw.delete(key)
}
return int64(len(keys))
}
func (tw *storeTxnWrite) delete(key []byte) {
ibytes := newRevBytes()
idxRev := revision{main: tw.beginRev + 1, sub: int64(len(tw.changes))}
revToBytes(idxRev, ibytes)
ibytes = appendMarkTombstone(tw.storeTxnRead.s.lg, ibytes)
kv := mvccpb.KeyValue{Key: key}
d, err := kv.Marshal()
if err != nil {
tw.storeTxnRead.s.lg.Fatal(
"failed to marshal mvccpb.KeyValue",
zap.Error(err),
)
}
tw.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
err = tw.s.kvindex.Tombstone(key, idxRev)
if err != nil {
tw.storeTxnRead.s.lg.Fatal(
"failed to tombstone an existing key",
zap.String("key", string(key)),
zap.Error(err),
)
}
tw.changes = append(tw.changes, kv)
item := lease.LeaseItem{Key: string(key)}
leaseID := tw.s.le.GetLease(item)
if leaseID != lease.NoLease {
err = tw.s.le.Detach(leaseID, []lease.LeaseItem{item})
if err != nil {
tw.storeTxnRead.s.lg.Error(
"failed to detach old lease from a key",
zap.Error(err),
)
}
}
}
func (tw *storeTxnWrite) Changes() []mvccpb.KeyValue { return tw.changes }
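rangeKeys above is a two-level lookup: the in-memory index resolves a key range at a revision into (main, sub) pairs, and only those pairs are then fetched from bolt. A hedged sketch of that shape with plain maps standing in for both levels (all names illustrative):

package main

import "fmt"

type rev struct{ main, sub int64 }

func main() {
	// level 1: in-memory index, key -> revisions at which it changed
	index := map[string][]rev{"foo": {{2, 0}, {3, 0}}}
	// level 2: persistent store, revision -> encoded value
	values := map[rev][]byte{{2, 0}: []byte("bar"), {3, 0}: []byte("bar1")}

	// range "foo" at its latest revision: resolve revisions in memory,
	// then fetch the value, mirroring kvindex.Revisions + tx.UnsafeRange.
	revs := index["foo"]
	latest := revs[len(revs)-1]
	fmt.Printf("foo@%d = %s\n", latest.main, values[latest])
}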

354
server/mvcc/metrics.go Normal file

@@ -0,0 +1,354 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
)
var (
rangeCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "range_total",
Help: "Total number of ranges seen by this member.",
})
rangeCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "range_total",
Help: "Total number of ranges seen by this member.",
})
putCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "put_total",
Help: "Total number of puts seen by this member.",
})
// TODO: remove in 3.5 release
putCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "put_total",
Help: "Total number of puts seen by this member.",
})
deleteCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "delete_total",
Help: "Total number of deletes seen by this member.",
})
// TODO: remove in 3.5 release
deleteCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "delete_total",
Help: "Total number of deletes seen by this member.",
})
txnCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "txn_total",
Help: "Total number of txns seen by this member.",
})
txnCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "txn_total",
Help: "Total number of txns seen by this member.",
})
keysGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "keys_total",
Help: "Total number of keys.",
})
watchStreamGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "watch_stream_total",
Help: "Total number of watch streams.",
})
watcherGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "watcher_total",
Help: "Total number of watchers.",
})
slowWatcherGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "slow_watcher_total",
Help: "Total number of unsynced slow watchers.",
})
totalEventsCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "events_total",
Help: "Total number of events sent by this member.",
})
pendingEventsGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "pending_events_total",
Help: "Total number of pending events to be sent.",
})
indexCompactionPauseMs = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "index_compaction_pause_duration_milliseconds",
Help: "Bucketed histogram of index compaction pause duration.",
// lowest bucket start of upper bound 0.5 ms with factor 2
// highest bucket start of 0.5 ms * 2^13 == 4.096 sec
Buckets: prometheus.ExponentialBuckets(0.5, 2, 14),
})
dbCompactionPauseMs = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_pause_duration_milliseconds",
Help: "Bucketed histogram of db compaction pause duration.",
// lowest bucket start of upper bound 1 ms with factor 2
// highest bucket start of 1 ms * 2^12 == 4.096 sec
Buckets: prometheus.ExponentialBuckets(1, 2, 13),
})
dbCompactionTotalMs = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_total_duration_milliseconds",
Help: "Bucketed histogram of db compaction total duration.",
// lowest bucket start of upper bound 100 ms with factor 2
// highest bucket start of 100 ms * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(100, 2, 14),
})
dbCompactionLast = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_last",
Help: "The unix time of the last db compaction. Resets to 0 on start.",
})
dbCompactionKeysCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_keys_total",
Help: "Total number of db keys compacted.",
})
dbTotalSize = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "db_total_size_in_bytes",
Help: "Total size of the underlying database physically allocated in bytes.",
},
func() float64 {
reportDbTotalSizeInBytesMu.RLock()
defer reportDbTotalSizeInBytesMu.RUnlock()
return reportDbTotalSizeInBytes()
},
)
// overridden by mvcc initialization
reportDbTotalSizeInBytesMu sync.RWMutex
reportDbTotalSizeInBytes = func() float64 { return 0 }
// TODO: remove this in v3.5
dbTotalSizeDebug = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_total_size_in_bytes",
Help: "Total size of the underlying database physically allocated in bytes.",
},
func() float64 {
reportDbTotalSizeInBytesDebugMu.RLock()
defer reportDbTotalSizeInBytesDebugMu.RUnlock()
return reportDbTotalSizeInBytesDebug()
},
)
// overridden by mvcc initialization
reportDbTotalSizeInBytesDebugMu sync.RWMutex
reportDbTotalSizeInBytesDebug = func() float64 { return 0 }
dbTotalSizeInUse = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "db_total_size_in_use_in_bytes",
Help: "Total size of the underlying database logically in use in bytes.",
},
func() float64 {
reportDbTotalSizeInUseInBytesMu.RLock()
defer reportDbTotalSizeInUseInBytesMu.RUnlock()
return reportDbTotalSizeInUseInBytes()
},
)
// overridden by mvcc initialization
reportDbTotalSizeInUseInBytesMu sync.RWMutex
reportDbTotalSizeInUseInBytes = func() float64 { return 0 }
dbOpenReadTxN = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "db_open_read_transactions",
Help: "The number of currently open read transactions",
},
func() float64 {
reportDbOpenReadTxNMu.RLock()
defer reportDbOpenReadTxNMu.RUnlock()
return reportDbOpenReadTxN()
},
)
// overridden by mvcc initialization
reportDbOpenReadTxNMu sync.RWMutex
reportDbOpenReadTxN = func() float64 { return 0 }
hashSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "hash_duration_seconds",
Help: "The latency distribution of storage hash operation.",
// 100 MB usually takes 100 ms, so start with 10 MB of 10 ms
// lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
// highest bucket start of 0.01 sec * 2^14 == 163.84 sec
Buckets: prometheus.ExponentialBuckets(.01, 2, 15),
})
hashRevSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "hash_rev_duration_seconds",
Help: "The latency distribution of storage hash by revision operation.",
// 100 MB usually takes 100 ms, so start with 10 MB of 10 ms
// lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
// highest bucket start of 0.01 sec * 2^14 == 163.84 sec
Buckets: prometheus.ExponentialBuckets(.01, 2, 15),
})
currentRev = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "current_revision",
Help: "The current revision of store.",
},
func() float64 {
reportCurrentRevMu.RLock()
defer reportCurrentRevMu.RUnlock()
return reportCurrentRev()
},
)
// overridden by mvcc initialization
reportCurrentRevMu sync.RWMutex
reportCurrentRev = func() float64 { return 0 }
compactRev = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "compact_revision",
Help: "The revision of the last compaction in store.",
},
func() float64 {
reportCompactRevMu.RLock()
defer reportCompactRevMu.RUnlock()
return reportCompactRev()
},
)
// overridden by mvcc initialization
reportCompactRevMu sync.RWMutex
reportCompactRev = func() float64 { return 0 }
totalPutSizeGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "total_put_size_in_bytes",
Help: "The total size of put kv pairs seen by this member.",
})
)
func init() {
prometheus.MustRegister(rangeCounter)
prometheus.MustRegister(rangeCounterDebug)
prometheus.MustRegister(putCounter)
prometheus.MustRegister(putCounterDebug)
prometheus.MustRegister(deleteCounter)
prometheus.MustRegister(deleteCounterDebug)
prometheus.MustRegister(txnCounter)
prometheus.MustRegister(txnCounterDebug)
prometheus.MustRegister(keysGauge)
prometheus.MustRegister(watchStreamGauge)
prometheus.MustRegister(watcherGauge)
prometheus.MustRegister(slowWatcherGauge)
prometheus.MustRegister(totalEventsCounter)
prometheus.MustRegister(pendingEventsGauge)
prometheus.MustRegister(indexCompactionPauseMs)
prometheus.MustRegister(dbCompactionPauseMs)
prometheus.MustRegister(dbCompactionTotalMs)
prometheus.MustRegister(dbCompactionLast)
prometheus.MustRegister(dbCompactionKeysCounter)
prometheus.MustRegister(dbTotalSize)
prometheus.MustRegister(dbTotalSizeDebug)
prometheus.MustRegister(dbTotalSizeInUse)
prometheus.MustRegister(dbOpenReadTxN)
prometheus.MustRegister(hashSec)
prometheus.MustRegister(hashRevSec)
prometheus.MustRegister(currentRev)
prometheus.MustRegister(compactRev)
prometheus.MustRegister(totalPutSizeGauge)
}
// ReportEventReceived reports that an event was received.
// This function should be called when an external system receives an
// event from mvcc.Watcher.
func ReportEventReceived(n int) {
pendingEventsGauge.Sub(float64(n))
totalEventsCounter.Add(float64(n))
}

71
server/mvcc/metrics_txn.go Normal file

@@ -0,0 +1,71 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import "go.etcd.io/etcd/v3/lease"
type metricsTxnWrite struct {
TxnWrite
ranges uint
puts uint
deletes uint
putSize int64
}
func newMetricsTxnRead(tr TxnRead) TxnRead {
return &metricsTxnWrite{&txnReadWrite{tr}, 0, 0, 0, 0}
}
func newMetricsTxnWrite(tw TxnWrite) TxnWrite {
return &metricsTxnWrite{tw, 0, 0, 0, 0}
}
func (tw *metricsTxnWrite) Range(key, end []byte, ro RangeOptions) (*RangeResult, error) {
tw.ranges++
return tw.TxnWrite.Range(key, end, ro)
}
func (tw *metricsTxnWrite) DeleteRange(key, end []byte) (n, rev int64) {
tw.deletes++
return tw.TxnWrite.DeleteRange(key, end)
}
func (tw *metricsTxnWrite) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
tw.puts++
size := int64(len(key) + len(value))
tw.putSize += size
return tw.TxnWrite.Put(key, value, lease)
}
func (tw *metricsTxnWrite) End() {
defer tw.TxnWrite.End()
if sum := tw.ranges + tw.puts + tw.deletes; sum > 1 {
txnCounter.Inc()
txnCounterDebug.Inc() // TODO: remove in 3.5 release
}
ranges := float64(tw.ranges)
rangeCounter.Add(ranges)
rangeCounterDebug.Add(ranges) // TODO: remove in 3.5 release
puts := float64(tw.puts)
putCounter.Add(puts)
putCounterDebug.Add(puts) // TODO: remove in 3.5 release
totalPutSizeGauge.Add(float64(tw.putSize))
deletes := float64(tw.deletes)
deleteCounter.Add(deletes)
deleteCounterDebug.Add(deletes) // TODO: remove in 3.5 release
}
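metricsTxnWrite is a plain decorator: it embeds the wrapped transaction and intercepts only the methods it wants to count, delegating the rest. A minimal, self-contained sketch of the same embedding pattern (Doer and countingDoer are hypothetical names):

package main

import "fmt"

type Doer interface{ Do() }

type base struct{}

func (base) Do() {}

type countingDoer struct {
	Doer  // embedded: all other methods pass through untouched
	calls int
}

func (c *countingDoer) Do() {
	c.calls++   // count first...
	c.Doer.Do() // ...then delegate to the wrapped implementation
}

func main() {
	c := &countingDoer{Doer: base{}}
	c.Do()
	c.Do()
	fmt.Println(c.calls) // 2
}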

67
server/mvcc/revision.go Normal file

@@ -0,0 +1,67 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import "encoding/binary"
// revBytesLen is the byte length of a normal revision.
// The first 8 bytes are revision.main in big-endian format. The 9th byte
// is '_'. The last 8 bytes are revision.sub in big-endian format.
const revBytesLen = 8 + 1 + 8
// A revision indicates modification of the key-value space.
// A set of changes that shares the same main revision changes the key-value space atomically.
type revision struct {
// main is the main revision of a set of changes that happen atomically.
main int64
// sub is the sub revision of a change in a set of changes that happen
// atomically. Each change in the set has a distinct, increasing sub
// revision.
sub int64
}
func (a revision) GreaterThan(b revision) bool {
if a.main > b.main {
return true
}
if a.main < b.main {
return false
}
return a.sub > b.sub
}
func newRevBytes() []byte {
return make([]byte, revBytesLen, markedRevBytesLen)
}
func revToBytes(rev revision, bytes []byte) {
binary.BigEndian.PutUint64(bytes, uint64(rev.main))
bytes[8] = '_'
binary.BigEndian.PutUint64(bytes[9:], uint64(rev.sub))
}
func bytesToRev(bytes []byte) revision {
return revision{
main: int64(binary.BigEndian.Uint64(bytes[0:8])),
sub: int64(binary.BigEndian.Uint64(bytes[9:])),
}
}
type revisions []revision
func (a revisions) Len() int { return len(a) }
func (a revisions) Less(i, j int) bool { return a[j].GreaterThan(a[i]) }
func (a revisions) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
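A worked example of the encoding above (a hedged sketch, not part of the diff): because main and sub are both written big-endian, lexicographic comparison of the byte slices matches comparison of (main, sub) pairs, which is exactly what the revisions sort order and the bolt key layout rely on.

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// encode revision{main: 2, sub: 1} the same way revToBytes does
	b := make([]byte, 8+1+8)
	binary.BigEndian.PutUint64(b, 2)     // main, big-endian
	b[8] = '_'                           // separator byte
	binary.BigEndian.PutUint64(b[9:], 1) // sub, big-endian
	fmt.Printf("% x\n", b) // 00 00 00 00 00 00 00 02 5f 00 ... 01
}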

53
server/mvcc/revision_test.go Normal file

@@ -0,0 +1,53 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"math"
"reflect"
"testing"
)
// TestRevision tests that a revision can be encoded to and decoded from
// a byte slice. Moreover, the lexicographical order of the byte slice
// representation follows the (main, sub) order.
func TestRevision(t *testing.T) {
tests := []revision{
// order in (main, sub)
{},
{main: 1, sub: 0},
{main: 1, sub: 1},
{main: 2, sub: 0},
{main: math.MaxInt64, sub: math.MaxInt64},
}
bs := make([][]byte, len(tests))
for i, tt := range tests {
b := newRevBytes()
revToBytes(tt, b)
bs[i] = b
if grev := bytesToRev(b); !reflect.DeepEqual(grev, tt) {
t.Errorf("#%d: revision = %+v, want %+v", i, grev, tt)
}
}
for i := 0; i < len(tests)-1; i++ {
if bytes.Compare(bs[i], bs[i+1]) >= 0 {
t.Errorf("#%d: %v (%+v) should be smaller than %v (%+v)", i, bs[i], tests[i], bs[i+1], tests[i+1])
}
}
}

57
server/mvcc/util.go Normal file

@@ -0,0 +1,57 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"encoding/binary"
"fmt"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/v3/mvcc/backend"
)
func UpdateConsistentIndex(be backend.Backend, index uint64) {
tx := be.BatchTx()
tx.Lock()
defer tx.Unlock()
var oldi uint64
_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
if len(vs) != 0 {
oldi = binary.BigEndian.Uint64(vs[0])
}
if index <= oldi {
return
}
bs := make([]byte, 8)
binary.BigEndian.PutUint64(bs, index)
tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
}
func WriteKV(be backend.Backend, kv mvccpb.KeyValue) {
ibytes := newRevBytes()
revToBytes(revision{main: kv.ModRevision}, ibytes)
d, err := kv.Marshal()
if err != nil {
panic(fmt.Errorf("cannot marshal event: %v", err))
}
be.BatchTx().Lock()
be.BatchTx().UnsafePut(keyBucketName, ibytes, d)
be.BatchTx().Unlock()
}

548
server/mvcc/watchable_store.go Normal file

@@ -0,0 +1,548 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"sync"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
// non-const so modifiable by tests
var (
// chanBufLen is the length of the buffered chan
// for sending out watched events.
// See https://github.com/etcd-io/etcd/issues/11906 for more detail.
chanBufLen = 128
// maxWatchersPerSync is the number of watchers to sync in a single batch
maxWatchersPerSync = 512
)
type watchable interface {
watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse, fcs ...FilterFunc) (*watcher, cancelFunc)
progress(w *watcher)
rev() int64
}
type watchableStore struct {
*store
// mu protects watcher groups and batches. It should never be locked
// before locking store.mu to avoid deadlock.
mu sync.RWMutex
// victims are watcher batches that were blocked on the watch channel
victims []watcherBatch
victimc chan struct{}
// contains all unsynced watchers that need to sync with events that have happened
unsynced watcherGroup
// contains all synced watchers that are in sync with the progress of the store.
// The key of the map is the key that the watcher watches on.
synced watcherGroup
stopc chan struct{}
wg sync.WaitGroup
}
// cancelFunc updates unsynced and synced maps when running
// cancel operations.
type cancelFunc func()
func New(lg *zap.Logger, b backend.Backend, le lease.Lessor, ci cindex.ConsistentIndexer, cfg StoreConfig) ConsistentWatchableKV {
return newWatchableStore(lg, b, le, ci, cfg)
}
func newWatchableStore(lg *zap.Logger, b backend.Backend, le lease.Lessor, ci cindex.ConsistentIndexer, cfg StoreConfig) *watchableStore {
if lg == nil {
lg = zap.NewNop()
}
s := &watchableStore{
store: NewStore(lg, b, le, ci, cfg),
victimc: make(chan struct{}, 1),
unsynced: newWatcherGroup(),
synced: newWatcherGroup(),
stopc: make(chan struct{}),
}
s.store.ReadView = &readView{s}
s.store.WriteView = &writeView{s}
if s.le != nil {
// use this store as the deleter so revokes trigger watch events
s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write(traceutil.TODO()) })
}
s.wg.Add(2)
go s.syncWatchersLoop()
go s.syncVictimsLoop()
return s
}
func (s *watchableStore) Close() error {
close(s.stopc)
s.wg.Wait()
return s.store.Close()
}
func (s *watchableStore) NewWatchStream() WatchStream {
watchStreamGauge.Inc()
return &watchStream{
watchable: s,
ch: make(chan WatchResponse, chanBufLen),
cancels: make(map[WatchID]cancelFunc),
watchers: make(map[WatchID]*watcher),
}
}
func (s *watchableStore) watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse, fcs ...FilterFunc) (*watcher, cancelFunc) {
wa := &watcher{
key: key,
end: end,
minRev: startRev,
id: id,
ch: ch,
fcs: fcs,
}
s.mu.Lock()
s.revMu.RLock()
synced := startRev > s.store.currentRev || startRev == 0
if synced {
wa.minRev = s.store.currentRev + 1
if startRev > wa.minRev {
wa.minRev = startRev
}
s.synced.add(wa)
} else {
slowWatcherGauge.Inc()
s.unsynced.add(wa)
}
s.revMu.RUnlock()
s.mu.Unlock()
watcherGauge.Inc()
return wa, func() { s.cancelWatcher(wa) }
}
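// Editor's note — hypothetical values, not part of the original file: with
// s.store.currentRev == 5, the classification above works out to:
//
//	watch(k, nil, 0, ...) // startRev 0 -> synced,   minRev = 6 (next change)
//	watch(k, nil, 9, ...) // startRev 9 -> synced,   minRev = 9 (future rev)
//	watch(k, nil, 3, ...) // startRev 3 -> unsynced, minRev = 3 (replay history)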
// cancelWatcher removes references of the watcher from the watchableStore
func (s *watchableStore) cancelWatcher(wa *watcher) {
for {
s.mu.Lock()
if s.unsynced.delete(wa) {
slowWatcherGauge.Dec()
watcherGauge.Dec()
break
} else if s.synced.delete(wa) {
watcherGauge.Dec()
break
} else if wa.compacted {
watcherGauge.Dec()
break
} else if wa.ch == nil {
// already canceled (e.g., cancel/close race)
break
}
if !wa.victim {
s.mu.Unlock()
panic("watcher not victim but not in watch groups")
}
var victimBatch watcherBatch
for _, wb := range s.victims {
if wb[wa] != nil {
victimBatch = wb
break
}
}
if victimBatch != nil {
slowWatcherGauge.Dec()
watcherGauge.Dec()
delete(victimBatch, wa)
break
}
// victim being processed so not accessible; retry
s.mu.Unlock()
time.Sleep(time.Millisecond)
}
wa.ch = nil
s.mu.Unlock()
}
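// Editor's note: cancelWatcher retries because moveVictims may temporarily
// own the victim batch (it swaps s.victims to nil while sending); after a
// short sleep the watcher reappears either in s.victims or in a watch group,
// and one of the branches above succeeds.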
func (s *watchableStore) Restore(b backend.Backend) error {
s.mu.Lock()
defer s.mu.Unlock()
err := s.store.Restore(b)
if err != nil {
return err
}
for wa := range s.synced.watchers {
wa.restore = true
s.unsynced.add(wa)
}
s.synced = newWatcherGroup()
return nil
}
// syncWatchersLoop syncs the watchers in the unsynced map every 100ms.
func (s *watchableStore) syncWatchersLoop() {
defer s.wg.Done()
for {
s.mu.RLock()
st := time.Now()
lastUnsyncedWatchers := s.unsynced.size()
s.mu.RUnlock()
unsyncedWatchers := 0
if lastUnsyncedWatchers > 0 {
unsyncedWatchers = s.syncWatchers()
}
syncDuration := time.Since(st)
waitDuration := 100 * time.Millisecond
// more work pending?
if unsyncedWatchers != 0 && lastUnsyncedWatchers > unsyncedWatchers {
// be fair to other store operations by yielding time taken
waitDuration = syncDuration
}
select {
case <-time.After(waitDuration):
case <-s.stopc:
return
}
}
}
// syncVictimsLoop tries to write precomputed watcher responses to
// watchers that had a blocked watcher channel
func (s *watchableStore) syncVictimsLoop() {
defer s.wg.Done()
for {
for s.moveVictims() != 0 {
// try to update all victim watchers
}
s.mu.RLock()
isEmpty := len(s.victims) == 0
s.mu.RUnlock()
var tickc <-chan time.Time
if !isEmpty {
tickc = time.After(10 * time.Millisecond)
}
select {
case <-tickc:
case <-s.victimc:
case <-s.stopc:
return
}
}
}
// moveVictims tries to update watches with already pending event data
func (s *watchableStore) moveVictims() (moved int) {
s.mu.Lock()
victims := s.victims
s.victims = nil
s.mu.Unlock()
var newVictim watcherBatch
for _, wb := range victims {
// try to send responses again
for w, eb := range wb {
// watcher has observed the store up to, but not including, w.minRev
rev := w.minRev - 1
if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}) {
pendingEventsGauge.Add(float64(len(eb.evs)))
} else {
if newVictim == nil {
newVictim = make(watcherBatch)
}
newVictim[w] = eb
continue
}
moved++
}
// assign completed victim watchers to unsynced/synced
s.mu.Lock()
s.store.revMu.RLock()
curRev := s.store.currentRev
for w, eb := range wb {
if newVictim != nil && newVictim[w] != nil {
// couldn't send watch response; stays victim
continue
}
w.victim = false
if eb.moreRev != 0 {
w.minRev = eb.moreRev
}
if w.minRev <= curRev {
s.unsynced.add(w)
} else {
slowWatcherGauge.Dec()
s.synced.add(w)
}
}
s.store.revMu.RUnlock()
s.mu.Unlock()
}
if len(newVictim) > 0 {
s.mu.Lock()
s.victims = append(s.victims, newVictim)
s.mu.Unlock()
}
return moved
}
// syncWatchers syncs unsynced watchers by:
// 1. choose a set of watchers from the unsynced watcher group
// 2. iterate over the set to get the minimum revision and remove compacted watchers
// 3. use minimum revision to get all key-value pairs and send those events to watchers
// 4. remove synced watchers in set from unsynced group and move to synced group
func (s *watchableStore) syncWatchers() int {
s.mu.Lock()
defer s.mu.Unlock()
if s.unsynced.size() == 0 {
return 0
}
s.store.revMu.RLock()
defer s.store.revMu.RUnlock()
// in order to find key-value pairs from unsynced watchers, we need to
// find the minimum revision, which is then used to query the backend
// store for key-value pairs
curRev := s.store.currentRev
compactionRev := s.store.compactMainRev
wg, minRev := s.unsynced.choose(maxWatchersPerSync, curRev, compactionRev)
minBytes, maxBytes := newRevBytes(), newRevBytes()
revToBytes(revision{main: minRev}, minBytes)
revToBytes(revision{main: curRev + 1}, maxBytes)
// UnsafeRange returns keys and values. In boltdb, keys are revisions;
// values are the actual key-value pairs in the backend.
tx := s.store.b.ReadTx()
tx.RLock()
revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
tx.RUnlock()
evs := kvsToEvents(s.store.lg, wg, revs, vs)
var victims watcherBatch
wb := newWatcherBatch(wg, evs)
for w := range wg.watchers {
w.minRev = curRev + 1
eb, ok := wb[w]
if !ok {
// bring un-notified watcher to synced
s.synced.add(w)
s.unsynced.delete(w)
continue
}
if eb.moreRev != 0 {
w.minRev = eb.moreRev
}
if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: curRev}) {
pendingEventsGauge.Add(float64(len(eb.evs)))
} else {
if victims == nil {
victims = make(watcherBatch)
}
w.victim = true
}
if w.victim {
victims[w] = eb
} else {
if eb.moreRev != 0 {
// stay unsynced; more to read
continue
}
s.synced.add(w)
}
s.unsynced.delete(w)
}
s.addVictim(victims)
vsz := 0
for _, v := range s.victims {
vsz += len(v)
}
slowWatcherGauge.Set(float64(s.unsynced.size() + vsz))
return s.unsynced.size()
}
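// Editor's note — a sketch with hypothetical revisions, not part of the
// original file: the key bucket is keyed by encoded revisions, so the range
// scan in syncWatchers replays history. For minRev = 3 and curRev = 5, the
// scanned range is built as:
//
//	minBytes, maxBytes := newRevBytes(), newRevBytes()
//	revToBytes(revision{main: 3}, minBytes)
//	revToBytes(revision{main: 6}, maxBytes) // curRev+1 is the exclusive upper bound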
// kvsToEvents gets all events for the watchers from all key-value pairs
func kvsToEvents(lg *zap.Logger, wg *watcherGroup, revs, vals [][]byte) (evs []mvccpb.Event) {
for i, v := range vals {
var kv mvccpb.KeyValue
if err := kv.Unmarshal(v); err != nil {
lg.Panic("failed to unmarshal mvccpb.KeyValue", zap.Error(err))
}
if !wg.contains(string(kv.Key)) {
continue
}
ty := mvccpb.PUT
if isTombstone(revs[i]) {
ty = mvccpb.DELETE
// patch in mod revision so watchers won't skip
kv.ModRevision = bytesToRev(revs[i]).main
}
evs = append(evs, mvccpb.Event{Kv: &kv, Type: ty})
}
return evs
}
// notify sends the given events, which happened at the given rev, to the
// watchers that watch on the keys of those events.
func (s *watchableStore) notify(rev int64, evs []mvccpb.Event) {
var victim watcherBatch
for w, eb := range newWatcherBatch(&s.synced, evs) {
if eb.revs != 1 {
s.store.lg.Panic(
"unexpected multiple revisions in watch notification",
zap.Int("number-of-revisions", eb.revs),
)
}
if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}) {
pendingEventsGauge.Add(float64(len(eb.evs)))
} else {
// move slow watcher to victims
w.minRev = rev + 1
if victim == nil {
victim = make(watcherBatch)
}
w.victim = true
victim[w] = eb
s.synced.delete(w)
slowWatcherGauge.Inc()
}
}
s.addVictim(victim)
}
func (s *watchableStore) addVictim(victim watcherBatch) {
if victim == nil {
return
}
s.victims = append(s.victims, victim)
select {
case s.victimc <- struct{}{}:
default:
}
}
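// Editor's note: the non-blocking send in addVictim coalesces wakeups; if a
// signal is already pending on victimc, dropping this one is safe because
// syncVictimsLoop drains every victim batch on its next pass.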
func (s *watchableStore) rev() int64 { return s.store.Rev() }
func (s *watchableStore) progress(w *watcher) {
s.mu.RLock()
defer s.mu.RUnlock()
if _, ok := s.synced.watchers[w]; ok {
w.send(WatchResponse{WatchID: w.id, Revision: s.rev()})
// If the ch is full, this watcher is receiving events.
// We do not need to send progress at all.
}
}
type watcher struct {
// the watcher key
key []byte
// end indicates the end of the range to watch.
// If end is set, the watcher is on a range.
end []byte
// victim is set when ch is blocked and undergoing victim processing
victim bool
// compacted is set when the watcher is removed because of compaction
compacted bool
// restore is true when the watcher is being restored from a leader snapshot,
// which means that this watcher has just been moved from the "synced" to the
// "unsynced" watcher group, possibly with a future revision from when it was
// first added to the synced watcher group.
// An "unsynced" watcher's revision must always be <= the current revision,
// except when the watcher was moved from the "synced" watcher group.
restore bool
// minRev is the minimum revision update the watcher will accept
minRev int64
id WatchID
fcs []FilterFunc
// a chan to send out the watch response.
// The chan might be shared with other watchers.
ch chan<- WatchResponse
}
func (w *watcher) send(wr WatchResponse) bool {
progressEvent := len(wr.Events) == 0
if len(w.fcs) != 0 {
ne := make([]mvccpb.Event, 0, len(wr.Events))
for i := range wr.Events {
filtered := false
for _, filter := range w.fcs {
if filter(wr.Events[i]) {
filtered = true
break
}
}
if !filtered {
ne = append(ne, wr.Events[i])
}
}
wr.Events = ne
}
// if all events are filtered out, we should send nothing.
if !progressEvent && len(wr.Events) == 0 {
return true
}
select {
case w.ch <- wr:
return true
default:
return false
}
}
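// Editor's note — an illustrative sketch, not part of the original file:
// FilterFuncs drop individual events before the send. For example, a
// hypothetical filter that suppresses PUT events (as in
// TestWatcherWatchWithFilter in the tests below):
//
//	noPuts := func(e mvccpb.Event) bool { return e.Type == mvccpb.PUT }
//	w.Watch(AutoWatchID, []byte("foo"), nil, 0, noPuts)
//
// If every event is filtered out of a non-progress response, send reports
// success without writing to the channel, so the watcher is not made a victim.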

218
server/mvcc/watchable_store_bench_test.go Normal file
View File

@@ -0,0 +1,218 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"math/rand"
"os"
"testing"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func BenchmarkWatchableStorePut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := New(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
s.Put(keys[i], vals[i], lease.NoLease)
}
}
// BenchmarkWatchableStoreTxnPut benchmarks the Put operation
// with transaction begin and end, where transaction involves
// some synchronization operations, such as mutex locking.
func BenchmarkWatchableStoreTxnPut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := New(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
txn := s.Write(traceutil.TODO())
txn.Put(keys[i], vals[i], lease.NoLease)
txn.End()
}
}
// BenchmarkWatchableStoreWatchPutSync benchmarks the case of
// many synced watchers receiving a Put notification.
func BenchmarkWatchableStoreWatchPutSync(b *testing.B) {
benchmarkWatchableStoreWatchPut(b, true)
}
// BenchmarkWatchableStoreWatchPutUnsync benchmarks the case of
// many unsynced watchers receiving a Put notification.
func BenchmarkWatchableStoreWatchPutUnsync(b *testing.B) {
benchmarkWatchableStoreWatchPut(b, false)
}
func benchmarkWatchableStoreWatchPut(b *testing.B, synced bool) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, be, tmpPath)
k := []byte("testkey")
v := []byte("testval")
rev := int64(0)
if !synced {
// non-0 value to keep watchers in unsynced
rev = 1
}
w := s.NewWatchStream()
defer w.Close()
watchIDs := make([]WatchID, b.N)
for i := range watchIDs {
watchIDs[i], _ = w.Watch(0, k, nil, rev)
}
b.ResetTimer()
b.ReportAllocs()
// trigger watchers
s.Put(k, v, lease.NoLease)
for range watchIDs {
<-w.Chan()
}
select {
case wc := <-w.Chan():
b.Fatalf("unexpected data %v", wc)
default:
}
}
// Benchmarks the cancel function performance for unsynced watchers
// in a WatchableStore. It creates k*N watchers to populate unsynced
// with a reasonably large number of watchers, then measures the time it
// takes to cancel N watchers out of k*N watchers. The performance is
// expected to differ depending on the unsynced member implementation.
// TODO: k is an arbitrary constant. We need to figure out what factor
// we should put to simulate the real-world use cases.
func BenchmarkWatchableStoreUnsyncedCancel(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
// manually create watchableStore instead of newWatchableStore
// because newWatchableStore periodically calls syncWatchersLoop
// method to sync watchers in unsynced map. We want to keep watchers
// in unsynced for this benchmark.
ws := &watchableStore{
store: s,
unsynced: newWatcherGroup(),
// to make the test not crash from assigning to nil map.
// 'synced' doesn't get populated in this test.
synced: newWatcherGroup(),
}
defer func() {
ws.store.Close()
os.Remove(tmpPath)
}()
// Put a key so that we can spawn watchers on that key
// (testKey in this test). This increases the rev to 1,
// and later we can set the watcher's startRev to 1,
// and force watchers to be in unsynced.
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := ws.NewWatchStream()
const k int = 2
benchSampleN := b.N
watcherN := k * benchSampleN
watchIDs := make([]WatchID, watcherN)
for i := 0; i < watcherN; i++ {
// non-0 value to keep watchers in unsynced
watchIDs[i], _ = w.Watch(0, testKey, nil, 1)
}
// random-cancel N watchers to make it not biased towards
// data structures with an order, such as slice.
ix := rand.Perm(watcherN)
b.ResetTimer()
b.ReportAllocs()
// cancel N watchers
for _, idx := range ix[:benchSampleN] {
if err := w.Cancel(watchIDs[idx]); err != nil {
b.Error(err)
}
}
}
func BenchmarkWatchableStoreSyncedCancel(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
// Put a key so that we can spawn watchers on that key
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
// put 1 million watchers on the same key
const watcherN = 1000000
watchIDs := make([]WatchID, watcherN)
for i := 0; i < watcherN; i++ {
// 0 for startRev to keep watchers in synced
watchIDs[i], _ = w.Watch(0, testKey, nil, 0)
}
// randomly cancel watchers to make it not biased towards
// data structures with an order, such as slice.
ix := rand.Perm(watcherN)
b.ResetTimer()
b.ReportAllocs()
for _, idx := range ix {
if err := w.Cancel(watchIDs[idx]); err != nil {
b.Error(err)
}
}
}

655
server/mvcc/watchable_store_test.go Normal file
View File

@@ -0,0 +1,655 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"fmt"
"os"
"reflect"
"sync"
"testing"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func TestWatch(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
w.Watch(0, testKey, nil, 0)
if !s.synced.contains(string(testKey)) {
// the key must have had an entry in synced
t.Errorf("existence = false, want true")
}
}
func TestNewWatcherCancel(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
wt, _ := w.Watch(0, testKey, nil, 0)
if err := w.Cancel(wt); err != nil {
t.Error(err)
}
if s.synced.contains(string(testKey)) {
// the key should have been deleted
t.Errorf("existence = true, want false")
}
}
// TestCancelUnsynced tests if running CancelFunc removes watchers from unsynced.
func TestCancelUnsynced(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
// manually create watchableStore instead of newWatchableStore
// because newWatchableStore automatically calls syncWatchers
// method to sync watchers in unsynced map. We want to keep watchers
// in unsynced to test if cancel works as expected.
s := &watchableStore{
store: NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}),
unsynced: newWatcherGroup(),
// to make the test not crash from assigning to nil map.
// 'synced' doesn't get populated in this test.
synced: newWatcherGroup(),
}
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
// Put a key so that we can spawn watchers on that key.
// (testKey in this test). This increases the rev to 1,
// and later we can set the watcher's startRev to 1,
// and force watchers to be in unsynced.
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
// arbitrary number for watchers
watcherN := 100
// create watcherN watch ids to cancel
watchIDs := make([]WatchID, watcherN)
for i := 0; i < watcherN; i++ {
// use 1 to keep watchers in unsynced
watchIDs[i], _ = w.Watch(0, testKey, nil, 1)
}
for _, idx := range watchIDs {
if err := w.Cancel(idx); err != nil {
t.Error(err)
}
}
// After running CancelFunc
//
// unsynced should be empty
// because cancel removes watcher from unsynced
if size := s.unsynced.size(); size != 0 {
t.Errorf("unsynced size = %d, want 0", size)
}
}
// TestSyncWatchers populates unsynced watcher map and tests syncWatchers
// method to see if it correctly sends events to channel of unsynced watchers
// and moves these watchers to synced.
func TestSyncWatchers(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := &watchableStore{
store: NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}),
unsynced: newWatcherGroup(),
synced: newWatcherGroup(),
}
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
// arbitrary number for watchers
watcherN := 100
for i := 0; i < watcherN; i++ {
// specify rev as 1 to keep watchers in unsynced
w.Watch(0, testKey, nil, 1)
}
// Before running s.syncWatchers() synced should be empty because we manually
// populate unsynced only
sws := s.synced.watcherSetByKey(string(testKey))
uws := s.unsynced.watcherSetByKey(string(testKey))
if len(sws) != 0 {
t.Fatalf("synced[string(testKey)] size = %d, want 0", len(sws))
}
// unsynced should not be empty because we manually populated unsynced only
if len(uws) != watcherN {
t.Errorf("unsynced size = %d, want %d", len(uws), watcherN)
}
// this should move all unsynced watchers to synced ones
s.syncWatchers()
sws = s.synced.watcherSetByKey(string(testKey))
uws = s.unsynced.watcherSetByKey(string(testKey))
// After running s.syncWatchers(), synced should not be empty because syncWatchers
// populates synced in this test case
if len(sws) != watcherN {
t.Errorf("synced[string(testKey)] size = %d, want %d", len(sws), watcherN)
}
// unsynced should be empty because syncWatchers is expected to move all watchers
// from unsynced to synced in this test case
if len(uws) != 0 {
t.Errorf("unsynced size = %d, want 0", len(uws))
}
for w := range sws {
if w.minRev != s.Rev()+1 {
t.Errorf("w.minRev = %d, want %d", w.minRev, s.Rev()+1)
}
}
if len(w.(*watchStream).ch) != watcherN {
t.Errorf("watched event size = %d, want %d", len(w.(*watchStream).ch), watcherN)
}
evs := (<-w.(*watchStream).ch).Events
if len(evs) != 1 {
t.Errorf("len(evs) got = %d, want = 1", len(evs))
}
if evs[0].Type != mvccpb.PUT {
t.Errorf("got = %v, want = %v", evs[0].Type, mvccpb.PUT)
}
if !bytes.Equal(evs[0].Kv.Key, testKey) {
t.Errorf("got = %s, want = %s", evs[0].Kv.Key, testKey)
}
if !bytes.Equal(evs[0].Kv.Value, testValue) {
t.Errorf("got = %s, want = %s", evs[0].Kv.Value, testValue)
}
}
// TestWatchCompacted tests a watcher that watches on a compacted revision.
func TestWatchCompacted(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
maxRev := 10
compactRev := int64(5)
for i := 0; i < maxRev; i++ {
s.Put(testKey, testValue, lease.NoLease)
}
_, err := s.Compact(traceutil.TODO(), compactRev)
if err != nil {
t.Fatalf("failed to compact kv (%v)", err)
}
w := s.NewWatchStream()
wt, _ := w.Watch(0, testKey, nil, compactRev-1)
select {
case resp := <-w.Chan():
if resp.WatchID != wt {
t.Errorf("resp.WatchID = %x, want %x", resp.WatchID, wt)
}
if resp.CompactRevision == 0 {
t.Errorf("resp.Compacted = %v, want %v", resp.CompactRevision, compactRev)
}
case <-time.After(1 * time.Second):
t.Fatalf("failed to receive response (timeout)")
}
}
func TestWatchFutureRev(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
w := s.NewWatchStream()
wrev := int64(10)
w.Watch(0, testKey, nil, wrev)
for i := 0; i < 10; i++ {
rev := s.Put(testKey, testValue, lease.NoLease)
if rev >= wrev {
break
}
}
select {
case resp := <-w.Chan():
if resp.Revision != wrev {
t.Fatalf("rev = %d, want %d", resp.Revision, wrev)
}
if len(resp.Events) != 1 {
t.Fatalf("failed to get events from the response")
}
if resp.Events[0].Kv.ModRevision != wrev {
t.Fatalf("kv.rev = %d, want %d", resp.Events[0].Kv.ModRevision, wrev)
}
case <-time.After(time.Second):
t.Fatal("failed to receive event in 1 second.")
}
}
func TestWatchRestore(t *testing.T) {
test := func(delay time.Duration) func(t *testing.T) {
return func(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, cindex.NewConsistentIndex(b.BatchTx()), StoreConfig{})
defer cleanup(s, b, tmpPath)
testKey := []byte("foo")
testValue := []byte("bar")
rev := s.Put(testKey, testValue, lease.NoLease)
newBackend, newPath := backend.NewDefaultTmpBackend()
newStore := newWatchableStore(zap.NewExample(), newBackend, &lease.FakeLessor{}, cindex.NewConsistentIndex(newBackend.BatchTx()), StoreConfig{})
defer cleanup(newStore, newBackend, newPath)
w := newStore.NewWatchStream()
w.Watch(0, testKey, nil, rev-1)
time.Sleep(delay)
newStore.Restore(b)
select {
case resp := <-w.Chan():
if resp.Revision != rev {
t.Fatalf("rev = %d, want %d", resp.Revision, rev)
}
if len(resp.Events) != 1 {
t.Fatalf("failed to get events from the response")
}
if resp.Events[0].Kv.ModRevision != rev {
t.Fatalf("kv.rev = %d, want %d", resp.Events[0].Kv.ModRevision, rev)
}
case <-time.After(time.Second):
t.Fatal("failed to receive event in 1 second.")
}
}
}
t.Run("Normal", test(0))
t.Run("RunSyncWatchLoopBeforeRestore", test(time.Millisecond*120)) // longer than default waitDuration
}
// TestWatchRestoreSyncedWatcher tests such a case that:
// 1. watcher is created with a future revision "math.MaxInt64 - 2"
// 2. watcher with a future revision is added to "synced" watcher group
// 3. restore/overwrite storage with snapshot of a higher last revision
// 4. restore operation moves "synced" to "unsynced" watcher group
// 5. choose the watcher from step 1, without panic
func TestWatchRestoreSyncedWatcher(t *testing.T) {
b1, b1Path := backend.NewDefaultTmpBackend()
s1 := newWatchableStore(zap.NewExample(), b1, &lease.FakeLessor{}, cindex.NewConsistentIndex(b1.BatchTx()), StoreConfig{})
defer cleanup(s1, b1, b1Path)
b2, b2Path := backend.NewDefaultTmpBackend()
s2 := newWatchableStore(zap.NewExample(), b2, &lease.FakeLessor{}, cindex.NewConsistentIndex(b2.BatchTx()), StoreConfig{})
defer cleanup(s2, b2, b2Path)
testKey, testValue := []byte("foo"), []byte("bar")
rev := s1.Put(testKey, testValue, lease.NoLease)
startRev := rev + 2
// create a watcher with a future revision
// add to "synced" watcher group (startRev > s.store.currentRev)
w1 := s1.NewWatchStream()
w1.Watch(0, testKey, nil, startRev)
// make "s2" ends up with a higher last revision
s2.Put(testKey, testValue, lease.NoLease)
s2.Put(testKey, testValue, lease.NoLease)
// overwrite storage with higher revisions
if err := s1.Restore(b2); err != nil {
t.Fatal(err)
}
// wait for next "syncWatchersLoop" iteration
// and the unsynced watcher should be chosen
time.Sleep(2 * time.Second)
// trigger events for "startRev"
s1.Put(testKey, testValue, lease.NoLease)
select {
case resp := <-w1.Chan():
if resp.Revision != startRev {
t.Fatalf("resp.Revision expect %d, got %d", startRev, resp.Revision)
}
if len(resp.Events) != 1 {
t.Fatalf("len(resp.Events) expect 1, got %d", len(resp.Events))
}
if resp.Events[0].Kv.ModRevision != startRev {
t.Fatalf("resp.Events[0].Kv.ModRevision expect %d, got %d", startRev, resp.Events[0].Kv.ModRevision)
}
case <-time.After(time.Second):
t.Fatal("failed to receive event in 1 second")
}
}
// TestWatchBatchUnsynced tests batching on unsynced watchers
func TestWatchBatchUnsynced(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
oldMaxRevs := watchBatchMaxRevs
defer func() {
watchBatchMaxRevs = oldMaxRevs
s.store.Close()
os.Remove(tmpPath)
}()
batches := 3
watchBatchMaxRevs = 4
v := []byte("foo")
for i := 0; i < watchBatchMaxRevs*batches; i++ {
s.Put(v, v, lease.NoLease)
}
w := s.NewWatchStream()
w.Watch(0, v, nil, 1)
for i := 0; i < batches; i++ {
if resp := <-w.Chan(); len(resp.Events) != watchBatchMaxRevs {
t.Fatalf("len(events) = %d, want %d", len(resp.Events), watchBatchMaxRevs)
}
}
s.store.revMu.Lock()
defer s.store.revMu.Unlock()
if size := s.synced.size(); size != 1 {
t.Errorf("synced size = %d, want 1", size)
}
}
func TestNewMapwatcherToEventMap(t *testing.T) {
k0, k1, k2 := []byte("foo0"), []byte("foo1"), []byte("foo2")
v0, v1, v2 := []byte("bar0"), []byte("bar1"), []byte("bar2")
ws := []*watcher{{key: k0}, {key: k1}, {key: k2}}
evs := []mvccpb.Event{
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{Key: k0, Value: v0},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{Key: k1, Value: v1},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{Key: k2, Value: v2},
},
}
tests := []struct {
sync []*watcher
evs []mvccpb.Event
wwe map[*watcher][]mvccpb.Event
}{
// no watcher in sync, some events should return empty wwe
{
nil,
evs,
map[*watcher][]mvccpb.Event{},
},
// one watcher in sync, one event that does not match the key of that
// watcher should return empty wwe
{
[]*watcher{ws[2]},
evs[:1],
map[*watcher][]mvccpb.Event{},
},
// one watcher in sync, one event that matches the key of that
// watcher should return wwe with that matching watcher
{
[]*watcher{ws[1]},
evs[1:2],
map[*watcher][]mvccpb.Event{
ws[1]: evs[1:2],
},
},
// two watchers in sync that watches two different keys, one event
// that matches the key of only one of the watcher should return wwe
// with the matching watcher
{
[]*watcher{ws[0], ws[2]},
evs[2:],
map[*watcher][]mvccpb.Event{
ws[2]: evs[2:],
},
},
// two watchers in sync that watches the same key, two events that
// match the keys should return wwe with those two watchers
{
[]*watcher{ws[0], ws[1]},
evs[:2],
map[*watcher][]mvccpb.Event{
ws[0]: evs[:1],
ws[1]: evs[1:2],
},
},
}
for i, tt := range tests {
wg := newWatcherGroup()
for _, w := range tt.sync {
wg.add(w)
}
gwe := newWatcherBatch(&wg, tt.evs)
if len(gwe) != len(tt.wwe) {
t.Errorf("#%d: len(gwe) got = %d, want = %d", i, len(gwe), len(tt.wwe))
}
// compare gwe and tt.wwe
for w, eb := range gwe {
if len(eb.evs) != len(tt.wwe[w]) {
t.Errorf("#%d: len(eb.evs) got = %d, want = %d", i, len(eb.evs), len(tt.wwe[w]))
}
if !reflect.DeepEqual(eb.evs, tt.wwe[w]) {
t.Errorf("#%d: reflect.DeepEqual events got = %v, want = true", i, false)
}
}
}
}
// TestWatchVictims tests that watchable store delivers watch events
// when the watch channel is temporarily clogged with too many events.
func TestWatchVictims(t *testing.T) {
oldChanBufLen, oldMaxWatchersPerSync := chanBufLen, maxWatchersPerSync
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
chanBufLen, maxWatchersPerSync = oldChanBufLen, oldMaxWatchersPerSync
}()
chanBufLen, maxWatchersPerSync = 1, 2
numPuts := chanBufLen * 64
testKey, testValue := []byte("foo"), []byte("bar")
var wg sync.WaitGroup
numWatches := maxWatchersPerSync * 128
errc := make(chan error, numWatches)
wg.Add(numWatches)
for i := 0; i < numWatches; i++ {
go func() {
w := s.NewWatchStream()
w.Watch(0, testKey, nil, 1)
defer func() {
w.Close()
wg.Done()
}()
tc := time.After(10 * time.Second)
evs, nextRev := 0, int64(2)
for evs < numPuts {
select {
case <-tc:
errc <- fmt.Errorf("time out")
return
case wr := <-w.Chan():
evs += len(wr.Events)
for _, ev := range wr.Events {
if ev.Kv.ModRevision != nextRev {
errc <- fmt.Errorf("expected rev=%d, got %d", nextRev, ev.Kv.ModRevision)
return
}
nextRev++
}
time.Sleep(time.Millisecond)
}
}
if evs != numPuts {
errc <- fmt.Errorf("expected %d events, got %d", numPuts, evs)
return
}
select {
case <-w.Chan():
errc <- fmt.Errorf("unexpected response")
default:
}
}()
time.Sleep(time.Millisecond)
}
var wgPut sync.WaitGroup
wgPut.Add(numPuts)
for i := 0; i < numPuts; i++ {
go func() {
defer wgPut.Done()
s.Put(testKey, testValue, lease.NoLease)
}()
}
wgPut.Wait()
wg.Wait()
select {
case err := <-errc:
t.Fatal(err)
default:
}
}
// TestStressWatchCancelClose tests closing a watch stream while
// canceling its watches.
func TestStressWatchCancelClose(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey, testValue := []byte("foo"), []byte("bar")
var wg sync.WaitGroup
readyc := make(chan struct{})
wg.Add(100)
for i := 0; i < 100; i++ {
go func() {
defer wg.Done()
w := s.NewWatchStream()
ids := make([]WatchID, 10)
for i := range ids {
ids[i], _ = w.Watch(0, testKey, nil, 0)
}
<-readyc
wg.Add(1 + len(ids)/2)
for i := range ids[:len(ids)/2] {
go func(n int) {
defer wg.Done()
w.Cancel(ids[n])
}(i)
}
go func() {
defer wg.Done()
w.Close()
}()
}()
}
close(readyc)
for i := 0; i < 100; i++ {
s.Put(testKey, testValue, lease.NoLease)
}
wg.Wait()
}

56
server/mvcc/watchable_store_txn.go Normal file
View File

@@ -0,0 +1,56 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
)
func (tw *watchableStoreTxnWrite) End() {
changes := tw.Changes()
if len(changes) == 0 {
tw.TxnWrite.End()
return
}
rev := tw.Rev() + 1
evs := make([]mvccpb.Event, len(changes))
for i, change := range changes {
evs[i].Kv = &changes[i]
if change.CreateRevision == 0 {
evs[i].Type = mvccpb.DELETE
evs[i].Kv.ModRevision = rev
} else {
evs[i].Type = mvccpb.PUT
}
}
// end write txn under watchable store lock so the updates are visible
// when asynchronous event posting checks the current store revision
tw.s.mu.Lock()
tw.s.notify(rev, evs)
tw.TxnWrite.End()
tw.s.mu.Unlock()
}
type watchableStoreTxnWrite struct {
TxnWrite
s *watchableStore
}
func (s *watchableStore) Write(trace *traceutil.Trace) TxnWrite {
return &watchableStoreTxnWrite{s.store.Write(trace), s}
}
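// Editor's note — a usage sketch, not part of the original file: a write
// transaction on the watchable store posts watch events when it ends.
//
//	tw := s.Write(traceutil.TODO()) // s is a *watchableStore
//	tw.Put([]byte("foo"), []byte("bar"), lease.NoLease)
//	tw.End() // builds events from Changes() and calls s.notify under s.mu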

193
server/mvcc/watcher.go Normal file
View File

@@ -0,0 +1,193 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"errors"
"sync"
"go.etcd.io/etcd/api/v3/mvccpb"
)
// AutoWatchID is the watcher ID passed in WatchStream.Watch when no
// user-provided ID is available. If passed, an ID will automatically be assigned.
const AutoWatchID WatchID = 0
var (
ErrWatcherNotExist = errors.New("mvcc: watcher does not exist")
ErrEmptyWatcherRange = errors.New("mvcc: watcher range is empty")
ErrWatcherDuplicateID = errors.New("mvcc: duplicate watch ID provided on the WatchStream")
)
type WatchID int64
// FilterFunc returns true if the given event should be filtered out.
type FilterFunc func(e mvccpb.Event) bool
type WatchStream interface {
// Watch creates a watcher. The watcher watches events that happen or
// have happened on the given key or range [key, end) from the given startRev.
//
// The whole event history can be watched unless compacted.
// If "startRev" <=0, watch observes events after currentRev.
//
// The returned "id" is the ID of this watcher. It appears as WatchID
// in events that are sent to the created watcher through stream channel.
// The watch ID is used when it's not equal to AutoWatchID. Otherwise,
// an auto-generated watch ID is returned.
Watch(id WatchID, key, end []byte, startRev int64, fcs ...FilterFunc) (WatchID, error)
// Chan returns a chan. All watch responses will be sent to the returned chan.
Chan() <-chan WatchResponse
// RequestProgress requests the progress of the watcher with given ID. The response
// will only be sent if the watcher is currently synced.
// The response will be sent through the WatchResponse Chan attached
// to this stream to ensure correct ordering.
// The response contains no events. The revision in the response is the progress
// of the watcher, since the watcher is currently synced.
RequestProgress(id WatchID)
// Cancel cancels a watcher by giving its ID. If the watcher does not exist,
// an error will be returned.
Cancel(id WatchID) error
// Close closes Chan and releases all related resources.
Close()
// Rev returns the current revision of the KV the stream watches on.
Rev() int64
}
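// Editor's note — a usage sketch assuming an existing WatchableKV "s"; not
// part of the original file:
//
//	w := s.NewWatchStream()
//	defer w.Close()
//	id, err := w.Watch(AutoWatchID, []byte("foo"), nil, 0) // from current rev
//	if err != nil {
//		// handle ErrEmptyWatcherRange / ErrWatcherDuplicateID
//	}
//	for resp := range w.Chan() {
//		_ = resp.Events // events tagged with WatchID == id
//	}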
type WatchResponse struct {
// WatchID is the WatchID of the watcher this response sent to.
WatchID WatchID
// Events contains all the events that need to be sent.
Events []mvccpb.Event
// Revision is the revision of the KV when the watchResponse is created.
// For a normal response, the revision should be the same as the last
// modified revision inside Events. For a delayed response to an unsynced
// watcher, the revision is greater than the last modified revision
// inside Events.
Revision int64
// CompactRevision is set when the watcher is cancelled due to compaction.
CompactRevision int64
}
// watchStream contains a collection of watchers that share
// one streaming chan to send out watched events and other control events.
type watchStream struct {
watchable watchable
ch chan WatchResponse
mu sync.Mutex // guards fields below it
// nextID is the ID pre-allocated for next new watcher in this stream
nextID WatchID
closed bool
cancels map[WatchID]cancelFunc
watchers map[WatchID]*watcher
}
// Watch creates a new watcher in the stream and returns its WatchID.
func (ws *watchStream) Watch(id WatchID, key, end []byte, startRev int64, fcs ...FilterFunc) (WatchID, error) {
// prevent wrong range where key >= end lexicographically
// watch request with 'WithFromKey' has empty-byte range end
if len(end) != 0 && bytes.Compare(key, end) != -1 {
return -1, ErrEmptyWatcherRange
}
ws.mu.Lock()
defer ws.mu.Unlock()
if ws.closed {
return -1, ErrEmptyWatcherRange
}
if id == AutoWatchID {
for ws.watchers[ws.nextID] != nil {
ws.nextID++
}
id = ws.nextID
ws.nextID++
} else if _, ok := ws.watchers[id]; ok {
return -1, ErrWatcherDuplicateID
}
w, c := ws.watchable.watch(key, end, startRev, id, ws.ch, fcs...)
ws.cancels[id] = c
ws.watchers[id] = w
return id, nil
}
func (ws *watchStream) Chan() <-chan WatchResponse {
return ws.ch
}
func (ws *watchStream) Cancel(id WatchID) error {
ws.mu.Lock()
cancel, ok := ws.cancels[id]
w := ws.watchers[id]
ok = ok && !ws.closed
ws.mu.Unlock()
if !ok {
return ErrWatcherNotExist
}
cancel()
ws.mu.Lock()
// The watch isn't removed until cancel so that if Close() is called,
// it will wait for the cancel. Otherwise, Close() could close the
// watch channel while the store is still posting events.
if ww := ws.watchers[id]; ww == w {
delete(ws.cancels, id)
delete(ws.watchers, id)
}
ws.mu.Unlock()
return nil
}
func (ws *watchStream) Close() {
ws.mu.Lock()
defer ws.mu.Unlock()
for _, cancel := range ws.cancels {
cancel()
}
ws.closed = true
close(ws.ch)
watchStreamGauge.Dec()
}
func (ws *watchStream) Rev() int64 {
ws.mu.Lock()
defer ws.mu.Unlock()
return ws.watchable.rev()
}
func (ws *watchStream) RequestProgress(id WatchID) {
ws.mu.Lock()
w, ok := ws.watchers[id]
ws.mu.Unlock()
if !ok {
return
}
ws.watchable.progress(w)
}

40
server/mvcc/watcher_bench_test.go Normal file
View File

@@ -0,0 +1,40 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"fmt"
"testing"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func BenchmarkKVWatcherMemoryUsage(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
watchable := newWatchableStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(watchable, be, tmpPath)
w := watchable.NewWatchStream()
b.ReportAllocs()
b.StartTimer()
for i := 0; i < b.N; i++ {
w.Watch(0, []byte(fmt.Sprint("foo", i)), nil, 0)
}
}

293
server/mvcc/watcher_group.go Normal file
View File

@@ -0,0 +1,293 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"fmt"
"math"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/adt"
)
var (
// watchBatchMaxRevs is the maximum distinct revisions that
// may be sent to an unsynced watcher at a time. Declared as
// var instead of const for testing purposes.
watchBatchMaxRevs = 1000
)
type eventBatch struct {
// evs is a batch of revision-ordered events
evs []mvccpb.Event
// revs is the number of unique revisions observed in this batch
revs int
// moreRev is first revision with more events following this batch
moreRev int64
}
func (eb *eventBatch) add(ev mvccpb.Event) {
if eb.revs > watchBatchMaxRevs {
// maxed out batch size
return
}
if len(eb.evs) == 0 {
// base case
eb.revs = 1
eb.evs = append(eb.evs, ev)
return
}
// revision accounting
ebRev := eb.evs[len(eb.evs)-1].Kv.ModRevision
evRev := ev.Kv.ModRevision
if evRev > ebRev {
eb.revs++
if eb.revs > watchBatchMaxRevs {
eb.moreRev = evRev
return
}
}
eb.evs = append(eb.evs, ev)
}
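// Editor's worked example with hypothetical revisions, not part of the
// original file: with watchBatchMaxRevs = 2, adding events at ModRevisions
// 1, 1, 2, 3 leaves the batch as
//
//	eb.evs     // the three events at revs 1, 1, 2 (two unique revisions)
//	eb.revs    == 3 // incremented past the limit by the rev-3 event
//	eb.moreRev == 3 // the next batch for this watcher resumes at rev 3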
type watcherBatch map[*watcher]*eventBatch
func (wb watcherBatch) add(w *watcher, ev mvccpb.Event) {
eb := wb[w]
if eb == nil {
eb = &eventBatch{}
wb[w] = eb
}
eb.add(ev)
}
// newWatcherBatch maps watchers to their matched events. It enables quick
// event lookup by watcher.
func newWatcherBatch(wg *watcherGroup, evs []mvccpb.Event) watcherBatch {
if len(wg.watchers) == 0 {
return nil
}
wb := make(watcherBatch)
for _, ev := range evs {
for w := range wg.watcherSetByKey(string(ev.Kv.Key)) {
if ev.Kv.ModRevision >= w.minRev {
// don't double notify
wb.add(w, ev)
}
}
}
return wb
}
type watcherSet map[*watcher]struct{}
func (w watcherSet) add(wa *watcher) {
if _, ok := w[wa]; ok {
panic("add watcher twice!")
}
w[wa] = struct{}{}
}
func (w watcherSet) union(ws watcherSet) {
for wa := range ws {
w.add(wa)
}
}
func (w watcherSet) delete(wa *watcher) {
if _, ok := w[wa]; !ok {
panic("removing missing watcher!")
}
delete(w, wa)
}
type watcherSetByKey map[string]watcherSet
func (w watcherSetByKey) add(wa *watcher) {
set := w[string(wa.key)]
if set == nil {
set = make(watcherSet)
w[string(wa.key)] = set
}
set.add(wa)
}
func (w watcherSetByKey) delete(wa *watcher) bool {
k := string(wa.key)
if v, ok := w[k]; ok {
if _, ok := v[wa]; ok {
delete(v, wa)
if len(v) == 0 {
// remove the set; nothing left
delete(w, k)
}
return true
}
}
return false
}
// watcherGroup is a collection of watchers organized by their ranges
type watcherGroup struct {
// keyWatchers has the watchers that watch on a single key
keyWatchers watcherSetByKey
// ranges has the watchers that watch a range; it is sorted by interval
ranges adt.IntervalTree
// watchers is the set of all watchers
watchers watcherSet
}
func newWatcherGroup() watcherGroup {
return watcherGroup{
keyWatchers: make(watcherSetByKey),
ranges: adt.NewIntervalTree(),
watchers: make(watcherSet),
}
}
// add puts a watcher in the group.
func (wg *watcherGroup) add(wa *watcher) {
wg.watchers.add(wa)
if wa.end == nil {
wg.keyWatchers.add(wa)
return
}
// interval already registered?
ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
if iv := wg.ranges.Find(ivl); iv != nil {
iv.Val.(watcherSet).add(wa)
return
}
// not registered, put in interval tree
ws := make(watcherSet)
ws.add(wa)
wg.ranges.Insert(ivl, ws)
}
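// Editor's note — an illustrative sketch, not part of the original file: a
// hypothetical range watcher on ["foo", "fop") is registered under an affine
// interval, and watchers sharing the same interval share one watcherSet:
//
//	ivl := adt.NewStringAffineInterval("foo", "fop")
//	// wg.ranges.Find(ivl) then yields the shared watcherSet for that range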
// contains is whether the given key has a watcher in the group.
func (wg *watcherGroup) contains(key string) bool {
_, ok := wg.keyWatchers[key]
return ok || wg.ranges.Intersects(adt.NewStringAffinePoint(key))
}
// size gives the number of unique watchers in the group.
func (wg *watcherGroup) size() int { return len(wg.watchers) }
// delete removes a watcher from the group.
func (wg *watcherGroup) delete(wa *watcher) bool {
if _, ok := wg.watchers[wa]; !ok {
return false
}
wg.watchers.delete(wa)
if wa.end == nil {
wg.keyWatchers.delete(wa)
return true
}
ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
iv := wg.ranges.Find(ivl)
if iv == nil {
return false
}
ws := iv.Val.(watcherSet)
delete(ws, wa)
if len(ws) == 0 {
// remove the interval; no watchers remain
if ok := wg.ranges.Delete(ivl); !ok {
panic("could not remove watcher from interval tree")
}
}
return true
}
// choose selects watchers from the watcher group to update
func (wg *watcherGroup) choose(maxWatchers int, curRev, compactRev int64) (*watcherGroup, int64) {
if len(wg.watchers) < maxWatchers {
return wg, wg.chooseAll(curRev, compactRev)
}
ret := newWatcherGroup()
for w := range wg.watchers {
if maxWatchers <= 0 {
break
}
maxWatchers--
ret.add(w)
}
return &ret, ret.chooseAll(curRev, compactRev)
}
func (wg *watcherGroup) chooseAll(curRev, compactRev int64) int64 {
minRev := int64(math.MaxInt64)
for w := range wg.watchers {
if w.minRev > curRev {
// after network partition, possibly choosing future revision watcher from restore operation
// with watch key "proxy-namespace__lostleader" and revision "math.MaxInt64 - 2"
// do not panic when such watcher had been moved from "synced" watcher during restore operation
if !w.restore {
panic(fmt.Errorf("watcher minimum revision %d should not exceed current revision %d", w.minRev, curRev))
}
// mark 'restore' done, since it's chosen
w.restore = false
}
if w.minRev < compactRev {
select {
case w.ch <- WatchResponse{WatchID: w.id, CompactRevision: compactRev}:
w.compacted = true
wg.delete(w)
default:
// retry next time
}
continue
}
if minRev > w.minRev {
minRev = w.minRev
}
}
return minRev
}
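// Editor's note: a watcher whose minRev is older than compactRev is canceled
// by chooseAll with a compaction response instead of events, i.e.
//
//	WatchResponse{WatchID: w.id, CompactRevision: compactRev} // no Events
//
// and the client must re-create the watch from a live revision.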
// watcherSetByKey gets the set of watchers that receive events on the given key.
func (wg *watcherGroup) watcherSetByKey(key string) watcherSet {
wkeys := wg.keyWatchers[key]
wranges := wg.ranges.Stab(adt.NewStringAffinePoint(key))
// zero-copy cases
switch {
case len(wranges) == 0 && len(wkeys) == 0:
return nil
case len(wranges) == 0:
// no need to merge ranges or copy; reuse single-key set
return wkeys
case len(wranges) == 1 && len(wkeys) == 0:
return wranges[0].Val.(watcherSet)
}
// copy case
ret := make(watcherSet)
ret.union(wg.keyWatchers[key])
for _, item := range wranges {
ret.union(item.Val.(watcherSet))
}
return ret
}

380
server/mvcc/watcher_test.go Normal file
View File

@@ -0,0 +1,380 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"fmt"
"os"
"reflect"
"testing"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
// TestWatcherWatchID tests that each watcher provides unique watchID,
// and the watched event attaches the correct watchID.
func TestWatcherWatchID(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
idm := make(map[WatchID]struct{})
for i := 0; i < 10; i++ {
id, _ := w.Watch(0, []byte("foo"), nil, 0)
if _, ok := idm[id]; ok {
t.Errorf("#%d: id %d exists", i, id)
}
idm[id] = struct{}{}
s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Error(err)
}
}
s.Put([]byte("foo2"), []byte("bar"), lease.NoLease)
// unsynced watchers
for i := 10; i < 20; i++ {
id, _ := w.Watch(0, []byte("foo2"), nil, 1)
if _, ok := idm[id]; ok {
t.Errorf("#%d: id %d exists", i, id)
}
idm[id] = struct{}{}
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Error(err)
}
}
}
func TestWatcherRequestsCustomID(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
// - Request specifically ID #1
// - Try to duplicate it, get an error
// - Make sure the auto-assignment skips over things we manually assigned
tt := []struct {
givenID WatchID
expectedID WatchID
expectedErr error
}{
{1, 1, nil},
{1, 0, ErrWatcherDuplicateID},
{0, 0, nil},
{0, 2, nil},
}
for i, tcase := range tt {
id, err := w.Watch(tcase.givenID, []byte("foo"), nil, 0)
if tcase.expectedErr != nil || err != nil {
if err != tcase.expectedErr {
t.Errorf("expected get error %q in test case %q, got %q", tcase.expectedErr, i, err)
}
} else if tcase.expectedID != id {
t.Errorf("expected to create ID %d, got %d in test case %d", tcase.expectedID, id, i)
}
}
}
// TestWatcherWatchPrefix tests if Watch operation correctly watches
// and returns events with matching prefixes.
func TestWatcherWatchPrefix(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
idm := make(map[WatchID]struct{})
val := []byte("bar")
keyWatch, keyEnd, keyPut := []byte("foo"), []byte("fop"), []byte("foobar")
for i := 0; i < 10; i++ {
id, _ := w.Watch(0, keyWatch, keyEnd, 0)
if _, ok := idm[id]; ok {
t.Errorf("#%d: unexpected duplicated id %x", i, id)
}
idm[id] = struct{}{}
s.Put(keyPut, val, lease.NoLease)
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Errorf("#%d: unexpected cancel error %v", i, err)
}
if len(resp.Events) != 1 {
t.Errorf("#%d: len(resp.Events) got = %d, want = 1", i, len(resp.Events))
}
if len(resp.Events) == 1 {
if !bytes.Equal(resp.Events[0].Kv.Key, keyPut) {
t.Errorf("#%d: resp.Events got = %s, want = %s", i, resp.Events[0].Kv.Key, keyPut)
}
}
}
keyWatch1, keyEnd1, keyPut1 := []byte("foo1"), []byte("foo2"), []byte("foo1bar")
s.Put(keyPut1, val, lease.NoLease)
// unsynced watchers
for i := 10; i < 15; i++ {
id, _ := w.Watch(0, keyWatch1, keyEnd1, 1)
if _, ok := idm[id]; ok {
t.Errorf("#%d: id %d exists", i, id)
}
idm[id] = struct{}{}
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Error(err)
}
if len(resp.Events) != 1 {
t.Errorf("#%d: len(resp.Events) got = %d, want = 1", i, len(resp.Events))
}
if len(resp.Events) == 1 {
if !bytes.Equal(resp.Events[0].Kv.Key, keyPut1) {
t.Errorf("#%d: resp.Events got = %s, want = %s", i, resp.Events[0].Kv.Key, keyPut1)
}
}
}
}
// TestWatcherWatchWrongRange ensures that a watcher with a wrong 'end' range
// is not created, since it would panic when canceled in the range tree.
func TestWatcherWatchWrongRange(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
if _, err := w.Watch(0, []byte("foa"), []byte("foa"), 1); err != ErrEmptyWatcherRange {
t.Fatalf("key == end range given; expected ErrEmptyWatcherRange, got %+v", err)
}
if _, err := w.Watch(0, []byte("fob"), []byte("foa"), 1); err != ErrEmptyWatcherRange {
t.Fatalf("key > end range given; expected ErrEmptyWatcherRange, got %+v", err)
}
// watch request with 'WithFromKey' has empty-byte range end
if id, _ := w.Watch(0, []byte("foo"), []byte{}, 1); id != 0 {
t.Fatalf("\x00 is range given; id expected 0, got %d", id)
}
}
func TestWatchDeleteRange(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKeyPrefix := []byte("foo")
for i := 0; i < 3; i++ {
s.Put([]byte(fmt.Sprintf("%s_%d", testKeyPrefix, i)), []byte("bar"), lease.NoLease)
}
w := s.NewWatchStream()
from, to := testKeyPrefix, []byte(fmt.Sprintf("%s_%d", testKeyPrefix, 99))
w.Watch(0, from, to, 0)
s.DeleteRange(from, to)
we := []mvccpb.Event{
{Type: mvccpb.DELETE, Kv: &mvccpb.KeyValue{Key: []byte("foo_0"), ModRevision: 5}},
{Type: mvccpb.DELETE, Kv: &mvccpb.KeyValue{Key: []byte("foo_1"), ModRevision: 5}},
{Type: mvccpb.DELETE, Kv: &mvccpb.KeyValue{Key: []byte("foo_2"), ModRevision: 5}},
}
select {
case r := <-w.Chan():
if !reflect.DeepEqual(r.Events, we) {
t.Errorf("event = %v, want %v", r.Events, we)
}
case <-time.After(10 * time.Second):
t.Fatal("failed to receive event after 10 seconds!")
}
}
// TestWatchStreamCancelWatcherByID ensures cancel calls the cancel func of the watcher
// with given id inside watchStream.
func TestWatchStreamCancelWatcherByID(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
id, _ := w.Watch(0, []byte("foo"), nil, 0)
tests := []struct {
cancelID WatchID
werr error
}{
// no error should be returned when canceling the created watcher.
{id, nil},
// a not-exist error should be returned when canceling again.
{id, ErrWatcherNotExist},
// a not-exist error should be returned when canceling a bad id.
{id + 1, ErrWatcherNotExist},
}
for i, tt := range tests {
gerr := w.Cancel(tt.cancelID)
if gerr != tt.werr {
t.Errorf("#%d: err = %v, want %v", i, gerr, tt.werr)
}
}
if l := len(w.(*watchStream).cancels); l != 0 {
t.Errorf("cancels = %d, want 0", l)
}
}
// TestWatcherRequestProgress ensures a synced watcher can correctly
// report its progress.
func TestWatcherRequestProgress(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
// manually create watchableStore instead of newWatchableStore
// because newWatchableStore automatically calls syncWatchers
// method to sync watchers in unsynced map. We want to control
// when watchers are synced in this test.
s := &watchableStore{
store: NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}),
unsynced: newWatcherGroup(),
synced: newWatcherGroup(),
}
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
notTestKey := []byte("bad")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
badID := WatchID(1000)
w.RequestProgress(badID)
select {
case resp := <-w.Chan():
t.Fatalf("unexpected %+v", resp)
default:
}
id, _ := w.Watch(0, notTestKey, nil, 1)
w.RequestProgress(id)
select {
case resp := <-w.Chan():
t.Fatalf("unexpected %+v", resp)
default:
}
s.syncWatchers()
w.RequestProgress(id)
wrs := WatchResponse{WatchID: id, Revision: 2}
select {
case resp := <-w.Chan():
if !reflect.DeepEqual(resp, wrs) {
t.Fatalf("got %+v, expect %+v", resp, wrs)
}
case <-time.After(time.Second):
t.Fatal("failed to receive progress")
}
}
func TestWatcherWatchWithFilter(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
filterPut := func(e mvccpb.Event) bool {
return e.Type == mvccpb.PUT
}
w.Watch(0, []byte("foo"), nil, 0, filterPut)
done := make(chan struct{}, 1)
go func() {
<-w.Chan()
done <- struct{}{}
}()
s.Put([]byte("foo"), []byte("bar"), 0)
select {
case <-done:
t.Fatal("failed to filter put request")
case <-time.After(100 * time.Millisecond):
}
s.DeleteRange([]byte("foo"), nil)
select {
case <-done:
case <-time.After(100 * time.Millisecond):
t.Fatal("failed to receive delete request")
}
}