server: Move server files to 'server' directory.

git mv mvcc wal auth etcdserver etcdmain proxy embed/ lease/ server
git mv go.mod go.sum server
Piotr Tabor committed 2020-10-20 23:05:17 +02:00
parent eee8dec0c3
commit 4a5e9d1261
316 changed files with 0 additions and 0 deletions


@@ -0,0 +1,572 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
"sync/atomic"
"time"
humanize "github.com/dustin/go-humanize"
bolt "go.etcd.io/bbolt"
"go.uber.org/zap"
)
var (
defaultBatchLimit = 10000
defaultBatchInterval = 100 * time.Millisecond
defragLimit = 10000
// initialMmapSize is the initial size of the mmapped region. Setting this larger than
// the potential max db size can prevent the writer from blocking readers.
// This only works on linux.
initialMmapSize = uint64(10 * 1024 * 1024 * 1024)
// minSnapshotWarningTimeout is the minimum threshold to trigger a long running snapshot warning.
minSnapshotWarningTimeout = 30 * time.Second
)
type Backend interface {
// ReadTx returns a read transaction. It is replaced by ConcurrentReadTx in the main data path, see #10523.
ReadTx() ReadTx
BatchTx() BatchTx
// ConcurrentReadTx returns a non-blocking read transaction.
ConcurrentReadTx() ReadTx
Snapshot() Snapshot
Hash(ignores map[IgnoreKey]struct{}) (uint32, error)
// Size returns the current size of the backend physically allocated.
// The backend can hold DB space that is not utilized at the moment,
// since it can conduct pre-allocation or spare unused space for recycling.
// Use SizeInUse() instead for the actual DB size.
Size() int64
// SizeInUse returns the current size of the backend logically in use.
// Since the backend can manage free space in a non-byte unit such as
// number of pages, the returned value may not be exactly accurate in bytes.
SizeInUse() int64
// OpenReadTxN returns the number of currently open read transactions in the backend.
OpenReadTxN() int64
Defrag() error
ForceCommit()
Close() error
}
type Snapshot interface {
// Size gets the size of the snapshot.
Size() int64
// WriteTo writes the snapshot into the given writer.
WriteTo(w io.Writer) (n int64, err error)
// Close closes the snapshot.
Close() error
}
type backend struct {
// size and commits are used with atomic operations so they must be
// 64-bit aligned, otherwise 32-bit tests will crash
// size is the number of bytes allocated in the backend
size int64
// sizeInUse is the number of bytes actually used in the backend
sizeInUse int64
// commits counts number of commits since start
commits int64
// openReadTxN is the number of currently open read transactions in the backend
openReadTxN int64
mu sync.RWMutex
db *bolt.DB
batchInterval time.Duration
batchLimit int
batchTx *batchTxBuffered
readTx *readTx
stopc chan struct{}
donec chan struct{}
lg *zap.Logger
}
type BackendConfig struct {
// Path is the file path to the backend file.
Path string
// BatchInterval is the maximum time before flushing the BatchTx.
BatchInterval time.Duration
// BatchLimit is the maximum puts before flushing the BatchTx.
BatchLimit int
// BackendFreelistType is the backend boltdb's freelist type.
BackendFreelistType bolt.FreelistType
// MmapSize is the number of bytes to mmap for the backend.
MmapSize uint64
// Logger logs backend-side operations.
Logger *zap.Logger
// UnsafeNoFsync disables all uses of fsync.
UnsafeNoFsync bool `json:"unsafe-no-fsync"`
}
func DefaultBackendConfig() BackendConfig {
return BackendConfig{
BatchInterval: defaultBatchInterval,
BatchLimit: defaultBatchLimit,
MmapSize: initialMmapSize,
}
}
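// A minimal usage sketch of the config above (illustrative only; the path is a
// placeholder): tune the flush policy, then open the backend.
//
//	bcfg := DefaultBackendConfig()
//	bcfg.Path = "/var/lib/etcd/member/snap/db" // placeholder path
//	bcfg.BatchInterval = 50 * time.Millisecond // flush pending writes at least every 50ms
//	bcfg.BatchLimit = 5000                     // ...or as soon as 5000 puts are pending
//	b := New(bcfg)
//	defer b.Close()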
func New(bcfg BackendConfig) Backend {
return newBackend(bcfg)
}
func NewDefaultBackend(path string) Backend {
bcfg := DefaultBackendConfig()
bcfg.Path = path
return newBackend(bcfg)
}
func newBackend(bcfg BackendConfig) *backend {
if bcfg.Logger == nil {
bcfg.Logger = zap.NewNop()
}
bopts := &bolt.Options{}
if boltOpenOptions != nil {
*bopts = *boltOpenOptions
}
bopts.InitialMmapSize = bcfg.mmapSize()
bopts.FreelistType = bcfg.BackendFreelistType
bopts.NoSync = bcfg.UnsafeNoFsync
bopts.NoGrowSync = bcfg.UnsafeNoFsync
db, err := bolt.Open(bcfg.Path, 0600, bopts)
if err != nil {
bcfg.Logger.Panic("failed to open database", zap.String("path", bcfg.Path), zap.Error(err))
}
// In the future, we may want to make buffering optional for low-concurrency systems
// or dynamically swap between buffered/non-buffered depending on workload.
b := &backend{
db: db,
batchInterval: bcfg.BatchInterval,
batchLimit: bcfg.BatchLimit,
readTx: &readTx{
baseReadTx: baseReadTx{
buf: txReadBuffer{
txBuffer: txBuffer{make(map[string]*bucketBuffer)},
},
buckets: make(map[string]*bolt.Bucket),
txWg: new(sync.WaitGroup),
txMu: new(sync.RWMutex),
},
},
stopc: make(chan struct{}),
donec: make(chan struct{}),
lg: bcfg.Logger,
}
b.batchTx = newBatchTxBuffered(b)
go b.run()
return b
}
// BatchTx returns the current batch tx in coalescer. The tx can be used for read and
// write operations. The write result can be retrieved within the same tx immediately.
// The write result is isolated from other txs until the current one gets committed.
func (b *backend) BatchTx() BatchTx {
return b.batchTx
}
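// A minimal write-path sketch (bucket and key names are placeholders): all
// Unsafe* calls must happen while holding the tx lock.
//
//	tx := b.BatchTx()
//	tx.Lock()
//	tx.UnsafeCreateBucket([]byte("test"))
//	tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
//	tx.Unlock()     // may auto-commit once batchLimit pending writes accumulate
//	b.ForceCommit() // or force the batch into boltdb immediately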
func (b *backend) ReadTx() ReadTx { return b.readTx }
// ConcurrentReadTx creates and returns a new ReadTx, which:
// A) creates and keeps a copy of backend.readTx.txReadBuffer,
// B) references the boltdb read Tx (and its bucket cache) of current batch interval.
func (b *backend) ConcurrentReadTx() ReadTx {
b.readTx.RLock()
defer b.readTx.RUnlock()
// prevent boltdb read Tx from being rolled back until store read Tx is done. Needs to be called when holding readTx.RLock().
b.readTx.txWg.Add(1)
// TODO: might want to copy the read buffer lazily - create copy when A) end of a write transaction B) end of a batch interval.
return &concurrentReadTx{
baseReadTx: baseReadTx{
buf: b.readTx.buf.unsafeCopy(),
txMu: b.readTx.txMu,
tx: b.readTx.tx,
buckets: b.readTx.buckets,
txWg: b.readTx.txWg,
},
}
}
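// A minimal read-path sketch (names are placeholders): the returned tx pins the
// current boltdb read tx via txWg, so RUnlock must be called to release it.
//
//	rtx := b.ConcurrentReadTx()
//	rtx.RLock() // no-op; kept for ReadTx interface symmetry
//	keys, vals := rtx.UnsafeRange([]byte("key"), []byte("foo"), nil, 0)
//	rtx.RUnlock() // signals txWg so the boltdb tx can eventually be rolled back
//	_, _ = keys, vals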
// ForceCommit forces the current batching tx to commit.
func (b *backend) ForceCommit() {
b.batchTx.Commit()
}
func (b *backend) Snapshot() Snapshot {
b.batchTx.Commit()
b.mu.RLock()
defer b.mu.RUnlock()
tx, err := b.db.Begin(false)
if err != nil {
b.lg.Fatal("failed to begin tx", zap.Error(err))
}
stopc, donec := make(chan struct{}), make(chan struct{})
dbBytes := tx.Size()
go func() {
defer close(donec)
// sendRateBytes is based on transferring snapshot data over a 1 gigabit/s connection
// assuming a min tcp throughput of 100MB/s.
var sendRateBytes int64 = 100 * 1024 * 1024
warningTimeout := time.Duration(int64((float64(dbBytes) / float64(sendRateBytes)) * float64(time.Second)))
if warningTimeout < minSnapshotWarningTimeout {
warningTimeout = minSnapshotWarningTimeout
}
start := time.Now()
ticker := time.NewTicker(warningTimeout)
defer ticker.Stop()
for {
select {
case <-ticker.C:
b.lg.Warn(
"snapshotting taking too long to transfer",
zap.Duration("taking", time.Since(start)),
zap.Int64("bytes", dbBytes),
zap.String("size", humanize.Bytes(uint64(dbBytes))),
)
case <-stopc:
snapshotTransferSec.Observe(time.Since(start).Seconds())
return
}
}
}()
return &snapshot{tx, stopc, donec}
}
type IgnoreKey struct {
Bucket string
Key string
}
func (b *backend) Hash(ignores map[IgnoreKey]struct{}) (uint32, error) {
h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
b.mu.RLock()
defer b.mu.RUnlock()
err := b.db.View(func(tx *bolt.Tx) error {
c := tx.Cursor()
for next, _ := c.First(); next != nil; next, _ = c.Next() {
b := tx.Bucket(next)
if b == nil {
return fmt.Errorf("cannot get hash of bucket %s", string(next))
}
h.Write(next)
b.ForEach(func(k, v []byte) error {
bk := IgnoreKey{Bucket: string(next), Key: string(k)}
if _, ok := ignores[bk]; !ok {
h.Write(k)
h.Write(v)
}
return nil
})
}
return nil
})
if err != nil {
return 0, err
}
return h.Sum32(), nil
}
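// A sketch of skipping a key while hashing; the bucket/key names below are
// illustrative assumptions, not prescribed by this file. Passing a nil map
// hashes every key.
//
//	ignores := map[IgnoreKey]struct{}{
//		{Bucket: "meta", Key: "consistent_index"}: {},
//	}
//	h, err := b.Hash(ignores)
//	_, _ = h, err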
func (b *backend) Size() int64 {
return atomic.LoadInt64(&b.size)
}
func (b *backend) SizeInUse() int64 {
return atomic.LoadInt64(&b.sizeInUse)
}
func (b *backend) run() {
defer close(b.donec)
t := time.NewTimer(b.batchInterval)
defer t.Stop()
for {
select {
case <-t.C:
case <-b.stopc:
b.batchTx.CommitAndStop()
return
}
if b.batchTx.safePending() != 0 {
b.batchTx.Commit()
}
t.Reset(b.batchInterval)
}
}
func (b *backend) Close() error {
close(b.stopc)
<-b.donec
return b.db.Close()
}
// Commits returns total number of commits since start
func (b *backend) Commits() int64 {
return atomic.LoadInt64(&b.commits)
}
func (b *backend) Defrag() error {
return b.defrag()
}
func (b *backend) defrag() error {
now := time.Now()
// TODO: make this non-blocking?
// lock batchTx to ensure nobody is using previous tx, and then
// close previous ongoing tx.
b.batchTx.Lock()
defer b.batchTx.Unlock()
// lock database after lock tx to avoid deadlock.
b.mu.Lock()
defer b.mu.Unlock()
// block concurrent read requests while resetting tx
b.readTx.Lock()
defer b.readTx.Unlock()
b.batchTx.unsafeCommit(true)
b.batchTx.tx = nil
// Create a temporary file to ensure we start with a clean slate.
// Snapshotter.cleanupSnapdir cleans up any of these that are found during startup.
dir := filepath.Dir(b.db.Path())
temp, err := ioutil.TempFile(dir, "db.tmp.*")
if err != nil {
return err
}
options := bolt.Options{}
if boltOpenOptions != nil {
options = *boltOpenOptions
}
options.OpenFile = func(path string, i int, mode os.FileMode) (file *os.File, err error) {
return temp, nil
}
tdbp := temp.Name()
tmpdb, err := bolt.Open(tdbp, 0600, &options)
if err != nil {
return err
}
dbp := b.db.Path()
size1, sizeInUse1 := b.Size(), b.SizeInUse()
if b.lg != nil {
b.lg.Info(
"defragmenting",
zap.String("path", dbp),
zap.Int64("current-db-size-bytes", size1),
zap.String("current-db-size", humanize.Bytes(uint64(size1))),
zap.Int64("current-db-size-in-use-bytes", sizeInUse1),
zap.String("current-db-size-in-use", humanize.Bytes(uint64(sizeInUse1))),
)
}
// gofail: var defragBeforeCopy struct{}
err = defragdb(b.db, tmpdb, defragLimit)
if err != nil {
tmpdb.Close()
if rmErr := os.RemoveAll(tmpdb.Path()); rmErr != nil {
b.lg.Error("failed to remove db.tmp after defragmentation completed", zap.Error(rmErr))
}
return err
}
err = b.db.Close()
if err != nil {
b.lg.Fatal("failed to close database", zap.Error(err))
}
err = tmpdb.Close()
if err != nil {
b.lg.Fatal("failed to close tmp database", zap.Error(err))
}
// gofail: var defragBeforeRename struct{}
err = os.Rename(tdbp, dbp)
if err != nil {
b.lg.Fatal("failed to rename tmp database", zap.Error(err))
}
b.db, err = bolt.Open(dbp, 0600, boltOpenOptions)
if err != nil {
b.lg.Fatal("failed to open database", zap.String("path", dbp), zap.Error(err))
}
b.batchTx.tx = b.unsafeBegin(true)
b.readTx.reset()
b.readTx.tx = b.unsafeBegin(false)
size := b.readTx.tx.Size()
db := b.readTx.tx.DB()
atomic.StoreInt64(&b.size, size)
atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize)))
took := time.Since(now)
defragSec.Observe(took.Seconds())
size2, sizeInUse2 := b.Size(), b.SizeInUse()
if b.lg != nil {
b.lg.Info(
"defragmented",
zap.String("path", dbp),
zap.Int64("current-db-size-bytes-diff", size2-size1),
zap.Int64("current-db-size-bytes", size2),
zap.String("current-db-size", humanize.Bytes(uint64(size2))),
zap.Int64("current-db-size-in-use-bytes-diff", sizeInUse2-sizeInUse1),
zap.Int64("current-db-size-in-use-bytes", sizeInUse2),
zap.String("current-db-size-in-use", humanize.Bytes(uint64(sizeInUse2))),
zap.Duration("took", took),
)
}
return nil
}
func defragdb(odb, tmpdb *bolt.DB, limit int) error {
// open a tx on tmpdb for writes
tmptx, err := tmpdb.Begin(true)
if err != nil {
return err
}
defer func() {
if err != nil {
tmptx.Rollback()
}
}()
// open a tx on old db for read
tx, err := odb.Begin(false)
if err != nil {
return err
}
defer tx.Rollback()
c := tx.Cursor()
count := 0
for next, _ := c.First(); next != nil; next, _ = c.Next() {
b := tx.Bucket(next)
if b == nil {
return fmt.Errorf("backend: cannot defrag bucket %s", string(next))
}
tmpb, berr := tmptx.CreateBucketIfNotExists(next)
if berr != nil {
return berr
}
tmpb.FillPercent = 0.9 // for sequential write in the ForEach below
if err = b.ForEach(func(k, v []byte) error {
count++
if count > limit {
err = tmptx.Commit()
if err != nil {
return err
}
tmptx, err = tmpdb.Begin(true)
if err != nil {
return err
}
tmpb = tmptx.Bucket(next)
tmpb.FillPercent = 0.9 // for sequential write in the ForEach
count = 0
}
return tmpb.Put(k, v)
}); err != nil {
return err
}
}
return tmptx.Commit()
}
func (b *backend) begin(write bool) *bolt.Tx {
b.mu.RLock()
tx := b.unsafeBegin(write)
b.mu.RUnlock()
size := tx.Size()
db := tx.DB()
stats := db.Stats()
atomic.StoreInt64(&b.size, size)
atomic.StoreInt64(&b.sizeInUse, size-(int64(stats.FreePageN)*int64(db.Info().PageSize)))
atomic.StoreInt64(&b.openReadTxN, int64(stats.OpenTxN))
return tx
}
func (b *backend) unsafeBegin(write bool) *bolt.Tx {
tx, err := b.db.Begin(write)
if err != nil {
b.lg.Fatal("failed to begin tx", zap.Error(err))
}
return tx
}
func (b *backend) OpenReadTxN() int64 {
return atomic.LoadInt64(&b.openReadTxN)
}
// NewTmpBackend creates a backend implementation for testing.
func NewTmpBackend(batchInterval time.Duration, batchLimit int) (*backend, string) {
dir, err := ioutil.TempDir(os.TempDir(), "etcd_backend_test")
if err != nil {
panic(err)
}
tmpPath := filepath.Join(dir, "database")
bcfg := DefaultBackendConfig()
bcfg.Path, bcfg.BatchInterval, bcfg.BatchLimit = tmpPath, batchInterval, batchLimit
return newBackend(bcfg), tmpPath
}
func NewDefaultTmpBackend() (*backend, string) {
return NewTmpBackend(defaultBatchInterval, defaultBatchLimit)
}
type snapshot struct {
*bolt.Tx
stopc chan struct{}
donec chan struct{}
}
func (s *snapshot) Close() error {
close(s.stopc)
<-s.donec
return s.Tx.Rollback()
}


@@ -0,0 +1,50 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"crypto/rand"
"os"
"testing"
"time"
)
func BenchmarkBackendPut(b *testing.B) {
backend, tmppath := NewTmpBackend(100*time.Millisecond, 10000)
defer backend.Close()
defer os.Remove(tmppath)
// prepare keys
keys := make([][]byte, b.N)
for i := 0; i < b.N; i++ {
keys[i] = make([]byte, 64)
rand.Read(keys[i])
}
value := make([]byte, 128)
rand.Read(value)
batchTx := backend.BatchTx()
batchTx.Lock()
batchTx.UnsafeCreateBucket([]byte("test"))
batchTx.Unlock()
b.ResetTimer()
for i := 0; i < b.N; i++ {
batchTx.Lock()
batchTx.UnsafePut([]byte("test"), keys[i], value)
batchTx.Unlock()
}
}


@@ -0,0 +1,335 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"fmt"
"io/ioutil"
"os"
"reflect"
"testing"
"time"
bolt "go.etcd.io/bbolt"
)
func TestBackendClose(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer os.Remove(tmpPath)
// check close could work
done := make(chan struct{})
go func() {
err := b.Close()
if err != nil {
t.Errorf("close error = %v, want nil", err)
}
done <- struct{}{}
}()
select {
case <-done:
case <-time.After(10 * time.Second):
t.Errorf("failed to close database in 10s")
}
}
func TestBackendSnapshot(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
b.ForceCommit()
// write snapshot to a new file
f, err := ioutil.TempFile(os.TempDir(), "etcd_backend_test")
if err != nil {
t.Fatal(err)
}
snap := b.Snapshot()
defer snap.Close()
if _, err := snap.WriteTo(f); err != nil {
t.Fatal(err)
}
f.Close()
// bootstrap new backend from the snapshot
bcfg := DefaultBackendConfig()
bcfg.Path, bcfg.BatchInterval, bcfg.BatchLimit = f.Name(), time.Hour, 10000
nb := New(bcfg)
defer cleanup(nb, f.Name())
newTx := nb.BatchTx()
newTx.Lock()
ks, _ := newTx.UnsafeRange([]byte("test"), []byte("foo"), []byte("goo"), 0)
if len(ks) != 1 {
t.Errorf("len(kvs) = %d, want 1", len(ks))
}
newTx.Unlock()
}
func TestBackendBatchIntervalCommit(t *testing.T) {
// start the backend with a super short batch interval so
// we do not need to wait long for the commit to happen.
b, tmpPath := NewTmpBackend(time.Nanosecond, 10000)
defer cleanup(b, tmpPath)
pc := b.Commits()
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
for i := 0; i < 10; i++ {
if b.Commits() >= pc+1 {
break
}
time.Sleep(time.Duration(i*100) * time.Millisecond)
}
// check whether put happens via db view
b.db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket([]byte("test"))
if bucket == nil {
t.Errorf("bucket test does not exit")
return nil
}
v := bucket.Get([]byte("foo"))
if v == nil {
t.Errorf("foo key failed to written in backend")
}
return nil
})
}
func TestBackendDefrag(t *testing.T) {
b, tmpPath := NewDefaultTmpBackend()
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
for i := 0; i < defragLimit+100; i++ {
tx.UnsafePut([]byte("test"), []byte(fmt.Sprintf("foo_%d", i)), []byte("bar"))
}
tx.Unlock()
b.ForceCommit()
// remove some keys to ensure the disk space will be reclaimed after defrag
tx = b.BatchTx()
tx.Lock()
for i := 0; i < 50; i++ {
tx.UnsafeDelete([]byte("test"), []byte(fmt.Sprintf("foo_%d", i)))
}
tx.Unlock()
b.ForceCommit()
size := b.Size()
// shrink and check hash
oh, err := b.Hash(nil)
if err != nil {
t.Fatal(err)
}
err = b.Defrag()
if err != nil {
t.Fatal(err)
}
nh, err := b.Hash(nil)
if err != nil {
t.Fatal(err)
}
if oh != nh {
t.Errorf("hash = %v, want %v", nh, oh)
}
nsize := b.Size()
if nsize >= size {
t.Errorf("new size = %v, want < %d", nsize, size)
}
// try putting more keys after the shrink.
tx = b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("more"), []byte("bar"))
tx.Unlock()
b.ForceCommit()
}
// TestBackendWriteback ensures writes are stored to the read txn on write txn unlock.
func TestBackendWriteback(t *testing.T) {
b, tmpPath := NewDefaultTmpBackend()
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
tx.UnsafePut([]byte("key"), []byte("abc"), []byte("bar"))
tx.UnsafePut([]byte("key"), []byte("def"), []byte("baz"))
tx.UnsafePut([]byte("key"), []byte("overwrite"), []byte("1"))
tx.Unlock()
// overwrites should be propagated too
tx.Lock()
tx.UnsafePut([]byte("key"), []byte("overwrite"), []byte("2"))
tx.Unlock()
keys := []struct {
key []byte
end []byte
limit int64
wkey [][]byte
wval [][]byte
}{
{
key: []byte("abc"),
end: nil,
wkey: [][]byte{[]byte("abc")},
wval: [][]byte{[]byte("bar")},
},
{
key: []byte("abc"),
end: []byte("def"),
wkey: [][]byte{[]byte("abc")},
wval: [][]byte{[]byte("bar")},
},
{
key: []byte("abc"),
end: []byte("deg"),
wkey: [][]byte{[]byte("abc"), []byte("def")},
wval: [][]byte{[]byte("bar"), []byte("baz")},
},
{
key: []byte("abc"),
end: []byte("\xff"),
limit: 1,
wkey: [][]byte{[]byte("abc")},
wval: [][]byte{[]byte("bar")},
},
{
key: []byte("abc"),
end: []byte("\xff"),
wkey: [][]byte{[]byte("abc"), []byte("def"), []byte("overwrite")},
wval: [][]byte{[]byte("bar"), []byte("baz"), []byte("2")},
},
}
rtx := b.ReadTx()
for i, tt := range keys {
rtx.RLock()
k, v := rtx.UnsafeRange([]byte("key"), tt.key, tt.end, tt.limit)
rtx.RUnlock()
if !reflect.DeepEqual(tt.wkey, k) || !reflect.DeepEqual(tt.wval, v) {
t.Errorf("#%d: want k=%+v, v=%+v; got k=%+v, v=%+v", i, tt.wkey, tt.wval, k, v)
}
}
}
// TestConcurrentReadTx ensures that the current read transaction can see all prior writes stored in the read buffer
func TestConcurrentReadTx(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
wtx1 := b.BatchTx()
wtx1.Lock()
wtx1.UnsafeCreateBucket([]byte("key"))
wtx1.UnsafePut([]byte("key"), []byte("abc"), []byte("ABC"))
wtx1.UnsafePut([]byte("key"), []byte("overwrite"), []byte("1"))
wtx1.Unlock()
wtx2 := b.BatchTx()
wtx2.Lock()
wtx2.UnsafePut([]byte("key"), []byte("def"), []byte("DEF"))
wtx2.UnsafePut([]byte("key"), []byte("overwrite"), []byte("2"))
wtx2.Unlock()
rtx := b.ConcurrentReadTx()
rtx.RLock() // no-op
k, v := rtx.UnsafeRange([]byte("key"), []byte("abc"), []byte("\xff"), 0)
rtx.RUnlock()
wKey := [][]byte{[]byte("abc"), []byte("def"), []byte("overwrite")}
wVal := [][]byte{[]byte("ABC"), []byte("DEF"), []byte("2")}
if !reflect.DeepEqual(wKey, k) || !reflect.DeepEqual(wVal, v) {
t.Errorf("want k=%+v, v=%+v; got k=%+v, v=%+v", wKey, wVal, k, v)
}
}
// TestBackendWritebackForEach checks that partially written / buffered
// data is visited in the same order as fully committed data.
func TestBackendWritebackForEach(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
for i := 0; i < 5; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()
// writeback
b.ForceCommit()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
for i := 5; i < 20; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()
seq := ""
getSeq := func(k, v []byte) error {
seq += string(k)
return nil
}
rtx := b.ReadTx()
rtx.RLock()
rtx.UnsafeForEach([]byte("key"), getSeq)
rtx.RUnlock()
partialSeq := seq
seq = ""
b.ForceCommit()
tx.Lock()
tx.UnsafeForEach([]byte("key"), getSeq)
tx.Unlock()
if seq != partialSeq {
t.Fatalf("expected %q, got %q", seq, partialSeq)
}
}
func cleanup(b Backend, path string) {
b.Close()
os.Remove(path)
}


@@ -0,0 +1,307 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"bytes"
"math"
"sync"
"sync/atomic"
"time"
bolt "go.etcd.io/bbolt"
"go.uber.org/zap"
)
type BatchTx interface {
ReadTx
UnsafeCreateBucket(name []byte)
UnsafePut(bucketName []byte, key []byte, value []byte)
UnsafeSeqPut(bucketName []byte, key []byte, value []byte)
UnsafeDelete(bucketName []byte, key []byte)
// Commit commits a previous tx and begins a new writable one.
Commit()
// CommitAndStop commits the previous tx and does not create a new one.
CommitAndStop()
}
type batchTx struct {
sync.Mutex
tx *bolt.Tx
backend *backend
pending int
}
func (t *batchTx) Lock() {
t.Mutex.Lock()
}
func (t *batchTx) Unlock() {
if t.pending >= t.backend.batchLimit {
t.commit(false)
}
t.Mutex.Unlock()
}
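// Sketch of the coalescing rule above: with batchLimit == 1, a single put is
// already committed by the time Unlock returns (see TestBatchTxBatchLimitCommit
// in the tests).
//
//	tx.Lock()
//	tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar")) // pending == 1
//	tx.Unlock() // pending >= batchLimit, so commit(false) runs here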
// BatchTx interface embeds ReadTx interface. But RLock() and RUnlock() do not
// have appropriate semantics in the BatchTx interface, and therefore should not be called.
// TODO: might want to decouple ReadTx and BatchTx
func (t *batchTx) RLock() {
panic("unexpected RLock")
}
func (t *batchTx) RUnlock() {
panic("unexpected RUnlock")
}
func (t *batchTx) UnsafeCreateBucket(name []byte) {
_, err := t.tx.CreateBucket(name)
if err != nil && err != bolt.ErrBucketExists {
t.backend.lg.Fatal(
"failed to create a bucket",
zap.String("bucket-name", string(name)),
zap.Error(err),
)
}
t.pending++
}
// UnsafePut must be called holding the lock on the tx.
func (t *batchTx) UnsafePut(bucketName []byte, key []byte, value []byte) {
t.unsafePut(bucketName, key, value, false)
}
// UnsafeSeqPut must be called holding the lock on the tx.
func (t *batchTx) UnsafeSeqPut(bucketName []byte, key []byte, value []byte) {
t.unsafePut(bucketName, key, value, true)
}
func (t *batchTx) unsafePut(bucketName []byte, key []byte, value []byte, seq bool) {
bucket := t.tx.Bucket(bucketName)
if bucket == nil {
t.backend.lg.Fatal(
"failed to find a bucket",
zap.String("bucket-name", string(bucketName)),
)
}
if seq {
// it is useful to increase fill percent when the workloads are mostly append-only.
// this can delay the page split and reduce space usage.
bucket.FillPercent = 0.9
}
if err := bucket.Put(key, value); err != nil {
t.backend.lg.Fatal(
"failed to write to a bucket",
zap.String("bucket-name", string(bucketName)),
zap.Error(err),
)
}
t.pending++
}
// UnsafeRange must be called holding the lock on the tx.
func (t *batchTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
bucket := t.tx.Bucket(bucketName)
if bucket == nil {
t.backend.lg.Fatal(
"failed to find a bucket",
zap.String("bucket-name", string(bucketName)),
)
}
return unsafeRange(bucket.Cursor(), key, endKey, limit)
}
func unsafeRange(c *bolt.Cursor, key, endKey []byte, limit int64) (keys [][]byte, vs [][]byte) {
if limit <= 0 {
limit = math.MaxInt64
}
var isMatch func(b []byte) bool
if len(endKey) > 0 {
isMatch = func(b []byte) bool { return bytes.Compare(b, endKey) < 0 }
} else {
isMatch = func(b []byte) bool { return bytes.Equal(b, key) }
limit = 1
}
for ck, cv := c.Seek(key); ck != nil && isMatch(ck); ck, cv = c.Next() {
vs = append(vs, cv)
keys = append(keys, ck)
if limit == int64(len(keys)) {
break
}
}
return keys, vs
}
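// Worked example of the semantics above, assuming the cursor's bucket holds
// keys a, b, c: an empty endKey means exact match only, so (key=a, endKey=nil)
// yields just a; a non-empty endKey is half-open, so (key=a, endKey=c) yields
// a and b; a positive limit truncates the scan after that many keys.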
// UnsafeDelete must be called holding the lock on the tx.
func (t *batchTx) UnsafeDelete(bucketName []byte, key []byte) {
bucket := t.tx.Bucket(bucketName)
if bucket == nil {
t.backend.lg.Fatal(
"failed to find a bucket",
zap.String("bucket-name", string(bucketName)),
)
}
err := bucket.Delete(key)
if err != nil {
t.backend.lg.Fatal(
"failed to delete a key",
zap.String("bucket-name", string(bucketName)),
zap.Error(err),
)
}
t.pending++
}
// UnsafeForEach must be called holding the lock on the tx.
func (t *batchTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
return unsafeForEach(t.tx, bucketName, visitor)
}
func unsafeForEach(tx *bolt.Tx, bucket []byte, visitor func(k, v []byte) error) error {
if b := tx.Bucket(bucket); b != nil {
return b.ForEach(visitor)
}
return nil
}
// Commit commits a previous tx and begins a new writable one.
func (t *batchTx) Commit() {
t.Lock()
t.commit(false)
t.Unlock()
}
// CommitAndStop commits the previous tx and does not create a new one.
func (t *batchTx) CommitAndStop() {
t.Lock()
t.commit(true)
t.Unlock()
}
func (t *batchTx) safePending() int {
t.Mutex.Lock()
defer t.Mutex.Unlock()
return t.pending
}
func (t *batchTx) commit(stop bool) {
// commit the last tx
if t.tx != nil {
if t.pending == 0 && !stop {
return
}
start := time.Now()
// gofail: var beforeCommit struct{}
err := t.tx.Commit()
// gofail: var afterCommit struct{}
rebalanceSec.Observe(t.tx.Stats().RebalanceTime.Seconds())
spillSec.Observe(t.tx.Stats().SpillTime.Seconds())
writeSec.Observe(t.tx.Stats().WriteTime.Seconds())
commitSec.Observe(time.Since(start).Seconds())
atomic.AddInt64(&t.backend.commits, 1)
t.pending = 0
if err != nil {
t.backend.lg.Fatal("failed to commit tx", zap.Error(err))
}
}
if !stop {
t.tx = t.backend.begin(true)
}
}
type batchTxBuffered struct {
batchTx
buf txWriteBuffer
}
func newBatchTxBuffered(backend *backend) *batchTxBuffered {
tx := &batchTxBuffered{
batchTx: batchTx{backend: backend},
buf: txWriteBuffer{
txBuffer: txBuffer{make(map[string]*bucketBuffer)},
seq: true,
},
}
tx.Commit()
return tx
}
func (t *batchTxBuffered) Unlock() {
if t.pending != 0 {
t.backend.readTx.Lock() // blocks txReadBuffer for writing.
t.buf.writeback(&t.backend.readTx.buf)
t.backend.readTx.Unlock()
if t.pending >= t.backend.batchLimit {
t.commit(false)
}
}
t.batchTx.Unlock()
}
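// Sketch of the visibility guarantee above: a put becomes visible to ReadTx
// readers as soon as the write lock is released, even if the boltdb commit has
// not happened yet (see TestBackendWriteback in the tests).
//
//	tx.Lock()
//	tx.UnsafePut([]byte("key"), []byte("abc"), []byte("bar"))
//	tx.Unlock() // writeback: readers now observe abc=bar from the read buffer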
func (t *batchTxBuffered) Commit() {
t.Lock()
t.commit(false)
t.Unlock()
}
func (t *batchTxBuffered) CommitAndStop() {
t.Lock()
t.commit(true)
t.Unlock()
}
func (t *batchTxBuffered) commit(stop bool) {
// all read txs must be closed to acquire boltdb commit rwlock
t.backend.readTx.Lock()
t.unsafeCommit(stop)
t.backend.readTx.Unlock()
}
func (t *batchTxBuffered) unsafeCommit(stop bool) {
if t.backend.readTx.tx != nil {
// wait all store read transactions using the current boltdb tx to finish,
// then close the boltdb tx
go func(tx *bolt.Tx, wg *sync.WaitGroup) {
wg.Wait()
if err := tx.Rollback(); err != nil {
t.backend.lg.Fatal("failed to rollback tx", zap.Error(err))
}
}(t.backend.readTx.tx, t.backend.readTx.txWg)
t.backend.readTx.reset()
}
t.batchTx.commit(stop)
if !stop {
t.backend.readTx.tx = t.backend.begin(false)
}
}
func (t *batchTxBuffered) UnsafePut(bucketName []byte, key []byte, value []byte) {
t.batchTx.UnsafePut(bucketName, key, value)
t.buf.put(bucketName, key, value)
}
func (t *batchTxBuffered) UnsafeSeqPut(bucketName []byte, key []byte, value []byte) {
t.batchTx.UnsafeSeqPut(bucketName, key, value)
t.buf.putSeq(bucketName, key, value)
}


@@ -0,0 +1,197 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"reflect"
"testing"
"time"
bolt "go.etcd.io/bbolt"
)
func TestBatchTxPut(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
defer tx.Unlock()
// create bucket
tx.UnsafeCreateBucket([]byte("test"))
// put
v := []byte("bar")
tx.UnsafePut([]byte("test"), []byte("foo"), v)
// check put result before and after tx is committed
for k := 0; k < 2; k++ {
_, gv := tx.UnsafeRange([]byte("test"), []byte("foo"), nil, 0)
if !reflect.DeepEqual(gv[0], v) {
t.Errorf("v = %s, want %s", string(gv[0]), string(v))
}
tx.commit(false)
}
}
func TestBatchTxRange(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
defer tx.Unlock()
tx.UnsafeCreateBucket([]byte("test"))
// put keys
allKeys := [][]byte{[]byte("foo"), []byte("foo1"), []byte("foo2")}
allVals := [][]byte{[]byte("bar"), []byte("bar1"), []byte("bar2")}
for i := range allKeys {
tx.UnsafePut([]byte("test"), allKeys[i], allVals[i])
}
tests := []struct {
key []byte
endKey []byte
limit int64
wkeys [][]byte
wvals [][]byte
}{
// single key
{
[]byte("foo"), nil, 0,
allKeys[:1], allVals[:1],
},
// single key, bad
{
[]byte("doo"), nil, 0,
nil, nil,
},
// key range
{
[]byte("foo"), []byte("foo1"), 0,
allKeys[:1], allVals[:1],
},
// key range, get all keys
{
[]byte("foo"), []byte("foo3"), 0,
allKeys, allVals,
},
// key range, bad
{
[]byte("goo"), []byte("goo3"), 0,
nil, nil,
},
// key range with effective limit
{
[]byte("foo"), []byte("foo3"), 1,
allKeys[:1], allVals[:1],
},
// key range with limit
{
[]byte("foo"), []byte("foo3"), 4,
allKeys, allVals,
},
}
for i, tt := range tests {
keys, vals := tx.UnsafeRange([]byte("test"), tt.key, tt.endKey, tt.limit)
if !reflect.DeepEqual(keys, tt.wkeys) {
t.Errorf("#%d: keys = %+v, want %+v", i, keys, tt.wkeys)
}
if !reflect.DeepEqual(vals, tt.wvals) {
t.Errorf("#%d: vals = %+v, want %+v", i, vals, tt.wvals)
}
}
}
func TestBatchTxDelete(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
defer tx.Unlock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.UnsafeDelete([]byte("test"), []byte("foo"))
// check delete result before and after tx is committed
for k := 0; k < 2; k++ {
ks, _ := tx.UnsafeRange([]byte("test"), []byte("foo"), nil, 0)
if len(ks) != 0 {
t.Errorf("keys on foo = %v, want nil", ks)
}
tx.commit(false)
}
}
func TestBatchTxCommit(t *testing.T) {
b, tmpPath := NewTmpBackend(time.Hour, 10000)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
tx.Commit()
// check whether put happens via db view
b.db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket([]byte("test"))
if bucket == nil {
t.Errorf("bucket test does not exit")
return nil
}
v := bucket.Get([]byte("foo"))
if v == nil {
t.Errorf("foo key failed to written in backend")
}
return nil
})
}
func TestBatchTxBatchLimitCommit(t *testing.T) {
// start backend with batch limit 1 so one write can
// trigger a commit
b, tmpPath := NewTmpBackend(time.Hour, 1)
defer cleanup(b, tmpPath)
tx := b.batchTx
tx.Lock()
tx.UnsafeCreateBucket([]byte("test"))
tx.UnsafePut([]byte("test"), []byte("foo"), []byte("bar"))
tx.Unlock()
// batch limit commit should have been triggered
// check whether put happens via db view
b.db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket([]byte("test"))
if bucket == nil {
t.Errorf("bucket test does not exit")
return nil
}
v := bucket.Get([]byte("foo"))
if v == nil {
t.Errorf("foo key failed to written in backend")
}
return nil
})
}


@@ -0,0 +1,23 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !linux,!windows
package backend
import bolt "go.etcd.io/bbolt"
var boltOpenOptions *bolt.Options
func (bcfg *BackendConfig) mmapSize() int { return int(bcfg.MmapSize) }


@@ -0,0 +1,34 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"syscall"
bolt "go.etcd.io/bbolt"
)
// syscall.MAP_POPULATE on linux 2.6.23+ does sequential read-ahead
// which can speed up entire-database read with boltdb. We want to
// enable MAP_POPULATE for faster key-value store recovery in storage
// package. If your kernel version is lower than 2.6.23
// (https://github.com/torvalds/linux/releases/tag/v2.6.23), mmap might
// silently ignore this flag. Please update your kernel to prevent this.
var boltOpenOptions = &bolt.Options{
MmapFlags: syscall.MAP_POPULATE,
NoFreelistSync: true,
}
func (bcfg *BackendConfig) mmapSize() int { return int(bcfg.MmapSize) }


@@ -0,0 +1,26 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build windows
package backend
import bolt "go.etcd.io/bbolt"
var boltOpenOptions *bolt.Options = nil
// setting mmap size != 0 on windows will allocate the entire
// mmap size for the file, instead of growing it. So, force 0.
func (bcfg *BackendConfig) mmapSize() int { return 0 }


@@ -0,0 +1,16 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package backend defines a standard interface for etcd's backend MVCC storage.
package backend


@@ -0,0 +1,95 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import "github.com/prometheus/client_golang/prometheus"
var (
commitSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "disk",
Name: "backend_commit_duration_seconds",
Help: "The latency distributions of commit called by backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
rebalanceSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "disk",
Name: "backend_commit_rebalance_duration_seconds",
Help: "The latency distributions of commit.rebalance called by bboltdb backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
spillSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "disk",
Name: "backend_commit_spill_duration_seconds",
Help: "The latency distributions of commit.spill called by bboltdb backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
writeSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "disk",
Name: "backend_commit_write_duration_seconds",
Help: "The latency distributions of commit.write called by bboltdb backend.",
// lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2
// highest bucket start of 0.001 sec * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
})
defragSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "disk",
Name: "backend_defrag_duration_seconds",
Help: "The latency distribution of backend defragmentation.",
// 100 MB usually takes 1 sec, so start with 10 MB of 100 ms
// lowest bucket start of upper bound 0.1 sec (100 ms) with factor 2
// highest bucket start of 0.1 sec * 2^12 == 409.6 sec
Buckets: prometheus.ExponentialBuckets(.1, 2, 13),
})
snapshotTransferSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "disk",
Name: "backend_snapshot_duration_seconds",
Help: "The latency distribution of backend snapshots.",
// lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
// highest bucket start of 0.01 sec * 2^16 == 655.36 sec
Buckets: prometheus.ExponentialBuckets(.01, 2, 17),
})
)
func init() {
prometheus.MustRegister(commitSec)
prometheus.MustRegister(rebalanceSec)
prometheus.MustRegister(spillSec)
prometheus.MustRegister(writeSec)
prometheus.MustRegister(defragSec)
prometheus.MustRegister(snapshotTransferSec)
}


@@ -0,0 +1,153 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"bytes"
"math"
"sync"
bolt "go.etcd.io/bbolt"
)
// safeRangeBucket is a hack to avoid inadvertently reading duplicate keys;
// overwrites on a bucket should only fetch with limit=1, but safeRangeBucket
// is known to never overwrite any key so range is safe.
var safeRangeBucket = []byte("key")
type ReadTx interface {
Lock()
Unlock()
RLock()
RUnlock()
UnsafeRange(bucketName []byte, key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte)
UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error
}
// Base type for readTx and concurrentReadTx to eliminate duplicate functions between them
type baseReadTx struct {
// mu protects accesses to the txReadBuffer
mu sync.RWMutex
buf txReadBuffer
// TODO: group and encapsulate {txMu, tx, buckets, txWg}, as they share the same lifecycle.
// txMu protects accesses to buckets and tx on Range requests.
txMu *sync.RWMutex
tx *bolt.Tx
buckets map[string]*bolt.Bucket
// txWg protects tx from being rolled back at the end of a batch interval until all reads using this tx are done.
txWg *sync.WaitGroup
}
func (baseReadTx *baseReadTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
dups := make(map[string]struct{})
getDups := func(k, v []byte) error {
dups[string(k)] = struct{}{}
return nil
}
visitNoDup := func(k, v []byte) error {
if _, ok := dups[string(k)]; ok {
return nil
}
return visitor(k, v)
}
if err := baseReadTx.buf.ForEach(bucketName, getDups); err != nil {
return err
}
baseReadTx.txMu.Lock()
err := unsafeForEach(baseReadTx.tx, bucketName, visitNoDup)
baseReadTx.txMu.Unlock()
if err != nil {
return err
}
return baseReadTx.buf.ForEach(bucketName, visitor)
}
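// In the method above, buffered (pending) keys are collected first so the
// boltdb pass can skip them, and the buffer pass then visits them last; this
// keeps buffered and committed data in a single consistent visit order
// (see TestBackendWritebackForEach).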
func (baseReadTx *baseReadTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
if endKey == nil {
// forbid duplicates for single keys
limit = 1
}
if limit <= 0 {
limit = math.MaxInt64
}
if limit > 1 && !bytes.Equal(bucketName, safeRangeBucket) {
panic("do not use unsafeRange on non-keys bucket")
}
keys, vals := baseReadTx.buf.Range(bucketName, key, endKey, limit)
if int64(len(keys)) == limit {
return keys, vals
}
// find/cache bucket
bn := string(bucketName)
baseReadTx.txMu.RLock()
bucket, ok := baseReadTx.buckets[bn]
baseReadTx.txMu.RUnlock()
lockHeld := false
if !ok {
baseReadTx.txMu.Lock()
lockHeld = true
bucket = baseReadTx.tx.Bucket(bucketName)
baseReadTx.buckets[bn] = bucket
}
// ignore a missing bucket since it may have been created in this batch
if bucket == nil {
if lockHeld {
baseReadTx.txMu.Unlock()
}
return keys, vals
}
if !lockHeld {
baseReadTx.txMu.Lock()
lockHeld = true
}
c := bucket.Cursor()
baseReadTx.txMu.Unlock()
k2, v2 := unsafeRange(c, key, endKey, limit-int64(len(keys)))
return append(k2, keys...), append(v2, vals...)
}
type readTx struct {
baseReadTx
}
func (rt *readTx) Lock() { rt.mu.Lock() }
func (rt *readTx) Unlock() { rt.mu.Unlock() }
func (rt *readTx) RLock() { rt.mu.RLock() }
func (rt *readTx) RUnlock() { rt.mu.RUnlock() }
func (rt *readTx) reset() {
rt.buf.reset()
rt.buckets = make(map[string]*bolt.Bucket)
rt.tx = nil
rt.txWg = new(sync.WaitGroup)
}
type concurrentReadTx struct {
baseReadTx
}
func (rt *concurrentReadTx) Lock() {}
func (rt *concurrentReadTx) Unlock() {}
// RLock is no-op. concurrentReadTx does not need to be locked after it is created.
func (rt *concurrentReadTx) RLock() {}
// RUnlock signals the end of concurrentReadTx.
func (rt *concurrentReadTx) RUnlock() { rt.txWg.Done() }


@@ -0,0 +1,203 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package backend
import (
"bytes"
"sort"
)
// txBuffer handles functionality shared between txWriteBuffer and txReadBuffer.
type txBuffer struct {
buckets map[string]*bucketBuffer
}
func (txb *txBuffer) reset() {
for k, v := range txb.buckets {
if v.used == 0 {
// demote
delete(txb.buckets, k)
}
v.used = 0
}
}
// txWriteBuffer buffers writes of pending updates that have not yet committed.
type txWriteBuffer struct {
txBuffer
seq bool
}
func (txw *txWriteBuffer) put(bucket, k, v []byte) {
txw.seq = false
txw.putSeq(bucket, k, v)
}
func (txw *txWriteBuffer) putSeq(bucket, k, v []byte) {
b, ok := txw.buckets[string(bucket)]
if !ok {
b = newBucketBuffer()
txw.buckets[string(bucket)] = b
}
b.add(k, v)
}
func (txw *txWriteBuffer) writeback(txr *txReadBuffer) {
for k, wb := range txw.buckets {
rb, ok := txr.buckets[k]
if !ok {
delete(txw.buckets, k)
txr.buckets[k] = wb
continue
}
if !txw.seq && wb.used > 1 {
// assume no duplicate keys
sort.Sort(wb)
}
rb.merge(wb)
}
txw.reset()
}
// txReadBuffer accesses buffered updates.
type txReadBuffer struct{ txBuffer }
func (txr *txReadBuffer) Range(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
if b := txr.buckets[string(bucketName)]; b != nil {
return b.Range(key, endKey, limit)
}
return nil, nil
}
func (txr *txReadBuffer) ForEach(bucketName []byte, visitor func(k, v []byte) error) error {
if b := txr.buckets[string(bucketName)]; b != nil {
return b.ForEach(visitor)
}
return nil
}
// unsafeCopy returns a copy of txReadBuffer; the caller should hold backend.readTx.RLock()
func (txr *txReadBuffer) unsafeCopy() txReadBuffer {
txrCopy := txReadBuffer{
txBuffer: txBuffer{
buckets: make(map[string]*bucketBuffer, len(txr.txBuffer.buckets)),
},
}
for bucketName, bucket := range txr.txBuffer.buckets {
txrCopy.txBuffer.buckets[bucketName] = bucket.Copy()
}
return txrCopy
}
type kv struct {
key []byte
val []byte
}
// bucketBuffer buffers key-value pairs that are pending commit.
type bucketBuffer struct {
buf []kv
// used tracks number of elements in use so buf can be reused without reallocation.
used int
}
func newBucketBuffer() *bucketBuffer {
return &bucketBuffer{buf: make([]kv, 512), used: 0}
}
func (bb *bucketBuffer) Range(key, endKey []byte, limit int64) (keys [][]byte, vals [][]byte) {
f := func(i int) bool { return bytes.Compare(bb.buf[i].key, key) >= 0 }
idx := sort.Search(bb.used, f)
if idx < 0 {
return nil, nil
}
if len(endKey) == 0 {
if bytes.Equal(key, bb.buf[idx].key) {
keys = append(keys, bb.buf[idx].key)
vals = append(vals, bb.buf[idx].val)
}
return keys, vals
}
if bytes.Compare(endKey, bb.buf[idx].key) <= 0 {
return nil, nil
}
for i := idx; i < bb.used && int64(len(keys)) < limit; i++ {
if bytes.Compare(endKey, bb.buf[i].key) <= 0 {
break
}
keys = append(keys, bb.buf[i].key)
vals = append(vals, bb.buf[i].val)
}
return keys, vals
}
func (bb *bucketBuffer) ForEach(visitor func(k, v []byte) error) error {
for i := 0; i < bb.used; i++ {
if err := visitor(bb.buf[i].key, bb.buf[i].val); err != nil {
return err
}
}
return nil
}
func (bb *bucketBuffer) add(k, v []byte) {
bb.buf[bb.used].key, bb.buf[bb.used].val = k, v
bb.used++
if bb.used == len(bb.buf) {
buf := make([]kv, (3*len(bb.buf))/2)
copy(buf, bb.buf)
bb.buf = buf
}
}
// merge merges data from bbsrc into bb.
func (bb *bucketBuffer) merge(bbsrc *bucketBuffer) {
for i := 0; i < bbsrc.used; i++ {
bb.add(bbsrc.buf[i].key, bbsrc.buf[i].val)
}
if bb.used == bbsrc.used {
return
}
if bytes.Compare(bb.buf[(bb.used-bbsrc.used)-1].key, bbsrc.buf[0].key) < 0 {
return
}
sort.Stable(bb)
// remove duplicates, using only newest update
widx := 0
for ridx := 1; ridx < bb.used; ridx++ {
if !bytes.Equal(bb.buf[ridx].key, bb.buf[widx].key) {
widx++
}
bb.buf[widx] = bb.buf[ridx]
}
bb.used = widx + 1
}
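// Worked example of the dedup rule above: if bb holds [(a,1), (b,1)] and bbsrc
// holds [(a,2)], the adds yield [(a,1), (b,1), (a,2)], the stable sort yields
// [(a,1), (a,2), (b,1)], and the rewrite pass keeps only the newest update per
// key, leaving [(a,2), (b,1)] with used == 2.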
func (bb *bucketBuffer) Len() int { return bb.used }
func (bb *bucketBuffer) Less(i, j int) bool {
return bytes.Compare(bb.buf[i].key, bb.buf[j].key) < 0
}
func (bb *bucketBuffer) Swap(i, j int) { bb.buf[i], bb.buf[j] = bb.buf[j], bb.buf[i] }
func (bb *bucketBuffer) Copy() *bucketBuffer {
bbCopy := bucketBuffer{
buf: make([]kv, len(bb.buf)),
used: bb.used,
}
copy(bbCopy.buf, bb.buf)
return &bbCopy
}

server/mvcc/doc.go

@@ -0,0 +1,16 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package mvcc defines etcd's stable MVCC storage.
package mvcc

server/mvcc/index.go

@@ -0,0 +1,279 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"sort"
"sync"
"github.com/google/btree"
"go.uber.org/zap"
)
type index interface {
Get(key []byte, atRev int64) (rev, created revision, ver int64, err error)
Range(key, end []byte, atRev int64) ([][]byte, []revision)
Revisions(key, end []byte, atRev int64, limit int) []revision
CountRevisions(key, end []byte, atRev int64, limit int) int
Put(key []byte, rev revision)
Tombstone(key []byte, rev revision) error
RangeSince(key, end []byte, rev int64) []revision
Compact(rev int64) map[revision]struct{}
Keep(rev int64) map[revision]struct{}
Equal(b index) bool
Insert(ki *keyIndex)
KeyIndex(ki *keyIndex) *keyIndex
}
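// A minimal lifecycle sketch, mirroring TestIndexGet later in this commit:
// each Put records a new revision of the key, and Tombstone ends the current
// generation, after which Get at a later rev reports ErrRevisionNotFound.
//
//	ti := newTreeIndex(zap.NewNop())
//	ti.Put([]byte("foo"), revision{main: 2})
//	ti.Put([]byte("foo"), revision{main: 4})
//	ti.Tombstone([]byte("foo"), revision{main: 6})
//	rev, created, ver, err := ti.Get([]byte("foo"), 5)
//	// rev == revision{main: 4}, created == revision{main: 2}, ver == 2, err == nil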
type treeIndex struct {
sync.RWMutex
tree *btree.BTree
lg *zap.Logger
}
func newTreeIndex(lg *zap.Logger) index {
return &treeIndex{
tree: btree.New(32),
lg: lg,
}
}
func (ti *treeIndex) Put(key []byte, rev revision) {
keyi := &keyIndex{key: key}
ti.Lock()
defer ti.Unlock()
item := ti.tree.Get(keyi)
if item == nil {
keyi.put(ti.lg, rev.main, rev.sub)
ti.tree.ReplaceOrInsert(keyi)
return
}
okeyi := item.(*keyIndex)
okeyi.put(ti.lg, rev.main, rev.sub)
}
func (ti *treeIndex) Get(key []byte, atRev int64) (modified, created revision, ver int64, err error) {
keyi := &keyIndex{key: key}
ti.RLock()
defer ti.RUnlock()
if keyi = ti.keyIndex(keyi); keyi == nil {
return revision{}, revision{}, 0, ErrRevisionNotFound
}
return keyi.get(ti.lg, atRev)
}
func (ti *treeIndex) KeyIndex(keyi *keyIndex) *keyIndex {
ti.RLock()
defer ti.RUnlock()
return ti.keyIndex(keyi)
}
func (ti *treeIndex) keyIndex(keyi *keyIndex) *keyIndex {
if item := ti.tree.Get(keyi); item != nil {
return item.(*keyIndex)
}
return nil
}
func (ti *treeIndex) visit(key, end []byte, f func(ki *keyIndex) bool) {
keyi, endi := &keyIndex{key: key}, &keyIndex{key: end}
ti.RLock()
defer ti.RUnlock()
ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
if len(endi.key) > 0 && !item.Less(endi) {
return false
}
if !f(item.(*keyIndex)) {
return false
}
return true
})
}
func (ti *treeIndex) Revisions(key, end []byte, atRev int64, limit int) (revs []revision) {
if end == nil {
rev, _, _, err := ti.Get(key, atRev)
if err != nil {
return nil
}
return []revision{rev}
}
ti.visit(key, end, func(ki *keyIndex) bool {
if rev, _, _, err := ki.get(ti.lg, atRev); err == nil {
revs = append(revs, rev)
if len(revs) == limit {
return false
}
}
return true
})
return revs
}
func (ti *treeIndex) CountRevisions(key, end []byte, atRev int64, limit int) int {
if end == nil {
_, _, _, err := ti.Get(key, atRev)
if err != nil {
return 0
}
return 1
}
total := 0
ti.visit(key, end, func(ki *keyIndex) bool {
if _, _, _, err := ki.get(ti.lg, atRev); err == nil {
total++
if total == limit {
return false
}
}
return true
})
return total
}
func (ti *treeIndex) Range(key, end []byte, atRev int64) (keys [][]byte, revs []revision) {
if end == nil {
rev, _, _, err := ti.Get(key, atRev)
if err != nil {
return nil, nil
}
return [][]byte{key}, []revision{rev}
}
ti.visit(key, end, func(ki *keyIndex) bool {
if rev, _, _, err := ki.get(ti.lg, atRev); err == nil {
revs = append(revs, rev)
keys = append(keys, ki.key)
}
return true
})
return keys, revs
}
func (ti *treeIndex) Tombstone(key []byte, rev revision) error {
keyi := &keyIndex{key: key}
ti.Lock()
defer ti.Unlock()
item := ti.tree.Get(keyi)
if item == nil {
return ErrRevisionNotFound
}
ki := item.(*keyIndex)
return ki.tombstone(ti.lg, rev.main, rev.sub)
}
// RangeSince returns all revisions from key (inclusive) to end (exclusive)
// at or after the given rev. The returned slice is sorted in the order
// of revision.
func (ti *treeIndex) RangeSince(key, end []byte, rev int64) []revision {
keyi := &keyIndex{key: key}
ti.RLock()
defer ti.RUnlock()
if end == nil {
item := ti.tree.Get(keyi)
if item == nil {
return nil
}
keyi = item.(*keyIndex)
return keyi.since(ti.lg, rev)
}
endi := &keyIndex{key: end}
var revs []revision
ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
if len(endi.key) > 0 && !item.Less(endi) {
return false
}
curKeyi := item.(*keyIndex)
revs = append(revs, curKeyi.since(ti.lg, rev)...)
return true
})
sort.Sort(revisions(revs))
return revs
}
func (ti *treeIndex) Compact(rev int64) map[revision]struct{} {
available := make(map[revision]struct{})
ti.lg.Info("compact tree index", zap.Int64("revision", rev))
ti.Lock()
clone := ti.tree.Clone()
ti.Unlock()
clone.Ascend(func(item btree.Item) bool {
keyi := item.(*keyIndex)
// Lock is needed here to prevent modification to the keyIndex while
// compaction is going on, or a revision being added to an empty keyIndex before deletion.
ti.Lock()
keyi.compact(ti.lg, rev, available)
if keyi.isEmpty() {
item := ti.tree.Delete(keyi)
if item == nil {
ti.lg.Panic("failed to delete during compaction")
}
}
ti.Unlock()
return true
})
return available
}
// Keep finds all revisions to be kept for a Compaction at the given rev.
func (ti *treeIndex) Keep(rev int64) map[revision]struct{} {
available := make(map[revision]struct{})
ti.RLock()
defer ti.RUnlock()
ti.tree.Ascend(func(i btree.Item) bool {
keyi := i.(*keyIndex)
keyi.keep(rev, available)
return true
})
return available
}
func (ti *treeIndex) Equal(bi index) bool {
b := bi.(*treeIndex)
if ti.tree.Len() != b.tree.Len() {
return false
}
equal := true
ti.tree.Ascend(func(item btree.Item) bool {
aki := item.(*keyIndex)
bki := b.tree.Get(item).(*keyIndex)
if !aki.equal(bki) {
equal = false
return false
}
return true
})
return equal
}
func (ti *treeIndex) Insert(ki *keyIndex) {
ti.Lock()
defer ti.Unlock()
ti.tree.ReplaceOrInsert(ki)
}
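// exampleTreeIndexUsage is an illustrative sketch added for this write-up and
// not part of the original commit: it shows the typical Put/Get/Tombstone
// flow against a treeIndex, assuming only the package-level types above.
func exampleTreeIndexUsage(lg *zap.Logger) {
ti := newTreeIndex(lg)
ti.Put([]byte("foo"), revision{main: 2})
ti.Put([]byte("foo"), revision{main: 4})
if rev, created, ver, err := ti.Get([]byte("foo"), 4); err == nil {
// rev == {4, 0}, created == {2, 0}, ver == 2
lg.Info("get", zap.Int64("rev", rev.main), zap.Int64("created", created.main), zap.Int64("ver", ver))
}
// Tombstone closes the current generation; a Get at a later rev returns
// ErrRevisionNotFound until the key is put again.
_ = ti.Tombstone([]byte("foo"), revision{main: 6})
}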

42
server/mvcc/index_bench_test.go Normal file
View File

@@ -0,0 +1,42 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"testing"
"go.uber.org/zap"
)
func BenchmarkIndexCompact1(b *testing.B) { benchmarkIndexCompact(b, 1) }
func BenchmarkIndexCompact100(b *testing.B) { benchmarkIndexCompact(b, 100) }
func BenchmarkIndexCompact10000(b *testing.B) { benchmarkIndexCompact(b, 10000) }
func BenchmarkIndexCompact100000(b *testing.B) { benchmarkIndexCompact(b, 100000) }
func BenchmarkIndexCompact1000000(b *testing.B) { benchmarkIndexCompact(b, 1000000) }
func benchmarkIndexCompact(b *testing.B, size int) {
log := zap.NewNop()
kvindex := newTreeIndex(log)
bytesN := 64
keys := createBytesSlice(bytesN, size)
for i := 1; i < size; i++ {
kvindex.Put(keys[i], revision{main: int64(i), sub: int64(i)})
}
b.ResetTimer()
for i := 1; i < b.N; i++ {
kvindex.Compact(int64(i))
}
}
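// A usage note added for this write-up (not part of the original commit):
// the benchmarks above can be run in isolation with the standard Go tooling,
// for example (the package path assumes the post-move layout):
//
// go test -run '^$' -bench 'BenchmarkIndexCompact' ./server/mvcc/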

293
server/mvcc/index_test.go Normal file
View File

@@ -0,0 +1,293 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"reflect"
"testing"
"github.com/google/btree"
"go.uber.org/zap"
)
func TestIndexGet(t *testing.T) {
ti := newTreeIndex(zap.NewExample())
ti.Put([]byte("foo"), revision{main: 2})
ti.Put([]byte("foo"), revision{main: 4})
ti.Tombstone([]byte("foo"), revision{main: 6})
tests := []struct {
rev int64
wrev revision
wcreated revision
wver int64
werr error
}{
{0, revision{}, revision{}, 0, ErrRevisionNotFound},
{1, revision{}, revision{}, 0, ErrRevisionNotFound},
{2, revision{main: 2}, revision{main: 2}, 1, nil},
{3, revision{main: 2}, revision{main: 2}, 1, nil},
{4, revision{main: 4}, revision{main: 2}, 2, nil},
{5, revision{main: 4}, revision{main: 2}, 2, nil},
{6, revision{}, revision{}, 0, ErrRevisionNotFound},
}
for i, tt := range tests {
rev, created, ver, err := ti.Get([]byte("foo"), tt.rev)
if err != tt.werr {
t.Errorf("#%d: err = %v, want %v", i, err, tt.werr)
}
if rev != tt.wrev {
t.Errorf("#%d: rev = %+v, want %+v", i, rev, tt.wrev)
}
if created != tt.wcreated {
t.Errorf("#%d: created = %+v, want %+v", i, created, tt.wcreated)
}
if ver != tt.wver {
t.Errorf("#%d: ver = %d, want %d", i, ver, tt.wver)
}
}
}
func TestIndexRange(t *testing.T) {
allKeys := [][]byte{[]byte("foo"), []byte("foo1"), []byte("foo2")}
allRevs := []revision{{main: 1}, {main: 2}, {main: 3}}
ti := newTreeIndex(zap.NewExample())
for i := range allKeys {
ti.Put(allKeys[i], allRevs[i])
}
atRev := int64(3)
tests := []struct {
key, end []byte
wkeys [][]byte
wrevs []revision
}{
// single key that is not found
{
[]byte("bar"), nil, nil, nil,
},
// single key that is found
{
[]byte("foo"), nil, allKeys[:1], allRevs[:1],
},
// range keys, return first member
{
[]byte("foo"), []byte("foo1"), allKeys[:1], allRevs[:1],
},
// range keys, return first two members
{
[]byte("foo"), []byte("foo2"), allKeys[:2], allRevs[:2],
},
// range keys, return all members
{
[]byte("foo"), []byte("fop"), allKeys, allRevs,
},
// range keys, return last two members
{
[]byte("foo1"), []byte("fop"), allKeys[1:], allRevs[1:],
},
// range keys, return last member
{
[]byte("foo2"), []byte("fop"), allKeys[2:], allRevs[2:],
},
// range keys, return nothing
{
[]byte("foo3"), []byte("fop"), nil, nil,
},
}
for i, tt := range tests {
keys, revs := ti.Range(tt.key, tt.end, atRev)
if !reflect.DeepEqual(keys, tt.wkeys) {
t.Errorf("#%d: keys = %+v, want %+v", i, keys, tt.wkeys)
}
if !reflect.DeepEqual(revs, tt.wrevs) {
t.Errorf("#%d: revs = %+v, want %+v", i, revs, tt.wrevs)
}
}
}
func TestIndexTombstone(t *testing.T) {
ti := newTreeIndex(zap.NewExample())
ti.Put([]byte("foo"), revision{main: 1})
err := ti.Tombstone([]byte("foo"), revision{main: 2})
if err != nil {
t.Errorf("tombstone error = %v, want nil", err)
}
_, _, _, err = ti.Get([]byte("foo"), 2)
if err != ErrRevisionNotFound {
t.Errorf("get error = %v, want ErrRevisionNotFound", err)
}
err = ti.Tombstone([]byte("foo"), revision{main: 3})
if err != ErrRevisionNotFound {
t.Errorf("tombstone error = %v, want %v", err, ErrRevisionNotFound)
}
}
func TestIndexRangeSince(t *testing.T) {
allKeys := [][]byte{[]byte("foo"), []byte("foo1"), []byte("foo2"), []byte("foo2"), []byte("foo1"), []byte("foo")}
allRevs := []revision{{main: 1}, {main: 2}, {main: 3}, {main: 4}, {main: 5}, {main: 6}}
ti := newTreeIndex(zap.NewExample())
for i := range allKeys {
ti.Put(allKeys[i], allRevs[i])
}
atRev := int64(1)
tests := []struct {
key, end []byte
wrevs []revision
}{
// single key that is not found
{
[]byte("bar"), nil, nil,
},
// single key that is found
{
[]byte("foo"), nil, []revision{{main: 1}, {main: 6}},
},
// range keys, return first member
{
[]byte("foo"), []byte("foo1"), []revision{{main: 1}, {main: 6}},
},
// range keys, return first two members
{
[]byte("foo"), []byte("foo2"), []revision{{main: 1}, {main: 2}, {main: 5}, {main: 6}},
},
// range keys, return all members
{
[]byte("foo"), []byte("fop"), allRevs,
},
// range keys, return last two members
{
[]byte("foo1"), []byte("fop"), []revision{{main: 2}, {main: 3}, {main: 4}, {main: 5}},
},
// range keys, return last member
{
[]byte("foo2"), []byte("fop"), []revision{{main: 3}, {main: 4}},
},
// range keys, return nothing
{
[]byte("foo3"), []byte("fop"), nil,
},
}
for i, tt := range tests {
revs := ti.RangeSince(tt.key, tt.end, atRev)
if !reflect.DeepEqual(revs, tt.wrevs) {
t.Errorf("#%d: revs = %+v, want %+v", i, revs, tt.wrevs)
}
}
}
func TestIndexCompactAndKeep(t *testing.T) {
maxRev := int64(20)
tests := []struct {
key []byte
remove bool
rev revision
created revision
ver int64
}{
{[]byte("foo"), false, revision{main: 1}, revision{main: 1}, 1},
{[]byte("foo1"), false, revision{main: 2}, revision{main: 2}, 1},
{[]byte("foo2"), false, revision{main: 3}, revision{main: 3}, 1},
{[]byte("foo2"), false, revision{main: 4}, revision{main: 3}, 2},
{[]byte("foo"), false, revision{main: 5}, revision{main: 1}, 2},
{[]byte("foo1"), false, revision{main: 6}, revision{main: 2}, 2},
{[]byte("foo1"), true, revision{main: 7}, revision{}, 0},
{[]byte("foo2"), true, revision{main: 8}, revision{}, 0},
{[]byte("foo"), true, revision{main: 9}, revision{}, 0},
{[]byte("foo"), false, revision{10, 0}, revision{10, 0}, 1},
{[]byte("foo1"), false, revision{10, 1}, revision{10, 1}, 1},
}
// Continuous Compact and Keep
ti := newTreeIndex(zap.NewExample())
for _, tt := range tests {
if tt.remove {
ti.Tombstone(tt.key, tt.rev)
} else {
ti.Put(tt.key, tt.rev)
}
}
for i := int64(1); i < maxRev; i++ {
am := ti.Compact(i)
keep := ti.Keep(i)
if !reflect.DeepEqual(am, keep) {
t.Errorf("#%d: compact keep %v != Keep keep %v", i, am, keep)
}
wti := &treeIndex{tree: btree.New(32)}
for _, tt := range tests {
if _, ok := am[tt.rev]; ok || tt.rev.GreaterThan(revision{main: i}) {
if tt.remove {
wti.Tombstone(tt.key, tt.rev)
} else {
restore(wti, tt.key, tt.created, tt.rev, tt.ver)
}
}
}
if !ti.Equal(wti) {
t.Errorf("#%d: not equal ti", i)
}
}
// Once Compact and Keep
for i := int64(1); i < maxRev; i++ {
ti := newTreeIndex(zap.NewExample())
for _, tt := range tests {
if tt.remove {
ti.Tombstone(tt.key, tt.rev)
} else {
ti.Put(tt.key, tt.rev)
}
}
am := ti.Compact(i)
keep := ti.Keep(i)
if !reflect.DeepEqual(am, keep) {
t.Errorf("#%d: compact keep %v != Keep keep %v", i, am, keep)
}
wti := &treeIndex{tree: btree.New(32)}
for _, tt := range tests {
if _, ok := am[tt.rev]; ok || tt.rev.GreaterThan(revision{main: i}) {
if tt.remove {
wti.Tombstone(tt.key, tt.rev)
} else {
restore(wti, tt.key, tt.created, tt.rev, tt.ver)
}
}
}
if !ti.Equal(wti) {
t.Errorf("#%d: not equal ti", i)
}
}
}
func restore(ti *treeIndex, key []byte, created, modified revision, ver int64) {
keyi := &keyIndex{key: key}
ti.Lock()
defer ti.Unlock()
item := ti.tree.Get(keyi)
if item == nil {
keyi.restore(ti.lg, created, modified, ver)
ti.tree.ReplaceOrInsert(keyi)
return
}
okeyi := item.(*keyIndex)
okeyi.put(ti.lg, modified.main, modified.sub)
}

378
server/mvcc/key_index.go Normal file
View File

@@ -0,0 +1,378 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"errors"
"fmt"
"github.com/google/btree"
"go.uber.org/zap"
)
var (
ErrRevisionNotFound = errors.New("mvcc: revision not found")
)
// keyIndex stores the revisions of a key in the backend.
// Each keyIndex has at least one key generation.
// Each generation might have several key versions.
// Tombstone on a key appends a tombstone version at the end
// of the current generation and creates a new empty generation.
// Each version of a key has an index pointing to the backend.
//
// For example: put(1.0);put(2.0);tombstone(3.0);put(4.0);tombstone(5.0) on key "foo"
// generate a keyIndex:
// key: "foo"
// rev: 5
// generations:
// {empty}
// {4.0, 5.0(t)}
// {1.0, 2.0, 3.0(t)}
//
// Compacting a keyIndex removes the versions with revisions smaller than
// or equal to rev, except the largest one. If a generation becomes empty
// during compaction, it will be removed. If all the generations get
// removed, the keyIndex should be removed. A runnable sketch of the
// example above follows the struct definition below.
//
// For example:
// compact(2) on the previous example
// generations:
// {empty}
// {4.0, 5.0(t)}
// {2.0, 3.0(t)}
//
// compact(4)
// generations:
// {empty}
// {4.0, 5.0(t)}
//
// compact(5):
// generations:
// {empty} -> key SHOULD be removed.
//
// compact(6):
// generations:
// {empty} -> key SHOULD be removed.
type keyIndex struct {
key []byte
modified revision // the main rev of the last modification
generations []generation
}
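// exampleKeyIndexLifecycle is an illustrative sketch added for this write-up
// and not part of the original commit: it reproduces the generations from the
// doc comment above for key "foo" using only put and tombstone.
func exampleKeyIndexLifecycle(lg *zap.Logger) *keyIndex {
ki := &keyIndex{key: []byte("foo")}
ki.put(lg, 1, 0)
ki.put(lg, 2, 0)
ki.tombstone(lg, 3, 0) // closes generation {1.0, 2.0, 3.0(t)}
ki.put(lg, 4, 0)
ki.tombstone(lg, 5, 0) // closes generation {4.0, 5.0(t)}; an empty generation remains
return ki
}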
// put puts a revision to the keyIndex.
func (ki *keyIndex) put(lg *zap.Logger, main int64, sub int64) {
rev := revision{main: main, sub: sub}
if !rev.GreaterThan(ki.modified) {
lg.Panic(
"'put' with an unexpected smaller revision",
zap.Int64("given-revision-main", rev.main),
zap.Int64("given-revision-sub", rev.sub),
zap.Int64("modified-revision-main", ki.modified.main),
zap.Int64("modified-revision-sub", ki.modified.sub),
)
}
if len(ki.generations) == 0 {
ki.generations = append(ki.generations, generation{})
}
g := &ki.generations[len(ki.generations)-1]
if len(g.revs) == 0 { // create a new key
keysGauge.Inc()
g.created = rev
}
g.revs = append(g.revs, rev)
g.ver++
ki.modified = rev
}
func (ki *keyIndex) restore(lg *zap.Logger, created, modified revision, ver int64) {
if len(ki.generations) != 0 {
lg.Panic(
"'restore' got an unexpected non-empty generations",
zap.Int("generations-size", len(ki.generations)),
)
}
ki.modified = modified
g := generation{created: created, ver: ver, revs: []revision{modified}}
ki.generations = append(ki.generations, g)
keysGauge.Inc()
}
// tombstone puts a revision, pointing to a tombstone, to the keyIndex.
// It also creates a new empty generation in the keyIndex.
// It returns ErrRevisionNotFound when tombstoning an empty generation.
func (ki *keyIndex) tombstone(lg *zap.Logger, main int64, sub int64) error {
if ki.isEmpty() {
lg.Panic(
"'tombstone' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
if ki.generations[len(ki.generations)-1].isEmpty() {
return ErrRevisionNotFound
}
ki.put(lg, main, sub)
ki.generations = append(ki.generations, generation{})
keysGauge.Dec()
return nil
}
// get gets the modified, created revision and version of the key that satisfies the given atRev.
// Rev must be higher than or equal to the given atRev.
func (ki *keyIndex) get(lg *zap.Logger, atRev int64) (modified, created revision, ver int64, err error) {
if ki.isEmpty() {
lg.Panic(
"'get' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
g := ki.findGeneration(atRev)
if g.isEmpty() {
return revision{}, revision{}, 0, ErrRevisionNotFound
}
n := g.walk(func(rev revision) bool { return rev.main > atRev })
if n != -1 {
return g.revs[n], g.created, g.ver - int64(len(g.revs)-n-1), nil
}
return revision{}, revision{}, 0, ErrRevisionNotFound
}
// since returns revisions since the given rev. Only the revision with the
// largest sub revision will be returned if multiple revisions have the same
// main revision.
func (ki *keyIndex) since(lg *zap.Logger, rev int64) []revision {
if ki.isEmpty() {
lg.Panic(
"'since' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
since := revision{rev, 0}
var gi int
// find the generations to start checking
for gi = len(ki.generations) - 1; gi > 0; gi-- {
g := ki.generations[gi]
if g.isEmpty() {
continue
}
if since.GreaterThan(g.created) {
break
}
}
var revs []revision
var last int64
for ; gi < len(ki.generations); gi++ {
for _, r := range ki.generations[gi].revs {
if since.GreaterThan(r) {
continue
}
if r.main == last {
// replace the revision with a new one that has higher sub value,
// because the original one should not be seen externally
revs[len(revs)-1] = r
continue
}
revs = append(revs, r)
last = r.main
}
}
return revs
}
// compact compacts a keyIndex by removing the versions with revisions
// smaller than or equal to the given atRev, except the largest one (if the
// largest one is a tombstone, it will not be kept).
// If a generation becomes empty during compaction, it will be removed.
func (ki *keyIndex) compact(lg *zap.Logger, atRev int64, available map[revision]struct{}) {
if ki.isEmpty() {
lg.Panic(
"'compact' got an unexpected empty keyIndex",
zap.String("key", string(ki.key)),
)
}
genIdx, revIndex := ki.doCompact(atRev, available)
g := &ki.generations[genIdx]
if !g.isEmpty() {
// remove the previous contents.
if revIndex != -1 {
g.revs = g.revs[revIndex:]
}
// remove any tombstone
if len(g.revs) == 1 && genIdx != len(ki.generations)-1 {
delete(available, g.revs[0])
genIdx++
}
}
// remove the previous generations.
ki.generations = ki.generations[genIdx:]
}
// keep finds the revision to be kept if compact is called at the given atRev.
func (ki *keyIndex) keep(atRev int64, available map[revision]struct{}) {
if ki.isEmpty() {
return
}
genIdx, revIndex := ki.doCompact(atRev, available)
g := &ki.generations[genIdx]
if !g.isEmpty() {
// remove any tombstone
if revIndex == len(g.revs)-1 && genIdx != len(ki.generations)-1 {
delete(available, g.revs[revIndex])
}
}
}
func (ki *keyIndex) doCompact(atRev int64, available map[revision]struct{}) (genIdx int, revIndex int) {
// walk until reaching the first revision smaller than or equal to "atRev",
// and add that revision to the available map
f := func(rev revision) bool {
if rev.main <= atRev {
available[rev] = struct{}{}
return false
}
return true
}
genIdx, g := 0, &ki.generations[0]
// find the first generation that includes atRev or was created after atRev
for genIdx < len(ki.generations)-1 {
if tomb := g.revs[len(g.revs)-1].main; tomb > atRev {
break
}
genIdx++
g = &ki.generations[genIdx]
}
revIndex = g.walk(f)
return genIdx, revIndex
}
func (ki *keyIndex) isEmpty() bool {
return len(ki.generations) == 1 && ki.generations[0].isEmpty()
}
// findGeneration finds the generation of the keyIndex that the given rev
// belongs to. If the given rev is in the gap between two generations, which
// means that the key does not exist at the given rev, it returns nil.
func (ki *keyIndex) findGeneration(rev int64) *generation {
lastg := len(ki.generations) - 1
cg := lastg
for cg >= 0 {
if len(ki.generations[cg].revs) == 0 {
cg--
continue
}
g := ki.generations[cg]
if cg != lastg {
if tomb := g.revs[len(g.revs)-1].main; tomb <= rev {
return nil
}
}
if g.revs[0].main <= rev {
return &ki.generations[cg]
}
cg--
}
return nil
}
func (ki *keyIndex) Less(b btree.Item) bool {
return bytes.Compare(ki.key, b.(*keyIndex).key) == -1
}
func (ki *keyIndex) equal(b *keyIndex) bool {
if !bytes.Equal(ki.key, b.key) {
return false
}
if ki.modified != b.modified {
return false
}
if len(ki.generations) != len(b.generations) {
return false
}
for i := range ki.generations {
ag, bg := ki.generations[i], b.generations[i]
if !ag.equal(bg) {
return false
}
}
return true
}
func (ki *keyIndex) String() string {
var s string
for _, g := range ki.generations {
s += g.String()
}
return s
}
// generation contains multiple revisions of a key.
type generation struct {
ver int64
created revision // when the generation is created (put in first revision).
revs []revision
}
func (g *generation) isEmpty() bool { return g == nil || len(g.revs) == 0 }
// walk walks through the revisions in the generation in descending order.
// It passes each revision to the given function.
// walk stops when either: 1. it finishes walking all revisions, or 2. the
// function returns false.
// walk returns the position at which it stopped; if it finished walking
// without being stopped, -1 is returned.
func (g *generation) walk(f func(rev revision) bool) int {
l := len(g.revs)
for i := range g.revs {
ok := f(g.revs[l-i-1])
if !ok {
return l - i - 1
}
}
return -1
}
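// An illustrative sketch added for this write-up (not part of the original
// commit): this is how get and doCompact use walk to locate the newest
// revision at or below a given main revision.
func exampleNewestRevisionAtOrBelow(g *generation, atRev int64) (revision, bool) {
n := g.walk(func(rev revision) bool { return rev.main > atRev })
if n == -1 {
return revision{}, false
}
return g.revs[n], true
}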
func (g *generation) String() string {
return fmt.Sprintf("g: created[%d] ver[%d], revs %#v\n", g.created, g.ver, g.revs)
}
func (g generation) equal(b generation) bool {
if g.ver != b.ver {
return false
}
if len(g.revs) != len(b.revs) {
return false
}
for i := range g.revs {
ar, br := g.revs[i], b.revs[i]
if ar != br {
return false
}
}
return true
}

700
server/mvcc/key_index_test.go Normal file
View File

@@ -0,0 +1,700 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"reflect"
"testing"
"go.uber.org/zap"
)
func TestKeyIndexGet(t *testing.T) {
// key: "foo"
// rev: 16
// generations:
// {empty}
// {{14, 0}[1], {14, 1}[2], {16, 0}(t)[3]}
// {{8, 0}[1], {10, 0}[2], {12, 0}(t)[3]}
// {{2, 0}[1], {4, 0}[2], {6, 0}(t)[3]}
ki := newTestKeyIndex()
ki.compact(zap.NewExample(), 4, make(map[revision]struct{}))
tests := []struct {
rev int64
wmod revision
wcreat revision
wver int64
werr error
}{
{17, revision{}, revision{}, 0, ErrRevisionNotFound},
{16, revision{}, revision{}, 0, ErrRevisionNotFound},
// get on generation 3
{15, revision{14, 1}, revision{14, 0}, 2, nil},
{14, revision{14, 1}, revision{14, 0}, 2, nil},
{13, revision{}, revision{}, 0, ErrRevisionNotFound},
{12, revision{}, revision{}, 0, ErrRevisionNotFound},
// get on generation 2
{11, revision{10, 0}, revision{8, 0}, 2, nil},
{10, revision{10, 0}, revision{8, 0}, 2, nil},
{9, revision{8, 0}, revision{8, 0}, 1, nil},
{8, revision{8, 0}, revision{8, 0}, 1, nil},
{7, revision{}, revision{}, 0, ErrRevisionNotFound},
{6, revision{}, revision{}, 0, ErrRevisionNotFound},
// get on generation 1
{5, revision{4, 0}, revision{2, 0}, 2, nil},
{4, revision{4, 0}, revision{2, 0}, 2, nil},
{3, revision{}, revision{}, 0, ErrRevisionNotFound},
{2, revision{}, revision{}, 0, ErrRevisionNotFound},
{1, revision{}, revision{}, 0, ErrRevisionNotFound},
{0, revision{}, revision{}, 0, ErrRevisionNotFound},
}
for i, tt := range tests {
mod, creat, ver, err := ki.get(zap.NewExample(), tt.rev)
if err != tt.werr {
t.Errorf("#%d: err = %v, want %v", i, err, tt.werr)
}
if mod != tt.wmod {
t.Errorf("#%d: modified = %+v, want %+v", i, mod, tt.wmod)
}
if creat != tt.wcreat {
t.Errorf("#%d: created = %+v, want %+v", i, creat, tt.wcreat)
}
if ver != tt.wver {
t.Errorf("#%d: version = %d, want %d", i, ver, tt.wver)
}
}
}
func TestKeyIndexSince(t *testing.T) {
ki := newTestKeyIndex()
ki.compact(zap.NewExample(), 4, make(map[revision]struct{}))
allRevs := []revision{{4, 0}, {6, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 1}, {16, 0}}
tests := []struct {
rev int64
wrevs []revision
}{
{17, nil},
{16, allRevs[6:]},
{15, allRevs[6:]},
{14, allRevs[5:]},
{13, allRevs[5:]},
{12, allRevs[4:]},
{11, allRevs[4:]},
{10, allRevs[3:]},
{9, allRevs[3:]},
{8, allRevs[2:]},
{7, allRevs[2:]},
{6, allRevs[1:]},
{5, allRevs[1:]},
{4, allRevs},
{3, allRevs},
{2, allRevs},
{1, allRevs},
{0, allRevs},
}
for i, tt := range tests {
revs := ki.since(zap.NewExample(), tt.rev)
if !reflect.DeepEqual(revs, tt.wrevs) {
t.Errorf("#%d: revs = %+v, want %+v", i, revs, tt.wrevs)
}
}
}
func TestKeyIndexPut(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 5, 0)
wki := &keyIndex{
key: []byte("foo"),
modified: revision{5, 0},
generations: []generation{{created: revision{5, 0}, ver: 1, revs: []revision{{main: 5}}}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
ki.put(zap.NewExample(), 7, 0)
wki = &keyIndex{
key: []byte("foo"),
modified: revision{7, 0},
generations: []generation{{created: revision{5, 0}, ver: 2, revs: []revision{{main: 5}, {main: 7}}}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
}
func TestKeyIndexRestore(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.restore(zap.NewExample(), revision{5, 0}, revision{7, 0}, 2)
wki := &keyIndex{
key: []byte("foo"),
modified: revision{7, 0},
generations: []generation{{created: revision{5, 0}, ver: 2, revs: []revision{{main: 7}}}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
}
func TestKeyIndexTombstone(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 5, 0)
err := ki.tombstone(zap.NewExample(), 7, 0)
if err != nil {
t.Errorf("unexpected tombstone error: %v", err)
}
wki := &keyIndex{
key: []byte("foo"),
modified: revision{7, 0},
generations: []generation{{created: revision{5, 0}, ver: 2, revs: []revision{{main: 5}, {main: 7}}}, {}},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
ki.put(zap.NewExample(), 8, 0)
ki.put(zap.NewExample(), 9, 0)
err = ki.tombstone(zap.NewExample(), 15, 0)
if err != nil {
t.Errorf("unexpected tombstone error: %v", err)
}
wki = &keyIndex{
key: []byte("foo"),
modified: revision{15, 0},
generations: []generation{
{created: revision{5, 0}, ver: 2, revs: []revision{{main: 5}, {main: 7}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 9}, {main: 15}}},
{},
},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
err = ki.tombstone(zap.NewExample(), 16, 0)
if err != ErrRevisionNotFound {
t.Errorf("tombstone error = %v, want %v", err, ErrRevisionNotFound)
}
}
func TestKeyIndexCompactAndKeep(t *testing.T) {
tests := []struct {
compact int64
wki *keyIndex
wam map[revision]struct{}
}{
{
1,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 2}, {main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
2,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 2}, {main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 2}: {},
},
},
{
3,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 2}, {main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 2}: {},
},
},
{
4,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 4}: {},
},
},
{
5,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{2, 0}, ver: 3, revs: []revision{{main: 4}, {main: 6}}},
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 4}: {},
},
},
{
6,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
7,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
8,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 8}: {},
},
},
{
9,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 8}, {main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 8}: {},
},
},
{
10,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 10}: {},
},
},
{
11,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{8, 0}, ver: 3, revs: []revision{{main: 10}, {main: 12}}},
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 10}: {},
},
},
{
12,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
13,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14}, {main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{},
},
{
14,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 14, sub: 1}: {},
},
},
{
15,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{created: revision{14, 0}, ver: 3, revs: []revision{{main: 14, sub: 1}, {main: 16}}},
{},
},
},
map[revision]struct{}{
{main: 14, sub: 1}: {},
},
},
{
16,
&keyIndex{
key: []byte("foo"),
modified: revision{16, 0},
generations: []generation{
{},
},
},
map[revision]struct{}{},
},
}
// Continuous Compaction and finding Keep
ki := newTestKeyIndex()
for i, tt := range tests {
am := make(map[revision]struct{})
kiclone := cloneKeyIndex(ki)
ki.keep(tt.compact, am)
if !reflect.DeepEqual(ki, kiclone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiclone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
}
// Jump Compaction and finding Keep
ki = newTestKeyIndex()
for i, tt := range tests {
if (i%2 == 0 && i < 6) || (i%2 == 1 && i > 6) {
am := make(map[revision]struct{})
kiclone := cloneKeyIndex(ki)
ki.keep(tt.compact, am)
if !reflect.DeepEqual(ki, kiclone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiclone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
}
}
kiClone := newTestKeyIndex()
// Once Compaction and finding Keep
for i, tt := range tests {
ki := newTestKeyIndex()
am := make(map[revision]struct{})
ki.keep(tt.compact, am)
if !reflect.DeepEqual(ki, kiClone) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, kiClone)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
am = make(map[revision]struct{})
ki.compact(zap.NewExample(), tt.compact, am)
if !reflect.DeepEqual(ki, tt.wki) {
t.Errorf("#%d: ki = %+v, want %+v", i, ki, tt.wki)
}
if !reflect.DeepEqual(am, tt.wam) {
t.Errorf("#%d: am = %+v, want %+v", i, am, tt.wam)
}
}
}
func cloneKeyIndex(ki *keyIndex) *keyIndex {
generations := make([]generation, len(ki.generations))
for i, gen := range ki.generations {
generations[i] = *cloneGeneration(&gen)
}
return &keyIndex{ki.key, ki.modified, generations}
}
func cloneGeneration(g *generation) *generation {
if g.revs == nil {
return &generation{g.ver, g.created, nil}
}
tmp := make([]revision, len(g.revs))
copy(tmp, g.revs)
return &generation{g.ver, g.created, tmp}
}
// test that compacting at a revision higher than the last modified revision works well
func TestKeyIndexCompactOnFurtherRev(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 1, 0)
ki.put(zap.NewExample(), 2, 0)
am := make(map[revision]struct{})
ki.compact(zap.NewExample(), 3, am)
wki := &keyIndex{
key: []byte("foo"),
modified: revision{2, 0},
generations: []generation{
{created: revision{1, 0}, ver: 2, revs: []revision{{main: 2}}},
},
}
wam := map[revision]struct{}{
{main: 2}: {},
}
if !reflect.DeepEqual(ki, wki) {
t.Errorf("ki = %+v, want %+v", ki, wki)
}
if !reflect.DeepEqual(am, wam) {
t.Errorf("am = %+v, want %+v", am, wam)
}
}
func TestKeyIndexIsEmpty(t *testing.T) {
tests := []struct {
ki *keyIndex
w bool
}{
{
&keyIndex{
key: []byte("foo"),
generations: []generation{{}},
},
true,
},
{
&keyIndex{
key: []byte("foo"),
modified: revision{2, 0},
generations: []generation{
{created: revision{1, 0}, ver: 2, revs: []revision{{main: 2}}},
},
},
false,
},
}
for i, tt := range tests {
g := tt.ki.isEmpty()
if g != tt.w {
t.Errorf("#%d: isEmpty = %v, want %v", i, g, tt.w)
}
}
}
func TestKeyIndexFindGeneration(t *testing.T) {
ki := newTestKeyIndex()
tests := []struct {
rev int64
wg *generation
}{
{0, nil},
{1, nil},
{2, &ki.generations[0]},
{3, &ki.generations[0]},
{4, &ki.generations[0]},
{5, &ki.generations[0]},
{6, nil},
{7, nil},
{8, &ki.generations[1]},
{9, &ki.generations[1]},
{10, &ki.generations[1]},
{11, &ki.generations[1]},
{12, nil},
{13, nil},
}
for i, tt := range tests {
g := ki.findGeneration(tt.rev)
if g != tt.wg {
t.Errorf("#%d: generation = %+v, want %+v", i, g, tt.wg)
}
}
}
func TestKeyIndexLess(t *testing.T) {
ki := &keyIndex{key: []byte("foo")}
tests := []struct {
ki *keyIndex
w bool
}{
{&keyIndex{key: []byte("doo")}, false},
{&keyIndex{key: []byte("foo")}, false},
{&keyIndex{key: []byte("goo")}, true},
}
for i, tt := range tests {
g := ki.Less(tt.ki)
if g != tt.w {
t.Errorf("#%d: Less = %v, want %v", i, g, tt.w)
}
}
}
func TestGenerationIsEmpty(t *testing.T) {
tests := []struct {
g *generation
w bool
}{
{nil, true},
{&generation{}, true},
{&generation{revs: []revision{{main: 1}}}, false},
}
for i, tt := range tests {
g := tt.g.isEmpty()
if g != tt.w {
t.Errorf("#%d: isEmpty = %v, want %v", i, g, tt.w)
}
}
}
func TestGenerationWalk(t *testing.T) {
g := &generation{
ver: 3,
created: revision{2, 0},
revs: []revision{{main: 2}, {main: 4}, {main: 6}},
}
tests := []struct {
f func(rev revision) bool
wi int
}{
{func(rev revision) bool { return rev.main >= 7 }, 2},
{func(rev revision) bool { return rev.main >= 6 }, 1},
{func(rev revision) bool { return rev.main >= 5 }, 1},
{func(rev revision) bool { return rev.main >= 4 }, 0},
{func(rev revision) bool { return rev.main >= 3 }, 0},
{func(rev revision) bool { return rev.main >= 2 }, -1},
}
for i, tt := range tests {
idx := g.walk(tt.f)
if idx != tt.wi {
t.Errorf("#%d: index = %d, want %d", i, idx, tt.wi)
}
}
}
func newTestKeyIndex() *keyIndex {
// key: "foo"
// rev: 16
// generations:
// {empty}
// {{14, 0}[1], {14, 1}[2], {16, 0}(t)[3]}
// {{8, 0}[1], {10, 0}[2], {12, 0}(t)[3]}
// {{2, 0}[1], {4, 0}[2], {6, 0}(t)[3]}
ki := &keyIndex{key: []byte("foo")}
ki.put(zap.NewExample(), 2, 0)
ki.put(zap.NewExample(), 4, 0)
ki.tombstone(zap.NewExample(), 6, 0)
ki.put(zap.NewExample(), 8, 0)
ki.put(zap.NewExample(), 10, 0)
ki.tombstone(zap.NewExample(), 12, 0)
ki.put(zap.NewExample(), 14, 0)
ki.put(zap.NewExample(), 14, 1)
ki.tombstone(zap.NewExample(), 16, 0)
return ki
}

150
server/mvcc/kv.go Normal file
View File

@@ -0,0 +1,150 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
)
type RangeOptions struct {
Limit int64
Rev int64
Count bool
}
type RangeResult struct {
KVs []mvccpb.KeyValue
Rev int64
Count int
}
type ReadView interface {
// FirstRev returns the first KV revision at the time of opening the txn.
// After a compaction, the first revision increases to the compaction
// revision.
FirstRev() int64
// Rev returns the revision of the KV at the time of opening the txn.
Rev() int64
// Range gets the keys in the range at rangeRev.
// The returned rev is the current revision of the KV when the operation is executed.
// If rangeRev <= 0, range gets the keys at currentRev.
// If `end` is nil, the request returns the key.
// If `end` is not nil and not empty, it gets the keys in range [key, range_end).
// If `end` is not nil and empty, it gets the keys greater than or equal to key.
// Limit limits the number of keys returned.
// If the required rev is compacted, ErrCompacted will be returned.
Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error)
}
// TxnRead represents a read-only transaction with operations that will not
// block other read transactions.
type TxnRead interface {
ReadView
// End marks the transaction as complete and ready to commit.
End()
}
type WriteView interface {
// DeleteRange deletes the given range from the store.
// A deleteRange increases the rev of the store if any key in the range exists.
// The number of keys deleted will be returned.
// The returned rev is the current revision of the KV when the operation is executed.
// It also generates one event for each key deleted in the event history.
// If the `end` is nil, deleteRange deletes the key.
// If the `end` is not nil, deleteRange deletes the keys in range [key, range_end).
DeleteRange(key, end []byte) (n, rev int64)
// Put puts the given key, value into the store. Put also takes an additional
// argument lease to attach a lease to the key-value pair as metadata. The KV
// implementation does not validate the lease id.
// A put also increases the rev of the store, and generates one event in the event history.
// The returned rev is the current revision of the KV when the operation is executed.
Put(key, value []byte, lease lease.LeaseID) (rev int64)
}
// TxnWrite represents a transaction that can modify the store.
type TxnWrite interface {
TxnRead
WriteView
// Changes gets the changes made since opening the write txn.
Changes() []mvccpb.KeyValue
}
// txnReadWrite coerces a read txn to a write, panicking on any write operation.
type txnReadWrite struct{ TxnRead }
func (trw *txnReadWrite) DeleteRange(key, end []byte) (n, rev int64) { panic("unexpected DeleteRange") }
func (trw *txnReadWrite) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
panic("unexpected Put")
}
func (trw *txnReadWrite) Changes() []mvccpb.KeyValue { return nil }
func NewReadOnlyTxnWrite(txn TxnRead) TxnWrite { return &txnReadWrite{txn} }
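// A usage note added for this write-up (not part of the original commit):
// NewReadOnlyTxnWrite lets read-only code paths satisfy interfaces that
// require a TxnWrite while panicking loudly if a write sneaks in, e.g.
//
// tw := NewReadOnlyTxnWrite(kv.Read(traceutil.TODO()))
// defer tw.End()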
type KV interface {
ReadView
WriteView
// Read creates a read transaction.
Read(trace *traceutil.Trace) TxnRead
// Write creates a write transaction.
Write(trace *traceutil.Trace) TxnWrite
// Hash computes the hash of the KV's backend.
Hash() (hash uint32, revision int64, err error)
// HashByRev computes the hash of all MVCC revisions up to a given revision.
HashByRev(rev int64) (hash uint32, revision int64, compactRev int64, err error)
// Compact frees all superseded keys with revisions less than rev.
Compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error)
// Commit commits outstanding txns into the underlying backend.
Commit()
// Restore restores the KV store from a backend.
Restore(b backend.Backend) error
Close() error
}
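// exampleReadTxnRange is an illustrative sketch added for this write-up (not
// part of the original commit): it drives ReadView.Range through a read txn,
// which is how callers batch reads at a single consistent revision.
func exampleReadTxnRange(kv KV) (*RangeResult, error) {
txn := kv.Read(traceutil.TODO())
defer txn.End()
// Rev <= 0 reads at the current revision; Limit caps the number of keys.
return txn.Range([]byte("foo"), []byte("fop"), RangeOptions{Limit: 10})
}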
// WatchableKV is a KV that can be watched.
type WatchableKV interface {
KV
Watchable
}
// Watchable is the interface that wraps the NewWatchStream function.
type Watchable interface {
// NewWatchStream returns a WatchStream that can be used to
// watch events that happened or are happening on the KV.
NewWatchStream() WatchStream
}
// ConsistentWatchableKV is a WatchableKV that understands the consistency
// algorithm and consistent index.
// If the consistent index of the executing entry is not larger than the
// consistent index of the ConsistentWatchableKV, all operations in
// that entry are skipped and return an empty response.
type ConsistentWatchableKV interface {
WatchableKV
// ConsistentIndex returns the current consistent index of the KV.
ConsistentIndex() uint64
}

838
server/mvcc/kv_test.go Normal file
View File

@@ -0,0 +1,838 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"fmt"
"os"
"reflect"
"testing"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/testutil"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"go.uber.org/zap"
)
// Functional tests for features implemented in the v3 store. They treat the
// v3 store as a black box, and test it by feeding input and validating the
// output.
// TODO: add similar tests on operations in one txn/rev
type (
rangeFunc func(kv KV, key, end []byte, ro RangeOptions) (*RangeResult, error)
putFunc func(kv KV, key, value []byte, lease lease.LeaseID) int64
deleteRangeFunc func(kv KV, key, end []byte) (n, rev int64)
)
var (
normalRangeFunc = func(kv KV, key, end []byte, ro RangeOptions) (*RangeResult, error) {
return kv.Range(key, end, ro)
}
txnRangeFunc = func(kv KV, key, end []byte, ro RangeOptions) (*RangeResult, error) {
txn := kv.Read(traceutil.TODO())
defer txn.End()
return txn.Range(key, end, ro)
}
normalPutFunc = func(kv KV, key, value []byte, lease lease.LeaseID) int64 {
return kv.Put(key, value, lease)
}
txnPutFunc = func(kv KV, key, value []byte, lease lease.LeaseID) int64 {
txn := kv.Write(traceutil.TODO())
defer txn.End()
return txn.Put(key, value, lease)
}
normalDeleteRangeFunc = func(kv KV, key, end []byte) (n, rev int64) {
return kv.DeleteRange(key, end)
}
txnDeleteRangeFunc = func(kv KV, key, end []byte) (n, rev int64) {
txn := kv.Write(traceutil.TODO())
defer txn.End()
return txn.DeleteRange(key, end)
}
)
func TestKVRange(t *testing.T) { testKVRange(t, normalRangeFunc) }
func TestKVTxnRange(t *testing.T) { testKVRange(t, txnRangeFunc) }
func testKVRange(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
kvs := put3TestKVs(s)
wrev := int64(4)
tests := []struct {
key, end []byte
wkvs []mvccpb.KeyValue
}{
// get no keys
{
[]byte("doo"), []byte("foo"),
nil,
},
// get no keys when key == end
{
[]byte("foo"), []byte("foo"),
nil,
},
// get no keys when ranging single key
{
[]byte("doo"), nil,
nil,
},
// get all keys
{
[]byte("foo"), []byte("foo3"),
kvs,
},
// get partial keys
{
[]byte("foo"), []byte("foo1"),
kvs[:1],
},
// get single key
{
[]byte("foo"), nil,
kvs[:1],
},
// get entire keyspace
{
[]byte(""), []byte(""),
kvs,
},
}
for i, tt := range tests {
r, err := f(s, tt.key, tt.end, RangeOptions{})
if err != nil {
t.Fatal(err)
}
if r.Rev != wrev {
t.Errorf("#%d: rev = %d, want %d", i, r.Rev, wrev)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
}
}
func TestKVRangeRev(t *testing.T) { testKVRangeRev(t, normalRangeFunc) }
func TestKVTxnRangeRev(t *testing.T) { testKVRangeRev(t, txnRangeFunc) }
func testKVRangeRev(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
kvs := put3TestKVs(s)
tests := []struct {
rev int64
wrev int64
wkvs []mvccpb.KeyValue
}{
{-1, 4, kvs},
{0, 4, kvs},
{2, 4, kvs[:1]},
{3, 4, kvs[:2]},
{4, 4, kvs},
}
for i, tt := range tests {
r, err := f(s, []byte("foo"), []byte("foo3"), RangeOptions{Rev: tt.rev})
if err != nil {
t.Fatal(err)
}
if r.Rev != tt.wrev {
t.Errorf("#%d: rev = %d, want %d", i, r.Rev, tt.wrev)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
}
}
func TestKVRangeBadRev(t *testing.T) { testKVRangeBadRev(t, normalRangeFunc) }
func TestKVTxnRangeBadRev(t *testing.T) { testKVRangeBadRev(t, txnRangeFunc) }
func testKVRangeBadRev(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
put3TestKVs(s)
if _, err := s.Compact(traceutil.TODO(), 4); err != nil {
t.Fatalf("compact error (%v)", err)
}
tests := []struct {
rev int64
werr error
}{
{-1, nil}, // rev <= 0 means the most recent revision
{0, nil},
{1, ErrCompacted},
{2, ErrCompacted},
{4, nil},
{5, ErrFutureRev},
{100, ErrFutureRev},
}
for i, tt := range tests {
_, err := f(s, []byte("foo"), []byte("foo3"), RangeOptions{Rev: tt.rev})
if err != tt.werr {
t.Errorf("#%d: error = %v, want %v", i, err, tt.werr)
}
}
}
func TestKVRangeLimit(t *testing.T) { testKVRangeLimit(t, normalRangeFunc) }
func TestKVTxnRangeLimit(t *testing.T) { testKVRangeLimit(t, txnRangeFunc) }
func testKVRangeLimit(t *testing.T, f rangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
kvs := put3TestKVs(s)
wrev := int64(4)
tests := []struct {
limit int64
wkvs []mvccpb.KeyValue
}{
// no limit
{-1, kvs},
// no limit
{0, kvs},
{1, kvs[:1]},
{2, kvs[:2]},
{3, kvs},
{100, kvs},
}
for i, tt := range tests {
r, err := f(s, []byte("foo"), []byte("foo3"), RangeOptions{Limit: tt.limit})
if err != nil {
t.Fatalf("#%d: range error (%v)", i, err)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
if r.Rev != wrev {
t.Errorf("#%d: rev = %d, want %d", i, r.Rev, wrev)
}
if tt.limit <= 0 || int(tt.limit) > len(kvs) {
if r.Count != len(kvs) {
t.Errorf("#%d: count = %d, want %d", i, r.Count, len(kvs))
}
} else if r.Count != int(tt.limit) {
t.Errorf("#%d: count = %d, want %d", i, r.Count, tt.limit)
}
}
}
func TestKVPutMultipleTimes(t *testing.T) { testKVPutMultipleTimes(t, normalPutFunc) }
func TestKVTxnPutMultipleTimes(t *testing.T) { testKVPutMultipleTimes(t, txnPutFunc) }
func testKVPutMultipleTimes(t *testing.T, f putFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
for i := 0; i < 10; i++ {
base := int64(i + 1)
rev := f(s, []byte("foo"), []byte("bar"), lease.LeaseID(base))
if rev != base+1 {
t.Errorf("#%d: rev = %d, want %d", i, rev, base+1)
}
r, err := s.Range([]byte("foo"), nil, RangeOptions{})
if err != nil {
t.Fatal(err)
}
wkvs := []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: 2, ModRevision: base + 1, Version: base, Lease: base},
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, wkvs)
}
}
}
func TestKVDeleteRange(t *testing.T) { testKVDeleteRange(t, normalDeleteRangeFunc) }
func TestKVTxnDeleteRange(t *testing.T) { testKVDeleteRange(t, txnDeleteRangeFunc) }
func testKVDeleteRange(t *testing.T, f deleteRangeFunc) {
tests := []struct {
key, end []byte
wrev int64
wN int64
}{
{
[]byte("foo"), nil,
5, 1,
},
{
[]byte("foo"), []byte("foo1"),
5, 1,
},
{
[]byte("foo"), []byte("foo2"),
5, 2,
},
{
[]byte("foo"), []byte("foo3"),
5, 3,
},
{
[]byte("foo3"), []byte("foo8"),
4, 0,
},
{
[]byte("foo3"), nil,
4, 0,
},
}
for i, tt := range tests {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
s.Put([]byte("foo1"), []byte("bar1"), lease.NoLease)
s.Put([]byte("foo2"), []byte("bar2"), lease.NoLease)
n, rev := f(s, tt.key, tt.end)
if n != tt.wN || rev != tt.wrev {
t.Errorf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, tt.wN, tt.wrev)
}
cleanup(s, b, tmpPath)
}
}
func TestKVDeleteMultipleTimes(t *testing.T) { testKVDeleteMultipleTimes(t, normalDeleteRangeFunc) }
func TestKVTxnDeleteMultipleTimes(t *testing.T) { testKVDeleteMultipleTimes(t, txnDeleteRangeFunc) }
func testKVDeleteMultipleTimes(t *testing.T, f deleteRangeFunc) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
n, rev := f(s, []byte("foo"), nil)
if n != 1 || rev != 3 {
t.Fatalf("n = %d, rev = %d, want (%d, %d)", n, rev, 1, 3)
}
for i := 0; i < 10; i++ {
n, rev := f(s, []byte("foo"), nil)
if n != 0 || rev != 3 {
t.Fatalf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, 0, 3)
}
}
}
// test that range, put, delete on a single key in sequence repeatedly works correctly.
func TestKVOperationInSequence(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
for i := 0; i < 10; i++ {
base := int64(i*2 + 1)
// put foo
rev := s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
if rev != base+1 {
t.Errorf("#%d: put rev = %d, want %d", i, rev, base+1)
}
r, err := s.Range([]byte("foo"), nil, RangeOptions{Rev: base + 1})
if err != nil {
t.Fatal(err)
}
wkvs := []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: base + 1, ModRevision: base + 1, Version: 1, Lease: int64(lease.NoLease)},
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, wkvs)
}
if r.Rev != base+1 {
t.Errorf("#%d: range rev = %d, want %d", i, rev, base+1)
}
// delete foo
n, rev := s.DeleteRange([]byte("foo"), nil)
if n != 1 || rev != base+2 {
t.Errorf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, 1, base+2)
}
r, err = s.Range([]byte("foo"), nil, RangeOptions{Rev: base + 2})
if err != nil {
t.Fatal(err)
}
if r.KVs != nil {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, nil)
}
if r.Rev != base+2 {
t.Errorf("#%d: range rev = %d, want %d", i, r.Rev, base+2)
}
}
}
func TestKVTxnBlockWriteOperations(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
tests := []func(){
func() { s.Put([]byte("foo"), nil, lease.NoLease) },
func() { s.DeleteRange([]byte("foo"), nil) },
}
for i, tt := range tests {
tf := tt
txn := s.Write(traceutil.TODO())
done := make(chan struct{}, 1)
go func() {
tf()
done <- struct{}{}
}()
select {
case <-done:
t.Fatalf("#%d: operation failed to be blocked", i)
case <-time.After(10 * time.Millisecond):
}
txn.End()
select {
case <-done:
case <-time.After(10 * time.Second):
testutil.FatalStack(t, fmt.Sprintf("#%d: operation failed to be unblocked", i))
}
}
// only close the backend when we know all the txns are finished
cleanup(s, b, tmpPath)
}
func TestKVTxnNonBlockRange(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
txn := s.Write(traceutil.TODO())
defer txn.End()
donec := make(chan struct{})
go func() {
defer close(donec)
s.Range([]byte("foo"), nil, RangeOptions{})
}()
select {
case <-donec:
case <-time.After(100 * time.Millisecond):
t.Fatalf("range operation blocked on write txn")
}
}
// test that txn range, put, delete on a single key in sequence repeatedly works correctly.
func TestKVTxnOperationInSequence(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
for i := 0; i < 10; i++ {
txn := s.Write(traceutil.TODO())
base := int64(i + 1)
// put foo
rev := txn.Put([]byte("foo"), []byte("bar"), lease.NoLease)
if rev != base+1 {
t.Errorf("#%d: put rev = %d, want %d", i, rev, base+1)
}
r, err := txn.Range([]byte("foo"), nil, RangeOptions{Rev: base + 1})
if err != nil {
t.Fatal(err)
}
wkvs := []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: base + 1, ModRevision: base + 1, Version: 1, Lease: int64(lease.NoLease)},
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, wkvs)
}
if r.Rev != base+1 {
t.Errorf("#%d: range rev = %d, want %d", i, r.Rev, base+1)
}
// delete foo
n, rev := txn.DeleteRange([]byte("foo"), nil)
if n != 1 || rev != base+1 {
t.Errorf("#%d: n = %d, rev = %d, want (%d, %d)", i, n, rev, 1, base+1)
}
r, err = txn.Range([]byte("foo"), nil, RangeOptions{Rev: base + 1})
if err != nil {
t.Errorf("#%d: range error (%v)", i, err)
}
if r.KVs != nil {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, nil)
}
if r.Rev != base+1 {
t.Errorf("#%d: range rev = %d, want %d", i, r.Rev, base+1)
}
txn.End()
}
}
func TestKVCompactReserveLastValue(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
s.Put([]byte("foo"), []byte("bar0"), 1)
s.Put([]byte("foo"), []byte("bar1"), 2)
s.DeleteRange([]byte("foo"), nil)
s.Put([]byte("foo"), []byte("bar2"), 3)
// the revs in tests will be passed to Compact() one by one on the same store
tests := []struct {
rev int64
// wanted kvs right after the compacted rev
wkvs []mvccpb.KeyValue
}{
{
1,
[]mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar0"), CreateRevision: 2, ModRevision: 2, Version: 1, Lease: 1},
},
},
{
2,
[]mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar1"), CreateRevision: 2, ModRevision: 3, Version: 2, Lease: 2},
},
},
{
3,
nil,
},
{
4,
[]mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar2"), CreateRevision: 5, ModRevision: 5, Version: 1, Lease: 3},
},
},
}
for i, tt := range tests {
_, err := s.Compact(traceutil.TODO(), tt.rev)
if err != nil {
t.Errorf("#%d: unexpect compact error %v", i, err)
}
r, err := s.Range([]byte("foo"), nil, RangeOptions{Rev: tt.rev + 1})
if err != nil {
t.Errorf("#%d: unexpect range error %v", i, err)
}
if !reflect.DeepEqual(r.KVs, tt.wkvs) {
t.Errorf("#%d: kvs = %+v, want %+v", i, r.KVs, tt.wkvs)
}
}
}
func TestKVCompactBad(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
s.Put([]byte("foo"), []byte("bar0"), lease.NoLease)
s.Put([]byte("foo"), []byte("bar1"), lease.NoLease)
s.Put([]byte("foo"), []byte("bar2"), lease.NoLease)
// the revs in tests will be passed to Compact() one by one on the same store
tests := []struct {
rev int64
werr error
}{
{0, nil},
{1, nil},
{1, ErrCompacted},
{4, nil},
{5, ErrFutureRev},
{100, ErrFutureRev},
}
for i, tt := range tests {
_, err := s.Compact(traceutil.TODO(), tt.rev)
if err != tt.werr {
t.Errorf("#%d: compact error = %v, want %v", i, err, tt.werr)
}
}
}
func TestKVHash(t *testing.T) {
hashes := make([]uint32, 3)
for i := 0; i < len(hashes); i++ {
var err error
b, tmpPath := backend.NewDefaultTmpBackend()
kv := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
kv.Put([]byte("foo0"), []byte("bar0"), lease.NoLease)
kv.Put([]byte("foo1"), []byte("bar0"), lease.NoLease)
hashes[i], _, err = kv.Hash()
if err != nil {
t.Fatalf("failed to get hash: %v", err)
}
cleanup(kv, b, tmpPath)
}
for i := 1; i < len(hashes); i++ {
if hashes[i-1] != hashes[i] {
t.Errorf("hash[%d](%d) != hash[%d](%d)", i-1, hashes[i-1], i, hashes[i])
}
}
}
func TestKVRestore(t *testing.T) {
tests := []func(kv KV){
func(kv KV) {
kv.Put([]byte("foo"), []byte("bar0"), 1)
kv.Put([]byte("foo"), []byte("bar1"), 2)
kv.Put([]byte("foo"), []byte("bar2"), 3)
kv.Put([]byte("foo2"), []byte("bar0"), 1)
},
func(kv KV) {
kv.Put([]byte("foo"), []byte("bar0"), 1)
kv.DeleteRange([]byte("foo"), nil)
kv.Put([]byte("foo"), []byte("bar1"), 2)
},
func(kv KV) {
kv.Put([]byte("foo"), []byte("bar0"), 1)
kv.Put([]byte("foo"), []byte("bar1"), 2)
kv.Compact(traceutil.TODO(), 1)
},
}
for i, tt := range tests {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
tt(s)
var kvss [][]mvccpb.KeyValue
for k := int64(0); k < 10; k++ {
r, _ := s.Range([]byte("a"), []byte("z"), RangeOptions{Rev: k})
kvss = append(kvss, r.KVs)
}
keysBefore := readGaugeInt(keysGauge)
s.Close()
// ns should recover the previous state from the backend.
ns := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
if keysRestore := readGaugeInt(keysGauge); keysBefore != keysRestore {
t.Errorf("#%d: got %d key count, expected %d", i, keysRestore, keysBefore)
}
// wait for possible compaction to finish
testutil.WaitSchedule()
var nkvss [][]mvccpb.KeyValue
for k := int64(0); k < 10; k++ {
r, _ := ns.Range([]byte("a"), []byte("z"), RangeOptions{Rev: k})
nkvss = append(nkvss, r.KVs)
}
cleanup(ns, b, tmpPath)
if !reflect.DeepEqual(nkvss, kvss) {
t.Errorf("#%d: kvs history = %+v, want %+v", i, nkvss, kvss)
}
}
}
func readGaugeInt(g prometheus.Gauge) int {
ch := make(chan prometheus.Metric, 1)
g.Collect(ch)
m := <-ch
mm := &dto.Metric{}
m.Write(mm)
return int(mm.GetGauge().GetValue())
}
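readGaugeInt works by draining the single sample a prometheus Gauge emits on Collect and decoding it into the protobuf dto.Metric type. Below is a minimal, self-contained sketch of the same pattern, assuming only the two client libraries this package already imports; the metric name is a placeholder, not part of the diff.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

func main() {
	g := prometheus.NewGauge(prometheus.GaugeOpts{Name: "example_keys_total"})
	g.Set(42)
	ch := make(chan prometheus.Metric, 1)
	g.Collect(ch) // a Gauge emits exactly one sample
	m := &dto.Metric{}
	(<-ch).Write(m)
	fmt.Println(int(m.GetGauge().GetValue())) // 42
}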
func TestKVSnapshot(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, b, tmpPath)
wkvs := put3TestKVs(s)
newPath := "new_test"
f, err := os.Create(newPath)
if err != nil {
t.Fatal(err)
}
defer os.Remove(newPath)
snap := s.b.Snapshot()
defer snap.Close()
_, err = snap.WriteTo(f)
if err != nil {
t.Fatal(err)
}
f.Close()
ns := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer ns.Close()
r, err := ns.Range([]byte("a"), []byte("z"), RangeOptions{})
if err != nil {
t.Errorf("unexpect range error (%v)", err)
}
if !reflect.DeepEqual(r.KVs, wkvs) {
t.Errorf("kvs = %+v, want %+v", r.KVs, wkvs)
}
if r.Rev != 4 {
t.Errorf("rev = %d, want %d", r.Rev, 4)
}
}
func TestWatchableKVWatch(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
wid, _ := w.Watch(0, []byte("foo"), []byte("fop"), 0)
wev := []mvccpb.Event{
{Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{
Key: []byte("foo"),
Value: []byte("bar"),
CreateRevision: 2,
ModRevision: 2,
Version: 1,
Lease: 1,
},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{
Key: []byte("foo1"),
Value: []byte("bar1"),
CreateRevision: 3,
ModRevision: 3,
Version: 1,
Lease: 2,
},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{
Key: []byte("foo1"),
Value: []byte("bar11"),
CreateRevision: 3,
ModRevision: 4,
Version: 2,
Lease: 3,
},
},
}
s.Put([]byte("foo"), []byte("bar"), 1)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[0]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[0])
}
case <-time.After(5 * time.Second):
// the CPU might be too slow for the goroutine to be scheduled in time
testutil.FatalStack(t, "failed to watch the event")
}
s.Put([]byte("foo1"), []byte("bar1"), 2)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[1]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[1])
}
case <-time.After(5 * time.Second):
testutil.FatalStack(t, "failed to watch the event")
}
w = s.NewWatchStream()
wid, _ = w.Watch(0, []byte("foo1"), []byte("foo2"), 3)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[1]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[1])
}
case <-time.After(5 * time.Second):
testutil.FatalStack(t, "failed to watch the event")
}
s.Put([]byte("foo1"), []byte("bar11"), 3)
select {
case resp := <-w.Chan():
if resp.WatchID != wid {
t.Errorf("resp.WatchID got = %d, want = %d", resp.WatchID, wid)
}
ev := resp.Events[0]
if !reflect.DeepEqual(ev, wev[2]) {
t.Errorf("watched event = %+v, want %+v", ev, wev[2])
}
case <-time.After(5 * time.Second):
testutil.FatalStack(t, "failed to watch the event")
}
}
func cleanup(s KV, b backend.Backend, path string) {
s.Close()
b.Close()
os.Remove(path)
}
func put3TestKVs(s KV) []mvccpb.KeyValue {
s.Put([]byte("foo"), []byte("bar"), 1)
s.Put([]byte("foo1"), []byte("bar1"), 2)
s.Put([]byte("foo2"), []byte("bar2"), 3)
return []mvccpb.KeyValue{
{Key: []byte("foo"), Value: []byte("bar"), CreateRevision: 2, ModRevision: 2, Version: 1, Lease: 1},
{Key: []byte("foo1"), Value: []byte("bar1"), CreateRevision: 3, ModRevision: 3, Version: 1, Lease: 2},
{Key: []byte("foo2"), Value: []byte("bar2"), CreateRevision: 4, ModRevision: 4, Version: 1, Lease: 3},
}
}
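A note on the expected revisions in put3TestKVs: CreateRevision and ModRevision start at 2 because a fresh store initializes currentRev to 1 and every committed write bumps it before the change is recorded. A hedged, self-contained sketch of that bookkeeping (illustrative, not part of the diff):

package main

import "fmt"

func main() {
	currentRev := int64(1) // a fresh store starts at revision 1
	for _, val := range []string{"bar", "bar1", "bar2"} {
		currentRev++ // each committed write txn bumps currentRev first
		fmt.Printf("put %q at main revision %d\n", val, currentRev)
	}
}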

54
server/mvcc/kv_view.go Normal file

@@ -0,0 +1,54 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
)
type readView struct{ kv KV }
func (rv *readView) FirstRev() int64 {
tr := rv.kv.Read(traceutil.TODO())
defer tr.End()
return tr.FirstRev()
}
func (rv *readView) Rev() int64 {
tr := rv.kv.Read(traceutil.TODO())
defer tr.End()
return tr.Rev()
}
func (rv *readView) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
tr := rv.kv.Read(traceutil.TODO())
defer tr.End()
return tr.Range(key, end, ro)
}
type writeView struct{ kv KV }
func (wv *writeView) DeleteRange(key, end []byte) (n, rev int64) {
tw := wv.kv.Write(traceutil.TODO())
defer tw.End()
return tw.DeleteRange(key, end)
}
func (wv *writeView) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
tw := wv.kv.Write(traceutil.TODO())
defer tw.End()
return tw.Put(key, value, lease)
}

592
server/mvcc/kvstore.go Normal file

@@ -0,0 +1,592 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"context"
"errors"
"fmt"
"hash/crc32"
"math"
"sync"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/schedule"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
var (
keyBucketName = []byte("key")
metaBucketName = []byte("meta")
consistentIndexKeyName = []byte("consistent_index")
scheduledCompactKeyName = []byte("scheduledCompactRev")
finishedCompactKeyName = []byte("finishedCompactRev")
ErrCompacted = errors.New("mvcc: required revision has been compacted")
ErrFutureRev = errors.New("mvcc: required revision is a future revision")
ErrCanceled = errors.New("mvcc: watcher is canceled")
)
const (
// markedRevBytesLen is the byte length of marked revision.
// The first `revBytesLen` bytes represent a normal revision. The last
// byte is the mark.
markedRevBytesLen = revBytesLen + 1
markBytePosition = markedRevBytesLen - 1
markTombstone byte = 't'
)
var restoreChunkKeys = 10000 // non-const for testing
var defaultCompactBatchLimit = 1000
type StoreConfig struct {
CompactionBatchLimit int
}
type store struct {
ReadView
WriteView
cfg StoreConfig
// mu read locks for txns and write locks for non-txn store changes.
mu sync.RWMutex
ci cindex.ConsistentIndexer
b backend.Backend
kvindex index
le lease.Lessor
// revMuLock protects currentRev and compactMainRev.
// Locked at the end of a write txn and released after the write txn's lock is released.
// Locked before locking a read txn and released after the lock is taken.
revMu sync.RWMutex
// currentRev is the revision of the last completed transaction.
currentRev int64
// compactMainRev is the main revision of the last compaction.
compactMainRev int64
fifoSched schedule.Scheduler
stopc chan struct{}
lg *zap.Logger
}
// NewStore returns a new store. It is useful for creating a store inside
// the mvcc package. Externally it should only be used for testing.
func NewStore(lg *zap.Logger, b backend.Backend, le lease.Lessor, ci cindex.ConsistentIndexer, cfg StoreConfig) *store {
if lg == nil {
lg = zap.NewNop()
}
if cfg.CompactionBatchLimit == 0 {
cfg.CompactionBatchLimit = defaultCompactBatchLimit
}
s := &store{
cfg: cfg,
b: b,
ci: ci,
kvindex: newTreeIndex(lg),
le: le,
currentRev: 1,
compactMainRev: -1,
fifoSched: schedule.NewFIFOScheduler(),
stopc: make(chan struct{}),
lg: lg,
}
s.ReadView = &readView{s}
s.WriteView = &writeView{s}
if s.le != nil {
s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write(traceutil.TODO()) })
}
tx := s.b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket(keyBucketName)
tx.UnsafeCreateBucket(metaBucketName)
tx.Unlock()
s.b.ForceCommit()
s.mu.Lock()
defer s.mu.Unlock()
if err := s.restore(); err != nil {
// TODO: return the error instead of panic here?
panic("failed to recover store from backend")
}
return s
}
func (s *store) compactBarrier(ctx context.Context, ch chan struct{}) {
if ctx == nil || ctx.Err() != nil {
select {
case <-s.stopc:
default:
// Fix a deadlock in mvcc; for more information, refer to pr 11817.
// s.stopc is only updated in restore operation, which is called by apply
// snapshot call, compaction and apply snapshot requests are serialized by
// raft, and do not happen at the same time.
s.mu.Lock()
f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
s.fifoSched.Schedule(f)
s.mu.Unlock()
}
return
}
close(ch)
}
func (s *store) Hash() (hash uint32, revision int64, err error) {
// TODO: hash and revision could be inconsistent, one possible fix is to add s.revMu.RLock() at the beginning of function, which is costly
start := time.Now()
s.b.ForceCommit()
h, err := s.b.Hash(DefaultIgnores)
hashSec.Observe(time.Since(start).Seconds())
return h, s.currentRev, err
}
func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev int64, err error) {
start := time.Now()
s.mu.RLock()
s.revMu.RLock()
compactRev, currentRev = s.compactMainRev, s.currentRev
s.revMu.RUnlock()
if rev > 0 && rev <= compactRev {
s.mu.RUnlock()
return 0, 0, compactRev, ErrCompacted
} else if rev > 0 && rev > currentRev {
s.mu.RUnlock()
return 0, currentRev, 0, ErrFutureRev
}
if rev == 0 {
rev = currentRev
}
keep := s.kvindex.Keep(rev)
tx := s.b.ReadTx()
tx.RLock()
defer tx.RUnlock()
s.mu.RUnlock()
upper := revision{main: rev + 1}
lower := revision{main: compactRev + 1}
h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
h.Write(keyBucketName)
err = tx.UnsafeForEach(keyBucketName, func(k, v []byte) error {
kr := bytesToRev(k)
if !upper.GreaterThan(kr) {
return nil
}
// skip revisions that are scheduled for deletion
// due to compaction; if there is no keep set, skip nothing.
if lower.GreaterThan(kr) && len(keep) > 0 {
if _, ok := keep[kr]; !ok {
return nil
}
}
h.Write(k)
h.Write(v)
return nil
})
hash = h.Sum32()
hashRevSec.Observe(time.Since(start).Seconds())
return hash, currentRev, compactRev, err
}
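HashByRev folds the key bucket name and every surviving key/value pair into a Castagnoli CRC32. A self-contained sketch of the same hashing scheme (the sample pairs are placeholders):

package main

import (
	"fmt"
	"hash/crc32"
)

func main() {
	h := crc32.New(crc32.MakeTable(crc32.Castagnoli))
	h.Write([]byte("key")) // the bucket name goes in first, as above
	for _, kv := range [][2]string{{"k1", "v1"}, {"k2", "v2"}} {
		h.Write([]byte(kv[0])) // then each surviving key...
		h.Write([]byte(kv[1])) // ...followed by its value
	}
	fmt.Printf("%08x\n", h.Sum32())
}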
func (s *store) updateCompactRev(rev int64) (<-chan struct{}, error) {
s.revMu.Lock()
if rev <= s.compactMainRev {
ch := make(chan struct{})
f := func(ctx context.Context) { s.compactBarrier(ctx, ch) }
s.fifoSched.Schedule(f)
s.revMu.Unlock()
return ch, ErrCompacted
}
if rev > s.currentRev {
s.revMu.Unlock()
return nil, ErrFutureRev
}
s.compactMainRev = rev
rbytes := newRevBytes()
revToBytes(revision{main: rev}, rbytes)
tx := s.b.BatchTx()
tx.Lock()
tx.UnsafePut(metaBucketName, scheduledCompactKeyName, rbytes)
tx.Unlock()
// ensure that desired compaction is persisted
s.b.ForceCommit()
s.revMu.Unlock()
return nil, nil
}
func (s *store) compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error) {
ch := make(chan struct{})
var j = func(ctx context.Context) {
if ctx.Err() != nil {
s.compactBarrier(ctx, ch)
return
}
start := time.Now()
keep := s.kvindex.Compact(rev)
indexCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond))
if !s.scheduleCompaction(rev, keep) {
s.compactBarrier(nil, ch)
return
}
close(ch)
}
s.fifoSched.Schedule(j)
trace.Step("schedule compaction")
return ch, nil
}
func (s *store) compactLockfree(rev int64) (<-chan struct{}, error) {
ch, err := s.updateCompactRev(rev)
if err != nil {
return ch, err
}
return s.compact(traceutil.TODO(), rev)
}
func (s *store) Compact(trace *traceutil.Trace, rev int64) (<-chan struct{}, error) {
s.mu.Lock()
ch, err := s.updateCompactRev(rev)
trace.Step("check and update compact revision")
if err != nil {
s.mu.Unlock()
return ch, err
}
s.mu.Unlock()
return s.compact(trace, rev)
}
// DefaultIgnores is a map of keys to ignore in hash checking.
var DefaultIgnores map[backend.IgnoreKey]struct{}
func init() {
DefaultIgnores = map[backend.IgnoreKey]struct{}{
// consistent index might be changed due to v2 internal sync, which
// is not controllable by the user.
{Bucket: string(metaBucketName), Key: string(consistentIndexKeyName)}: {},
}
}
func (s *store) Commit() {
s.mu.Lock()
defer s.mu.Unlock()
tx := s.b.BatchTx()
tx.Lock()
s.saveIndex(tx)
tx.Unlock()
s.b.ForceCommit()
}
func (s *store) Restore(b backend.Backend) error {
s.mu.Lock()
defer s.mu.Unlock()
close(s.stopc)
s.fifoSched.Stop()
s.b = b
s.kvindex = newTreeIndex(s.lg)
{
// During restore the metrics might report 'special' values
s.revMu.Lock()
s.currentRev = 1
s.compactMainRev = -1
s.revMu.Unlock()
}
s.fifoSched = schedule.NewFIFOScheduler()
s.stopc = make(chan struct{})
s.ci.SetBatchTx(b.BatchTx())
s.ci.SetConsistentIndex(0)
return s.restore()
}
func (s *store) restore() error {
s.setupMetricsReporter()
min, max := newRevBytes(), newRevBytes()
revToBytes(revision{main: 1}, min)
revToBytes(revision{main: math.MaxInt64, sub: math.MaxInt64}, max)
keyToLease := make(map[string]lease.LeaseID)
// restore index
tx := s.b.BatchTx()
tx.Lock()
_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
if len(finishedCompactBytes) != 0 {
s.revMu.Lock()
s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
s.lg.Info(
"restored last compact revision",
zap.String("meta-bucket-name", string(metaBucketName)),
zap.String("meta-bucket-name-key", string(finishedCompactKeyName)),
zap.Int64("restored-compact-revision", s.compactMainRev),
)
s.revMu.Unlock()
}
_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
scheduledCompact := int64(0)
if len(scheduledCompactBytes) != 0 {
scheduledCompact = bytesToRev(scheduledCompactBytes[0]).main
}
// index keys concurrently as they're loaded in from tx
keysGauge.Set(0)
rkvc, revc := restoreIntoIndex(s.lg, s.kvindex)
for {
keys, vals := tx.UnsafeRange(keyBucketName, min, max, int64(restoreChunkKeys))
if len(keys) == 0 {
break
}
// rkvc blocks if the total pending keys exceeds the restore
// chunk size to keep keys from consuming too much memory.
restoreChunk(s.lg, rkvc, keys, vals, keyToLease)
if len(keys) < restoreChunkKeys {
// partial set implies final set
break
}
// next set begins after where this one ended
newMin := bytesToRev(keys[len(keys)-1][:revBytesLen])
newMin.sub++
revToBytes(newMin, min)
}
close(rkvc)
{
s.revMu.Lock()
s.currentRev = <-revc
// keys in the range [compacted revision - N, compacted revision] might all have been deleted due to compaction.
// In that case the correct current revision is the compaction revision, not the largest revision
// we have seen.
if s.currentRev < s.compactMainRev {
s.currentRev = s.compactMainRev
}
s.revMu.Unlock()
}
if scheduledCompact <= s.compactMainRev {
scheduledCompact = 0
}
for key, lid := range keyToLease {
if s.le == nil {
tx.Unlock()
panic("no lessor to attach lease")
}
err := s.le.Attach(lid, []lease.LeaseItem{{Key: key}})
if err != nil {
s.lg.Error(
"failed to attach a lease",
zap.String("lease-id", fmt.Sprintf("%016x", lid)),
zap.Error(err),
)
}
}
tx.Unlock()
if scheduledCompact != 0 {
if _, err := s.compactLockfree(scheduledCompact); err != nil {
s.lg.Warn("compaction encountered error", zap.Error(err))
}
s.lg.Info(
"resume scheduled compaction",
zap.String("meta-bucket-name", string(metaBucketName)),
zap.String("meta-bucket-name-key", string(scheduledCompactKeyName)),
zap.Int64("scheduled-compact-revision", scheduledCompact),
)
}
return nil
}
type revKeyValue struct {
key []byte
kv mvccpb.KeyValue
kstr string
}
func restoreIntoIndex(lg *zap.Logger, idx index) (chan<- revKeyValue, <-chan int64) {
rkvc, revc := make(chan revKeyValue, restoreChunkKeys), make(chan int64, 1)
go func() {
currentRev := int64(1)
defer func() { revc <- currentRev }()
// restore the tree index from streaming the unordered index.
kiCache := make(map[string]*keyIndex, restoreChunkKeys)
for rkv := range rkvc {
ki, ok := kiCache[rkv.kstr]
// purge a few kiCache entries when the cache is full but this key is still missing
if !ok && len(kiCache) >= restoreChunkKeys {
i := 10
for k := range kiCache {
delete(kiCache, k)
if i--; i == 0 {
break
}
}
}
// cache miss, fetch from tree index if there
if !ok {
ki = &keyIndex{key: rkv.kv.Key}
if idxKey := idx.KeyIndex(ki); idxKey != nil {
kiCache[rkv.kstr], ki = idxKey, idxKey
ok = true
}
}
rev := bytesToRev(rkv.key)
currentRev = rev.main
if ok {
if isTombstone(rkv.key) {
if err := ki.tombstone(lg, rev.main, rev.sub); err != nil {
lg.Warn("tombstone encountered error", zap.Error(err))
}
continue
}
ki.put(lg, rev.main, rev.sub)
} else if !isTombstone(rkv.key) {
ki.restore(lg, revision{rkv.kv.CreateRevision, 0}, rev, rkv.kv.Version)
idx.Insert(ki)
kiCache[rkv.kstr] = ki
}
}
}()
return rkvc, revc
}
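restoreIntoIndex is a small producer/consumer pipeline: callers stream revKeyValues into a buffered work channel, and the goroutine sends the final revision on a 1-buffered result channel once the work channel is closed. A minimal sketch of the same shape (all names illustrative):

package main

import "fmt"

func pipeline() (chan<- int, <-chan int) {
	workc, resc := make(chan int, 4), make(chan int, 1)
	go func() {
		last := 0
		defer func() { resc <- last }() // report the final value exactly once
		for v := range workc {          // drain until the producer closes workc
			last = v
		}
	}()
	return workc, resc
}

func main() {
	workc, resc := pipeline()
	for i := 1; i <= 3; i++ {
		workc <- i
	}
	close(workc)
	fmt.Println(<-resc) // 3
}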
func restoreChunk(lg *zap.Logger, kvc chan<- revKeyValue, keys, vals [][]byte, keyToLease map[string]lease.LeaseID) {
for i, key := range keys {
rkv := revKeyValue{key: key}
if err := rkv.kv.Unmarshal(vals[i]); err != nil {
lg.Fatal("failed to unmarshal mvccpb.KeyValue", zap.Error(err))
}
rkv.kstr = string(rkv.kv.Key)
if isTombstone(key) {
delete(keyToLease, rkv.kstr)
} else if lid := lease.LeaseID(rkv.kv.Lease); lid != lease.NoLease {
keyToLease[rkv.kstr] = lid
} else {
delete(keyToLease, rkv.kstr)
}
kvc <- rkv
}
}
func (s *store) Close() error {
close(s.stopc)
s.fifoSched.Stop()
return nil
}
func (s *store) saveIndex(tx backend.BatchTx) {
if s.ci != nil {
s.ci.UnsafeSave(tx)
}
}
func (s *store) ConsistentIndex() uint64 {
if s.ci != nil {
return s.ci.ConsistentIndex()
}
return 0
}
func (s *store) setupMetricsReporter() {
b := s.b
reportDbTotalSizeInBytesMu.Lock()
reportDbTotalSizeInBytes = func() float64 { return float64(b.Size()) }
reportDbTotalSizeInBytesMu.Unlock()
reportDbTotalSizeInBytesDebugMu.Lock()
reportDbTotalSizeInBytesDebug = func() float64 { return float64(b.Size()) }
reportDbTotalSizeInBytesDebugMu.Unlock()
reportDbTotalSizeInUseInBytesMu.Lock()
reportDbTotalSizeInUseInBytes = func() float64 { return float64(b.SizeInUse()) }
reportDbTotalSizeInUseInBytesMu.Unlock()
reportDbOpenReadTxNMu.Lock()
reportDbOpenReadTxN = func() float64 { return float64(b.OpenReadTxN()) }
reportDbOpenReadTxNMu.Unlock()
reportCurrentRevMu.Lock()
reportCurrentRev = func() float64 {
s.revMu.RLock()
defer s.revMu.RUnlock()
return float64(s.currentRev)
}
reportCurrentRevMu.Unlock()
reportCompactRevMu.Lock()
reportCompactRev = func() float64 {
s.revMu.RLock()
defer s.revMu.RUnlock()
return float64(s.compactMainRev)
}
reportCompactRevMu.Unlock()
}
// appendMarkTombstone appends tombstone mark to normal revision bytes.
func appendMarkTombstone(lg *zap.Logger, b []byte) []byte {
if len(b) != revBytesLen {
lg.Panic(
"cannot append tombstone mark to non-normal revision bytes",
zap.Int("expected-revision-bytes-size", revBytesLen),
zap.Int("given-revision-bytes-size", len(b)),
)
}
return append(b, markTombstone)
}
// isTombstone checks whether the revision bytes represent a tombstone.
func isTombstone(b []byte) bool {
return len(b) == markedRevBytesLen && b[markBytePosition] == markTombstone
}

165
server/mvcc/kvstore_bench_test.go Normal file

@@ -0,0 +1,165 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"testing"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func BenchmarkStorePut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.Put(keys[i], vals[i], lease.NoLease)
}
}
func BenchmarkStoreRangeKey1(b *testing.B) { benchmarkStoreRange(b, 1) }
func BenchmarkStoreRangeKey100(b *testing.B) { benchmarkStoreRange(b, 100) }
func benchmarkStoreRange(b *testing.B, n int) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// 64 byte key/val
keys, val := createBytesSlice(64, n), createBytesSlice(64, 1)
for i := range keys {
s.Put(keys[i], val[0], lease.NoLease)
}
// Force into boltdb tx instead of backend read tx.
s.Commit()
var begin, end []byte
if n == 1 {
begin, end = keys[0], nil
} else {
begin, end = []byte{}, []byte{}
}
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.Range(begin, end, RangeOptions{})
}
}
func BenchmarkConsistentIndex(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
tx := s.b.BatchTx()
tx.Lock()
s.saveIndex(tx)
tx.Unlock()
b.ReportAllocs()
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.ConsistentIndex()
}
}
// BenchmarkStorePutUpdate is the same as BenchmarkStorePut, but repeatedly updates a single key
func BenchmarkStorePutUpdate(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
keys := createBytesSlice(64, 1)
vals := createBytesSlice(1024, 1)
b.ResetTimer()
for i := 0; i < b.N; i++ {
s.Put(keys[0], vals[0], lease.NoLease)
}
}
// BenchmarkStoreTxnPut benchmarks the Put operation
// with transaction begin and end, where transaction involves
// some synchronization operations, such as mutex locking.
func BenchmarkStoreTxnPut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
txn := s.Write(traceutil.TODO())
txn.Put(keys[i], vals[i], lease.NoLease)
txn.End()
}
}
// benchmarkStoreRestore benchmarks the restore operation
func benchmarkStoreRestore(revsPerKey int, b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
// use closure to capture 's' to pick up the reassignment
defer func() { cleanup(s, be, tmpPath) }()
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
for i := 0; i < b.N; i++ {
for j := 0; j < revsPerKey; j++ {
txn := s.Write(traceutil.TODO())
txn.Put(keys[i], vals[i], lease.NoLease)
txn.End()
}
}
s.Close()
b.ReportAllocs()
b.ResetTimer()
s = NewStore(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
}
func BenchmarkStoreRestoreRevs1(b *testing.B) {
benchmarkStoreRestore(1, b)
}
func BenchmarkStoreRestoreRevs10(b *testing.B) {
benchmarkStoreRestore(10, b)
}
func BenchmarkStoreRestoreRevs20(b *testing.B) {
benchmarkStoreRestore(20, b)
}

77
server/mvcc/kvstore_compaction.go Normal file

@@ -0,0 +1,77 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"encoding/binary"
"time"
"go.uber.org/zap"
)
func (s *store) scheduleCompaction(compactMainRev int64, keep map[revision]struct{}) bool {
totalStart := time.Now()
defer func() { dbCompactionTotalMs.Observe(float64(time.Since(totalStart) / time.Millisecond)) }()
keyCompactions := 0
defer func() { dbCompactionKeysCounter.Add(float64(keyCompactions)) }()
defer func() { dbCompactionLast.Set(float64(time.Now().Unix())) }()
end := make([]byte, 8)
binary.BigEndian.PutUint64(end, uint64(compactMainRev+1))
last := make([]byte, 8+1+8)
for {
var rev revision
start := time.Now()
tx := s.b.BatchTx()
tx.Lock()
keys, _ := tx.UnsafeRange(keyBucketName, last, end, int64(s.cfg.CompactionBatchLimit))
for _, key := range keys {
rev = bytesToRev(key)
if _, ok := keep[rev]; !ok {
tx.UnsafeDelete(keyBucketName, key)
keyCompactions++
}
}
if len(keys) < s.cfg.CompactionBatchLimit {
rbytes := make([]byte, 8+1+8)
revToBytes(revision{main: compactMainRev}, rbytes)
tx.UnsafePut(metaBucketName, finishedCompactKeyName, rbytes)
tx.Unlock()
s.lg.Info(
"finished scheduled compaction",
zap.Int64("compact-revision", compactMainRev),
zap.Duration("took", time.Since(totalStart)),
)
return true
}
// update last
revToBytes(revision{main: rev.main, sub: rev.sub + 1}, last)
tx.Unlock()
// Immediately commit the compaction deletes instead of letting them accumulate in the write buffer
s.b.ForceCommit()
dbCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond))
select {
case <-time.After(10 * time.Millisecond):
case <-s.stopc:
return false
}
}
}
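scheduleCompaction deletes keys in batches of CompactionBatchLimit, force-commits each batch, and pauses 10ms between batches so readers and writers can make progress; a partial batch implies the final one. A hedged, self-contained sketch of that batch-and-yield loop (compactInBatches is illustrative, not part of the diff):

package main

import (
	"fmt"
	"time"
)

func compactInBatches(total, limit int, stopc <-chan struct{}) bool {
	for deleted := 0; ; {
		n := limit
		if remaining := total - deleted; remaining < limit {
			n = remaining // last, partial batch
		}
		deleted += n // stand-in for one batched delete + commit
		if n < limit {
			return true // a partial batch implies the final batch
		}
		select {
		case <-time.After(10 * time.Millisecond): // yield to other work
		case <-stopc:
			return false // store is closing; abandon compaction
		}
	}
}

func main() {
	fmt.Println(compactInBatches(25, 10, nil)) // true after 3 batches
}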

137
server/mvcc/kvstore_compaction_test.go Normal file

@@ -0,0 +1,137 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"os"
"reflect"
"testing"
"time"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func TestScheduleCompaction(t *testing.T) {
revs := []revision{{1, 0}, {2, 0}, {3, 0}}
tests := []struct {
rev int64
keep map[revision]struct{}
wrevs []revision
}{
// compact at 1 and discard all history
{
1,
nil,
revs[1:],
},
// compact at 3 and discard all history
{
3,
nil,
nil,
},
// compact at 1 and keep history one step earlier
{
1,
map[revision]struct{}{
{main: 1}: {},
},
revs,
},
// compact at 3 and keep history two steps earlier
{
3,
map[revision]struct{}{
{main: 2}: {},
{main: 3}: {},
},
revs[1:],
},
}
for i, tt := range tests {
b, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
tx := s.b.BatchTx()
tx.Lock()
ibytes := newRevBytes()
for _, rev := range revs {
revToBytes(rev, ibytes)
tx.UnsafePut(keyBucketName, ibytes, []byte("bar"))
}
tx.Unlock()
s.scheduleCompaction(tt.rev, tt.keep)
tx.Lock()
for _, rev := range tt.wrevs {
revToBytes(rev, ibytes)
keys, _ := tx.UnsafeRange(keyBucketName, ibytes, nil, 0)
if len(keys) != 1 {
t.Errorf("#%d: range on %v = %d, want 1", i, rev, len(keys))
}
}
_, vals := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
revToBytes(revision{main: tt.rev}, ibytes)
if w := [][]byte{ibytes}; !reflect.DeepEqual(vals, w) {
t.Errorf("#%d: vals on %v = %+v, want %+v", i, finishedCompactKeyName, vals, w)
}
tx.Unlock()
cleanup(s, b, tmpPath)
}
}
func TestCompactAllAndRestore(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s0 := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer os.Remove(tmpPath)
s0.Put([]byte("foo"), []byte("bar"), lease.NoLease)
s0.Put([]byte("foo"), []byte("bar1"), lease.NoLease)
s0.Put([]byte("foo"), []byte("bar2"), lease.NoLease)
s0.DeleteRange([]byte("foo"), nil)
rev := s0.Rev()
// compact all keys
done, err := s0.Compact(traceutil.TODO(), rev)
if err != nil {
t.Fatal(err)
}
select {
case <-done:
case <-time.After(10 * time.Second):
t.Fatal("timeout waiting for compaction to finish")
}
err = s0.Close()
if err != nil {
t.Fatal(err)
}
s1 := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
if s1.Rev() != rev {
t.Errorf("rev = %v, want %v", s1.Rev(), rev)
}
_, err = s1.Range([]byte("foo"), nil, RangeOptions{})
if err != nil {
t.Errorf("unexpect range error %v", err)
}
}

1000
server/mvcc/kvstore_test.go Normal file

File diff suppressed because it is too large

289
server/mvcc/kvstore_txn.go Normal file

@@ -0,0 +1,289 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
type storeTxnRead struct {
s *store
tx backend.ReadTx
firstRev int64
rev int64
trace *traceutil.Trace
}
func (s *store) Read(trace *traceutil.Trace) TxnRead {
s.mu.RLock()
s.revMu.RLock()
// backend holds b.readTx.RLock() only when creating the concurrentReadTx. After
// ConcurrentReadTx is created, it will not block write transaction.
tx := s.b.ConcurrentReadTx()
tx.RLock() // RLock is no-op. concurrentReadTx does not need to be locked after it is created.
firstRev, rev := s.compactMainRev, s.currentRev
s.revMu.RUnlock()
return newMetricsTxnRead(&storeTxnRead{s, tx, firstRev, rev, trace})
}
func (tr *storeTxnRead) FirstRev() int64 { return tr.firstRev }
func (tr *storeTxnRead) Rev() int64 { return tr.rev }
func (tr *storeTxnRead) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
return tr.rangeKeys(key, end, tr.Rev(), ro)
}
func (tr *storeTxnRead) End() {
tr.tx.RUnlock() // RUnlock signals the end of concurrentReadTx.
tr.s.mu.RUnlock()
}
type storeTxnWrite struct {
storeTxnRead
tx backend.BatchTx
// beginRev is the revision where the txn begins; it will write to the next revision.
beginRev int64
changes []mvccpb.KeyValue
}
func (s *store) Write(trace *traceutil.Trace) TxnWrite {
s.mu.RLock()
tx := s.b.BatchTx()
tx.Lock()
tw := &storeTxnWrite{
storeTxnRead: storeTxnRead{s, tx, 0, 0, trace},
tx: tx,
beginRev: s.currentRev,
changes: make([]mvccpb.KeyValue, 0, 4),
}
return newMetricsTxnWrite(tw)
}
func (tw *storeTxnWrite) Rev() int64 { return tw.beginRev }
func (tw *storeTxnWrite) Range(key, end []byte, ro RangeOptions) (r *RangeResult, err error) {
rev := tw.beginRev
if len(tw.changes) > 0 {
rev++
}
return tw.rangeKeys(key, end, rev, ro)
}
func (tw *storeTxnWrite) DeleteRange(key, end []byte) (int64, int64) {
if n := tw.deleteRange(key, end); n != 0 || len(tw.changes) > 0 {
return n, tw.beginRev + 1
}
return 0, tw.beginRev
}
func (tw *storeTxnWrite) Put(key, value []byte, lease lease.LeaseID) int64 {
tw.put(key, value, lease)
return tw.beginRev + 1
}
func (tw *storeTxnWrite) End() {
// only update index if the txn modifies the mvcc state.
if len(tw.changes) != 0 {
tw.s.saveIndex(tw.tx)
// hold revMu lock to prevent new read txns from opening until writeback.
tw.s.revMu.Lock()
tw.s.currentRev++
}
tw.tx.Unlock()
if len(tw.changes) != 0 {
tw.s.revMu.Unlock()
}
tw.s.mu.RUnlock()
}
func (tr *storeTxnRead) rangeKeys(key, end []byte, curRev int64, ro RangeOptions) (*RangeResult, error) {
rev := ro.Rev
if rev > curRev {
return &RangeResult{KVs: nil, Count: -1, Rev: curRev}, ErrFutureRev
}
if rev <= 0 {
rev = curRev
}
if rev < tr.s.compactMainRev {
return &RangeResult{KVs: nil, Count: -1, Rev: 0}, ErrCompacted
}
if ro.Count {
total := tr.s.kvindex.CountRevisions(key, end, rev, int(ro.Limit))
tr.trace.Step("count revisions from in-memory index tree")
return &RangeResult{KVs: nil, Count: total, Rev: curRev}, nil
}
revpairs := tr.s.kvindex.Revisions(key, end, rev, int(ro.Limit))
tr.trace.Step("range keys from in-memory index tree")
if len(revpairs) == 0 {
return &RangeResult{KVs: nil, Count: 0, Rev: curRev}, nil
}
limit := int(ro.Limit)
if limit <= 0 || limit > len(revpairs) {
limit = len(revpairs)
}
kvs := make([]mvccpb.KeyValue, limit)
revBytes := newRevBytes()
for i, revpair := range revpairs[:len(kvs)] {
revToBytes(revpair, revBytes)
_, vs := tr.tx.UnsafeRange(keyBucketName, revBytes, nil, 0)
if len(vs) != 1 {
tr.s.lg.Fatal(
"range failed to find revision pair",
zap.Int64("revision-main", revpair.main),
zap.Int64("revision-sub", revpair.sub),
)
}
if err := kvs[i].Unmarshal(vs[0]); err != nil {
tr.s.lg.Fatal(
"failed to unmarshal mvccpb.KeyValue",
zap.Error(err),
)
}
}
tr.trace.Step("range keys from bolt db")
return &RangeResult{KVs: kvs, Count: len(revpairs), Rev: curRev}, nil
}
func (tw *storeTxnWrite) put(key, value []byte, leaseID lease.LeaseID) {
rev := tw.beginRev + 1
c := rev
oldLease := lease.NoLease
// if the key existed before, reuse its previous create revision and
// fetch its previous leaseID
_, created, ver, err := tw.s.kvindex.Get(key, rev)
if err == nil {
c = created.main
oldLease = tw.s.le.GetLease(lease.LeaseItem{Key: string(key)})
}
tw.trace.Step("get key's previous created_revision and leaseID")
ibytes := newRevBytes()
idxRev := revision{main: rev, sub: int64(len(tw.changes))}
revToBytes(idxRev, ibytes)
ver = ver + 1
kv := mvccpb.KeyValue{
Key: key,
Value: value,
CreateRevision: c,
ModRevision: rev,
Version: ver,
Lease: int64(leaseID),
}
d, err := kv.Marshal()
if err != nil {
tw.storeTxnRead.s.lg.Fatal(
"failed to marshal mvccpb.KeyValue",
zap.Error(err),
)
}
tw.trace.Step("marshal mvccpb.KeyValue")
tw.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
tw.s.kvindex.Put(key, idxRev)
tw.changes = append(tw.changes, kv)
tw.trace.Step("store kv pair into bolt db")
if oldLease != lease.NoLease {
if tw.s.le == nil {
panic("no lessor to detach lease")
}
err = tw.s.le.Detach(oldLease, []lease.LeaseItem{{Key: string(key)}})
if err != nil {
tw.storeTxnRead.s.lg.Error(
"failed to detach old lease from a key",
zap.Error(err),
)
}
}
if leaseID != lease.NoLease {
if tw.s.le == nil {
panic("no lessor to attach lease")
}
err = tw.s.le.Attach(leaseID, []lease.LeaseItem{{Key: string(key)}})
if err != nil {
panic("unexpected error from lease Attach")
}
}
tw.trace.Step("attach lease to kv pair")
}
func (tw *storeTxnWrite) deleteRange(key, end []byte) int64 {
rrev := tw.beginRev
if len(tw.changes) > 0 {
rrev++
}
keys, _ := tw.s.kvindex.Range(key, end, rrev)
if len(keys) == 0 {
return 0
}
for _, key := range keys {
tw.delete(key)
}
return int64(len(keys))
}
func (tw *storeTxnWrite) delete(key []byte) {
ibytes := newRevBytes()
idxRev := revision{main: tw.beginRev + 1, sub: int64(len(tw.changes))}
revToBytes(idxRev, ibytes)
ibytes = appendMarkTombstone(tw.storeTxnRead.s.lg, ibytes)
kv := mvccpb.KeyValue{Key: key}
d, err := kv.Marshal()
if err != nil {
tw.storeTxnRead.s.lg.Fatal(
"failed to marshal mvccpb.KeyValue",
zap.Error(err),
)
}
tw.tx.UnsafeSeqPut(keyBucketName, ibytes, d)
err = tw.s.kvindex.Tombstone(key, idxRev)
if err != nil {
tw.storeTxnRead.s.lg.Fatal(
"failed to tombstone an existing key",
zap.String("key", string(key)),
zap.Error(err),
)
}
tw.changes = append(tw.changes, kv)
item := lease.LeaseItem{Key: string(key)}
leaseID := tw.s.le.GetLease(item)
if leaseID != lease.NoLease {
err = tw.s.le.Detach(leaseID, []lease.LeaseItem{item})
if err != nil {
tw.storeTxnRead.s.lg.Error(
"failed to detach old lease from a key",
zap.Error(err),
)
}
}
}
func (tw *storeTxnWrite) Changes() []mvccpb.KeyValue { return tw.changes }
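rangeKeys above is a two-level lookup: the in-memory index resolves a key range at a revision into (main, sub) pairs, and only those pairs are then fetched from bolt. A hedged sketch of that shape with plain maps standing in for both levels (all names illustrative):

package main

import "fmt"

type rev struct{ main, sub int64 }

func main() {
	// level 1: in-memory index, key -> revisions at which it changed
	index := map[string][]rev{"foo": {{2, 0}, {3, 0}}}
	// level 2: persistent store, revision -> encoded value
	values := map[rev][]byte{{2, 0}: []byte("bar"), {3, 0}: []byte("bar1")}

	// range "foo" at its latest revision: resolve revisions in memory,
	// then fetch the value, mirroring kvindex.Revisions + tx.UnsafeRange.
	revs := index["foo"]
	latest := revs[len(revs)-1]
	fmt.Printf("foo@%d = %s\n", latest.main, values[latest])
}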

354
server/mvcc/metrics.go Normal file

@@ -0,0 +1,354 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
)
var (
rangeCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "range_total",
Help: "Total number of ranges seen by this member.",
})
rangeCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "range_total",
Help: "Total number of ranges seen by this member.",
})
putCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "put_total",
Help: "Total number of puts seen by this member.",
})
// TODO: remove in 3.5 release
putCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "put_total",
Help: "Total number of puts seen by this member.",
})
deleteCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "delete_total",
Help: "Total number of deletes seen by this member.",
})
// TODO: remove in 3.5 release
deleteCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "delete_total",
Help: "Total number of deletes seen by this member.",
})
txnCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "txn_total",
Help: "Total number of txns seen by this member.",
})
txnCounterDebug = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "txn_total",
Help: "Total number of txns seen by this member.",
})
keysGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "keys_total",
Help: "Total number of keys.",
})
watchStreamGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "watch_stream_total",
Help: "Total number of watch streams.",
})
watcherGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "watcher_total",
Help: "Total number of watchers.",
})
slowWatcherGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "slow_watcher_total",
Help: "Total number of unsynced slow watchers.",
})
totalEventsCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "events_total",
Help: "Total number of events sent by this member.",
})
pendingEventsGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "pending_events_total",
Help: "Total number of pending events to be sent.",
})
indexCompactionPauseMs = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "index_compaction_pause_duration_milliseconds",
Help: "Bucketed histogram of index compaction pause duration.",
// lowest bucket start of upper bound 0.5 ms with factor 2
// highest bucket start of 0.5 ms * 2^13 == 4.096 sec
Buckets: prometheus.ExponentialBuckets(0.5, 2, 14),
})
dbCompactionPauseMs = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_pause_duration_milliseconds",
Help: "Bucketed histogram of db compaction pause duration.",
// lowest bucket start of upper bound 1 ms with factor 2
// highest bucket start of 1 ms * 2^12 == 4.096 sec
Buckets: prometheus.ExponentialBuckets(1, 2, 13),
})
dbCompactionTotalMs = prometheus.NewHistogram(
prometheus.HistogramOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_total_duration_milliseconds",
Help: "Bucketed histogram of db compaction total duration.",
// lowest bucket start of upper bound 100 ms with factor 2
// highest bucket start of 100 ms * 2^13 == 8.192 sec
Buckets: prometheus.ExponentialBuckets(100, 2, 14),
})
dbCompactionLast = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_last",
Help: "The unix time of the last db compaction. Resets to 0 on start.",
})
dbCompactionKeysCounter = prometheus.NewCounter(
prometheus.CounterOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_compaction_keys_total",
Help: "Total number of db keys compacted.",
})
dbTotalSize = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "db_total_size_in_bytes",
Help: "Total size of the underlying database physically allocated in bytes.",
},
func() float64 {
reportDbTotalSizeInBytesMu.RLock()
defer reportDbTotalSizeInBytesMu.RUnlock()
return reportDbTotalSizeInBytes()
},
)
// overridden by mvcc initialization
reportDbTotalSizeInBytesMu sync.RWMutex
reportDbTotalSizeInBytes = func() float64 { return 0 }
// TODO: remove this in v3.5
dbTotalSizeDebug = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "db_total_size_in_bytes",
Help: "Total size of the underlying database physically allocated in bytes.",
},
func() float64 {
reportDbTotalSizeInBytesDebugMu.RLock()
defer reportDbTotalSizeInBytesDebugMu.RUnlock()
return reportDbTotalSizeInBytesDebug()
},
)
// overridden by mvcc initialization
reportDbTotalSizeInBytesDebugMu sync.RWMutex
reportDbTotalSizeInBytesDebug = func() float64 { return 0 }
dbTotalSizeInUse = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "db_total_size_in_use_in_bytes",
Help: "Total size of the underlying database logically in use in bytes.",
},
func() float64 {
reportDbTotalSizeInUseInBytesMu.RLock()
defer reportDbTotalSizeInUseInBytesMu.RUnlock()
return reportDbTotalSizeInUseInBytes()
},
)
// overridden by mvcc initialization
reportDbTotalSizeInUseInBytesMu sync.RWMutex
reportDbTotalSizeInUseInBytes = func() float64 { return 0 }
dbOpenReadTxN = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "db_open_read_transactions",
Help: "The number of currently open read transactions",
},
func() float64 {
reportDbOpenReadTxNMu.RLock()
defer reportDbOpenReadTxNMu.RUnlock()
return reportDbOpenReadTxN()
},
)
// overridden by mvcc initialization
reportDbOpenReadTxNMu sync.RWMutex
reportDbOpenReadTxN = func() float64 { return 0 }
hashSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "hash_duration_seconds",
Help: "The latency distribution of storage hash operation.",
// 100 MB usually takes 100 ms, so start with 10 MB of 10 ms
// lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
// highest bucket start of 0.01 sec * 2^14 == 163.84 sec
Buckets: prometheus.ExponentialBuckets(.01, 2, 15),
})
hashRevSec = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: "etcd",
Subsystem: "mvcc",
Name: "hash_rev_duration_seconds",
Help: "The latency distribution of storage hash by revision operation.",
// 100 MB usually takes 100 ms, so start with 10 MB of 10 ms
// lowest bucket start of upper bound 0.01 sec (10 ms) with factor 2
// highest bucket start of 0.01 sec * 2^14 == 163.84 sec
Buckets: prometheus.ExponentialBuckets(.01, 2, 15),
})
currentRev = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "current_revision",
Help: "The current revision of store.",
},
func() float64 {
reportCurrentRevMu.RLock()
defer reportCurrentRevMu.RUnlock()
return reportCurrentRev()
},
)
// overridden by mvcc initialization
reportCurrentRevMu sync.RWMutex
reportCurrentRev = func() float64 { return 0 }
compactRev = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "compact_revision",
Help: "The revision of the last compaction in store.",
},
func() float64 {
reportCompactRevMu.RLock()
defer reportCompactRevMu.RUnlock()
return reportCompactRev()
},
)
// overridden by mvcc initialization
reportCompactRevMu sync.RWMutex
reportCompactRev = func() float64 { return 0 }
totalPutSizeGauge = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "etcd_debugging",
Subsystem: "mvcc",
Name: "total_put_size_in_bytes",
Help: "The total size of put kv pairs seen by this member.",
})
)
func init() {
prometheus.MustRegister(rangeCounter)
prometheus.MustRegister(rangeCounterDebug)
prometheus.MustRegister(putCounter)
prometheus.MustRegister(putCounterDebug)
prometheus.MustRegister(deleteCounter)
prometheus.MustRegister(deleteCounterDebug)
prometheus.MustRegister(txnCounter)
prometheus.MustRegister(txnCounterDebug)
prometheus.MustRegister(keysGauge)
prometheus.MustRegister(watchStreamGauge)
prometheus.MustRegister(watcherGauge)
prometheus.MustRegister(slowWatcherGauge)
prometheus.MustRegister(totalEventsCounter)
prometheus.MustRegister(pendingEventsGauge)
prometheus.MustRegister(indexCompactionPauseMs)
prometheus.MustRegister(dbCompactionPauseMs)
prometheus.MustRegister(dbCompactionTotalMs)
prometheus.MustRegister(dbCompactionLast)
prometheus.MustRegister(dbCompactionKeysCounter)
prometheus.MustRegister(dbTotalSize)
prometheus.MustRegister(dbTotalSizeDebug)
prometheus.MustRegister(dbTotalSizeInUse)
prometheus.MustRegister(dbOpenReadTxN)
prometheus.MustRegister(hashSec)
prometheus.MustRegister(hashRevSec)
prometheus.MustRegister(currentRev)
prometheus.MustRegister(compactRev)
prometheus.MustRegister(totalPutSizeGauge)
}
// ReportEventReceived reports that an event was received.
// This function should be called when an external system receives an
// event from mvcc.Watcher.
func ReportEventReceived(n int) {
pendingEventsGauge.Sub(float64(n))
totalEventsCounter.Add(float64(n))
}

71
server/mvcc/metrics_txn.go Normal file

@@ -0,0 +1,71 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import "go.etcd.io/etcd/v3/lease"
type metricsTxnWrite struct {
TxnWrite
ranges uint
puts uint
deletes uint
putSize int64
}
func newMetricsTxnRead(tr TxnRead) TxnRead {
return &metricsTxnWrite{&txnReadWrite{tr}, 0, 0, 0, 0}
}
func newMetricsTxnWrite(tw TxnWrite) TxnWrite {
return &metricsTxnWrite{tw, 0, 0, 0, 0}
}
func (tw *metricsTxnWrite) Range(key, end []byte, ro RangeOptions) (*RangeResult, error) {
tw.ranges++
return tw.TxnWrite.Range(key, end, ro)
}
func (tw *metricsTxnWrite) DeleteRange(key, end []byte) (n, rev int64) {
tw.deletes++
return tw.TxnWrite.DeleteRange(key, end)
}
func (tw *metricsTxnWrite) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
tw.puts++
size := int64(len(key) + len(value))
tw.putSize += size
return tw.TxnWrite.Put(key, value, lease)
}
func (tw *metricsTxnWrite) End() {
defer tw.TxnWrite.End()
if sum := tw.ranges + tw.puts + tw.deletes; sum > 1 {
txnCounter.Inc()
txnCounterDebug.Inc() // TODO: remove in 3.5 release
}
ranges := float64(tw.ranges)
rangeCounter.Add(ranges)
rangeCounterDebug.Add(ranges) // TODO: remove in 3.5 release
puts := float64(tw.puts)
putCounter.Add(puts)
putCounterDebug.Add(puts) // TODO: remove in 3.5 release
totalPutSizeGauge.Add(float64(tw.putSize))
deletes := float64(tw.deletes)
deleteCounter.Add(deletes)
deleteCounterDebug.Add(deletes) // TODO: remove in 3.5 release
}
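metricsTxnWrite is a plain decorator: it embeds the wrapped transaction and intercepts only the methods it wants to count, delegating the rest. A minimal, self-contained sketch of the same embedding pattern (Doer and countingDoer are hypothetical names):

package main

import "fmt"

type Doer interface{ Do() }

type base struct{}

func (base) Do() {}

type countingDoer struct {
	Doer  // embedded: all other methods pass through untouched
	calls int
}

func (c *countingDoer) Do() {
	c.calls++   // count first...
	c.Doer.Do() // ...then delegate to the wrapped implementation
}

func main() {
	c := &countingDoer{Doer: base{}}
	c.Do()
	c.Do()
	fmt.Println(c.calls) // 2
}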

67
server/mvcc/revision.go Normal file

@@ -0,0 +1,67 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import "encoding/binary"
// revBytesLen is the byte length of a normal revision.
// The first 8 bytes are revision.main in big-endian format. The 9th byte
// is '_'. The last 8 bytes are revision.sub in big-endian format.
const revBytesLen = 8 + 1 + 8
// A revision indicates modification of the key-value space.
// A set of changes that shares the same main revision changes the key-value space atomically.
type revision struct {
// main is the main revision of a set of changes that happen atomically.
main int64
// sub is the sub revision of a change in a set of changes that happen
// atomically. Each change in the set has a distinct, increasing sub
// revision.
sub int64
}
func (a revision) GreaterThan(b revision) bool {
if a.main > b.main {
return true
}
if a.main < b.main {
return false
}
return a.sub > b.sub
}
func newRevBytes() []byte {
return make([]byte, revBytesLen, markedRevBytesLen)
}
func revToBytes(rev revision, bytes []byte) {
binary.BigEndian.PutUint64(bytes, uint64(rev.main))
bytes[8] = '_'
binary.BigEndian.PutUint64(bytes[9:], uint64(rev.sub))
}
func bytesToRev(bytes []byte) revision {
return revision{
main: int64(binary.BigEndian.Uint64(bytes[0:8])),
sub: int64(binary.BigEndian.Uint64(bytes[9:])),
}
}
type revisions []revision
func (a revisions) Len() int { return len(a) }
func (a revisions) Less(i, j int) bool { return a[j].GreaterThan(a[i]) }
func (a revisions) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
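A worked example of the encoding above (a hedged sketch, not part of the diff): because main and sub are both written big-endian, lexicographic comparison of the byte slices matches comparison of (main, sub) pairs, which is exactly what the revisions sort order and the bolt key layout rely on.

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// encode revision{main: 2, sub: 1} the same way revToBytes does
	b := make([]byte, 8+1+8)
	binary.BigEndian.PutUint64(b, 2)     // main, big-endian
	b[8] = '_'                           // separator byte
	binary.BigEndian.PutUint64(b[9:], 1) // sub, big-endian
	fmt.Printf("% x\n", b) // 00 00 00 00 00 00 00 02 5f 00 ... 01
}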

53
server/mvcc/revision_test.go Normal file

@@ -0,0 +1,53 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"math"
"reflect"
"testing"
)
// TestRevision tests that a revision can be encoded to and decoded from
// a byte slice. Moreover, the lexicographical order of the byte slice
// representation follows the (main, sub) order.
func TestRevision(t *testing.T) {
tests := []revision{
// order in (main, sub)
{},
{main: 1, sub: 0},
{main: 1, sub: 1},
{main: 2, sub: 0},
{main: math.MaxInt64, sub: math.MaxInt64},
}
bs := make([][]byte, len(tests))
for i, tt := range tests {
b := newRevBytes()
revToBytes(tt, b)
bs[i] = b
if grev := bytesToRev(b); !reflect.DeepEqual(grev, tt) {
t.Errorf("#%d: revision = %+v, want %+v", i, grev, tt)
}
}
for i := 0; i < len(tests)-1; i++ {
if bytes.Compare(bs[i], bs[i+1]) >= 0 {
t.Errorf("#%d: %v (%+v) should be smaller than %v (%+v)", i, bs[i], tests[i], bs[i+1], tests[i+1])
}
}
}

57
server/mvcc/util.go Normal file

@@ -0,0 +1,57 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"encoding/binary"
"fmt"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/v3/mvcc/backend"
)
func UpdateConsistentIndex(be backend.Backend, index uint64) {
tx := be.BatchTx()
tx.Lock()
defer tx.Unlock()
var oldi uint64
_, vs := tx.UnsafeRange(metaBucketName, consistentIndexKeyName, nil, 0)
if len(vs) != 0 {
oldi = binary.BigEndian.Uint64(vs[0])
}
if index <= oldi {
return
}
bs := make([]byte, 8)
binary.BigEndian.PutUint64(bs, index)
tx.UnsafePut(metaBucketName, consistentIndexKeyName, bs)
}
func WriteKV(be backend.Backend, kv mvccpb.KeyValue) {
ibytes := newRevBytes()
revToBytes(revision{main: kv.ModRevision}, ibytes)
d, err := kv.Marshal()
if err != nil {
panic(fmt.Errorf("cannot marshal event: %v", err))
}
be.BatchTx().Lock()
be.BatchTx().UnsafePut(keyBucketName, ibytes, d)
be.BatchTx().Unlock()
}

548
server/mvcc/watchable_store.go Normal file

@@ -0,0 +1,548 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"sync"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
// non-const so modifiable by tests
var (
// chanBufLen is the length of the buffered chan
// for sending out watched events.
// See https://github.com/etcd-io/etcd/issues/11906 for more detail.
chanBufLen = 128
// maxWatchersPerSync is the number of watchers to sync in a single batch
maxWatchersPerSync = 512
)
type watchable interface {
watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse, fcs ...FilterFunc) (*watcher, cancelFunc)
progress(w *watcher)
rev() int64
}
type watchableStore struct {
*store
// mu protects watcher groups and batches. It should never be locked
// before locking store.mu to avoid deadlock.
mu sync.RWMutex
// victims are watcher batches that were blocked on the watch channel
victims []watcherBatch
victimc chan struct{}
// contains all unsynced watchers that need to sync with events that have happened
unsynced watcherGroup
// contains all synced watchers that are in sync with the progress of the store.
// The key of the map is the key that the watcher watches on.
synced watcherGroup
stopc chan struct{}
wg sync.WaitGroup
}
// cancelFunc updates unsynced and synced maps when running
// cancel operations.
type cancelFunc func()
func New(lg *zap.Logger, b backend.Backend, le lease.Lessor, ci cindex.ConsistentIndexer, cfg StoreConfig) ConsistentWatchableKV {
return newWatchableStore(lg, b, le, ci, cfg)
}
func newWatchableStore(lg *zap.Logger, b backend.Backend, le lease.Lessor, ci cindex.ConsistentIndexer, cfg StoreConfig) *watchableStore {
if lg == nil {
lg = zap.NewNop()
}
s := &watchableStore{
store: NewStore(lg, b, le, ci, cfg),
victimc: make(chan struct{}, 1),
unsynced: newWatcherGroup(),
synced: newWatcherGroup(),
stopc: make(chan struct{}),
}
s.store.ReadView = &readView{s}
s.store.WriteView = &writeView{s}
if s.le != nil {
// use this store as the deleter so revokes trigger watch events
s.le.SetRangeDeleter(func() lease.TxnDelete { return s.Write(traceutil.TODO()) })
}
s.wg.Add(2)
go s.syncWatchersLoop()
go s.syncVictimsLoop()
return s
}
func (s *watchableStore) Close() error {
close(s.stopc)
s.wg.Wait()
return s.store.Close()
}
func (s *watchableStore) NewWatchStream() WatchStream {
watchStreamGauge.Inc()
return &watchStream{
watchable: s,
ch: make(chan WatchResponse, chanBufLen),
cancels: make(map[WatchID]cancelFunc),
watchers: make(map[WatchID]*watcher),
}
}
func (s *watchableStore) watch(key, end []byte, startRev int64, id WatchID, ch chan<- WatchResponse, fcs ...FilterFunc) (*watcher, cancelFunc) {
wa := &watcher{
key: key,
end: end,
minRev: startRev,
id: id,
ch: ch,
fcs: fcs,
}
s.mu.Lock()
s.revMu.RLock()
synced := startRev > s.store.currentRev || startRev == 0
if synced {
wa.minRev = s.store.currentRev + 1
if startRev > wa.minRev {
wa.minRev = startRev
}
s.synced.add(wa)
} else {
slowWatcherGauge.Inc()
s.unsynced.add(wa)
}
s.revMu.RUnlock()
s.mu.Unlock()
watcherGauge.Inc()
return wa, func() { s.cancelWatcher(wa) }
}
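// Editor's note — hypothetical values, not part of the original file: with
// s.store.currentRev == 5, the classification above works out to:
//
//	watch(k, nil, 0, ...) // startRev 0 -> synced,   minRev = 6 (next change)
//	watch(k, nil, 9, ...) // startRev 9 -> synced,   minRev = 9 (future rev)
//	watch(k, nil, 3, ...) // startRev 3 -> unsynced, minRev = 3 (replay history)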
// cancelWatcher removes references of the watcher from the watchableStore
func (s *watchableStore) cancelWatcher(wa *watcher) {
for {
s.mu.Lock()
if s.unsynced.delete(wa) {
slowWatcherGauge.Dec()
watcherGauge.Dec()
break
} else if s.synced.delete(wa) {
watcherGauge.Dec()
break
} else if wa.compacted {
watcherGauge.Dec()
break
} else if wa.ch == nil {
// already canceled (e.g., cancel/close race)
break
}
if !wa.victim {
s.mu.Unlock()
panic("watcher not victim but not in watch groups")
}
var victimBatch watcherBatch
for _, wb := range s.victims {
if wb[wa] != nil {
victimBatch = wb
break
}
}
if victimBatch != nil {
slowWatcherGauge.Dec()
watcherGauge.Dec()
delete(victimBatch, wa)
break
}
// victim being processed so not accessible; retry
s.mu.Unlock()
time.Sleep(time.Millisecond)
}
wa.ch = nil
s.mu.Unlock()
}
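// Editor's note: cancelWatcher retries because moveVictims may temporarily
// own the victim batch (it swaps s.victims to nil while sending); after a
// short sleep the watcher reappears either in s.victims or in a watch group,
// and one of the branches above succeeds.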
func (s *watchableStore) Restore(b backend.Backend) error {
s.mu.Lock()
defer s.mu.Unlock()
err := s.store.Restore(b)
if err != nil {
return err
}
for wa := range s.synced.watchers {
wa.restore = true
s.unsynced.add(wa)
}
s.synced = newWatcherGroup()
return nil
}
// syncWatchersLoop syncs the watchers in the unsynced map every 100ms.
func (s *watchableStore) syncWatchersLoop() {
defer s.wg.Done()
for {
s.mu.RLock()
st := time.Now()
lastUnsyncedWatchers := s.unsynced.size()
s.mu.RUnlock()
unsyncedWatchers := 0
if lastUnsyncedWatchers > 0 {
unsyncedWatchers = s.syncWatchers()
}
syncDuration := time.Since(st)
waitDuration := 100 * time.Millisecond
// more work pending?
if unsyncedWatchers != 0 && lastUnsyncedWatchers > unsyncedWatchers {
// be fair to other store operations by yielding time taken
waitDuration = syncDuration
}
select {
case <-time.After(waitDuration):
case <-s.stopc:
return
}
}
}
// syncVictimsLoop tries to write precomputed watcher responses to
// watchers that had a blocked watcher channel
func (s *watchableStore) syncVictimsLoop() {
defer s.wg.Done()
for {
for s.moveVictims() != 0 {
// try to update all victim watchers
}
s.mu.RLock()
isEmpty := len(s.victims) == 0
s.mu.RUnlock()
var tickc <-chan time.Time
if !isEmpty {
tickc = time.After(10 * time.Millisecond)
}
select {
case <-tickc:
case <-s.victimc:
case <-s.stopc:
return
}
}
}
// moveVictims tries to update watches with already pending event data
func (s *watchableStore) moveVictims() (moved int) {
s.mu.Lock()
victims := s.victims
s.victims = nil
s.mu.Unlock()
var newVictim watcherBatch
for _, wb := range victims {
// try to send responses again
for w, eb := range wb {
// watcher has observed the store up to, but not including, w.minRev
rev := w.minRev - 1
if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}) {
pendingEventsGauge.Add(float64(len(eb.evs)))
} else {
if newVictim == nil {
newVictim = make(watcherBatch)
}
newVictim[w] = eb
continue
}
moved++
}
// assign completed victim watchers to unsynced/synced
s.mu.Lock()
s.store.revMu.RLock()
curRev := s.store.currentRev
for w, eb := range wb {
if newVictim != nil && newVictim[w] != nil {
// couldn't send watch response; stays victim
continue
}
w.victim = false
if eb.moreRev != 0 {
w.minRev = eb.moreRev
}
if w.minRev <= curRev {
s.unsynced.add(w)
} else {
slowWatcherGauge.Dec()
s.synced.add(w)
}
}
s.store.revMu.RUnlock()
s.mu.Unlock()
}
if len(newVictim) > 0 {
s.mu.Lock()
s.victims = append(s.victims, newVictim)
s.mu.Unlock()
}
return moved
}
// syncWatchers syncs unsynced watchers by:
// 1. choose a set of watchers from the unsynced watcher group
// 2. iterate over the set to get the minimum revision and remove compacted watchers
// 3. use minimum revision to get all key-value pairs and send those events to watchers
// 4. remove synced watchers in set from unsynced group and move to synced group
func (s *watchableStore) syncWatchers() int {
s.mu.Lock()
defer s.mu.Unlock()
if s.unsynced.size() == 0 {
return 0
}
s.store.revMu.RLock()
defer s.store.revMu.RUnlock()
// in order to find key-value pairs from unsynced watchers, we need to
// find the minimum revision, which is then used to query the backend
// store for key-value pairs
curRev := s.store.currentRev
compactionRev := s.store.compactMainRev
wg, minRev := s.unsynced.choose(maxWatchersPerSync, curRev, compactionRev)
minBytes, maxBytes := newRevBytes(), newRevBytes()
revToBytes(revision{main: minRev}, minBytes)
revToBytes(revision{main: curRev + 1}, maxBytes)
// UnsafeRange returns keys and values. In boltdb, keys are revisions;
// values are the actual key-value pairs in the backend.
tx := s.store.b.ReadTx()
tx.RLock()
revs, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
tx.RUnlock()
evs := kvsToEvents(s.store.lg, wg, revs, vs)
var victims watcherBatch
wb := newWatcherBatch(wg, evs)
for w := range wg.watchers {
w.minRev = curRev + 1
eb, ok := wb[w]
if !ok {
// bring un-notified watcher to synced
s.synced.add(w)
s.unsynced.delete(w)
continue
}
if eb.moreRev != 0 {
w.minRev = eb.moreRev
}
if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: curRev}) {
pendingEventsGauge.Add(float64(len(eb.evs)))
} else {
if victims == nil {
victims = make(watcherBatch)
}
w.victim = true
}
if w.victim {
victims[w] = eb
} else {
if eb.moreRev != 0 {
// stay unsynced; more to read
continue
}
s.synced.add(w)
}
s.unsynced.delete(w)
}
s.addVictim(victims)
vsz := 0
for _, v := range s.victims {
vsz += len(v)
}
slowWatcherGauge.Set(float64(s.unsynced.size() + vsz))
return s.unsynced.size()
}
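// Editor's note — a sketch with hypothetical revisions, not part of the
// original file: the key bucket is keyed by encoded revisions, so the range
// scan in syncWatchers replays history. For minRev = 3 and curRev = 5, the
// scanned range is built as:
//
//	minBytes, maxBytes := newRevBytes(), newRevBytes()
//	revToBytes(revision{main: 3}, minBytes)
//	revToBytes(revision{main: 6}, maxBytes) // curRev+1 is the exclusive upper bound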
// kvsToEvents gets all events for the watchers from all key-value pairs
func kvsToEvents(lg *zap.Logger, wg *watcherGroup, revs, vals [][]byte) (evs []mvccpb.Event) {
for i, v := range vals {
var kv mvccpb.KeyValue
if err := kv.Unmarshal(v); err != nil {
lg.Panic("failed to unmarshal mvccpb.KeyValue", zap.Error(err))
}
if !wg.contains(string(kv.Key)) {
continue
}
ty := mvccpb.PUT
if isTombstone(revs[i]) {
ty = mvccpb.DELETE
// patch in mod revision so watchers won't skip
kv.ModRevision = bytesToRev(revs[i]).main
}
evs = append(evs, mvccpb.Event{Kv: &kv, Type: ty})
}
return evs
}
// notify sends the given events, which happened at the given rev, to the
// watchers that watch on the keys of those events.
func (s *watchableStore) notify(rev int64, evs []mvccpb.Event) {
var victim watcherBatch
for w, eb := range newWatcherBatch(&s.synced, evs) {
if eb.revs != 1 {
s.store.lg.Panic(
"unexpected multiple revisions in watch notification",
zap.Int("number-of-revisions", eb.revs),
)
}
if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}) {
pendingEventsGauge.Add(float64(len(eb.evs)))
} else {
// move slow watcher to victims
w.minRev = rev + 1
if victim == nil {
victim = make(watcherBatch)
}
w.victim = true
victim[w] = eb
s.synced.delete(w)
slowWatcherGauge.Inc()
}
}
s.addVictim(victim)
}
func (s *watchableStore) addVictim(victim watcherBatch) {
if victim == nil {
return
}
s.victims = append(s.victims, victim)
select {
case s.victimc <- struct{}{}:
default:
}
}
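// Editor's note: the non-blocking send in addVictim coalesces wakeups; if a
// signal is already pending on victimc, dropping this one is safe because
// syncVictimsLoop drains every victim batch on its next pass.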
func (s *watchableStore) rev() int64 { return s.store.Rev() }
func (s *watchableStore) progress(w *watcher) {
s.mu.RLock()
defer s.mu.RUnlock()
if _, ok := s.synced.watchers[w]; ok {
w.send(WatchResponse{WatchID: w.id, Revision: s.rev()})
// If the ch is full, this watcher is receiving events.
// We do not need to send progress at all.
}
}
type watcher struct {
// the watcher key
key []byte
// end indicates the end of the range to watch.
// If end is set, the watcher is on a range.
end []byte
// victim is set when ch is blocked and undergoing victim processing
victim bool
// compacted is set when the watcher is removed because of compaction
compacted bool
// restore is true when the watcher is being restored from a leader snapshot,
// which means that this watcher has just been moved from the "synced" to the
// "unsynced" watcher group, possibly with a future revision from when it was
// first added to the synced watcher group.
// An "unsynced" watcher's revision must always be <= the current revision,
// except when the watcher was moved from the "synced" watcher group.
restore bool
// minRev is the minimum revision update the watcher will accept
minRev int64
id WatchID
fcs []FilterFunc
// a chan to send out the watch response.
// The chan might be shared with other watchers.
ch chan<- WatchResponse
}
func (w *watcher) send(wr WatchResponse) bool {
progressEvent := len(wr.Events) == 0
if len(w.fcs) != 0 {
ne := make([]mvccpb.Event, 0, len(wr.Events))
for i := range wr.Events {
filtered := false
for _, filter := range w.fcs {
if filter(wr.Events[i]) {
filtered = true
break
}
}
if !filtered {
ne = append(ne, wr.Events[i])
}
}
wr.Events = ne
}
// if all events are filtered out, we should send nothing.
if !progressEvent && len(wr.Events) == 0 {
return true
}
select {
case w.ch <- wr:
return true
default:
return false
}
}
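// Editor's note — an illustrative sketch, not part of the original file:
// FilterFuncs drop individual events before the send. For example, a
// hypothetical filter that suppresses PUT events (as in
// TestWatcherWatchWithFilter in the tests below):
//
//	noPuts := func(e mvccpb.Event) bool { return e.Type == mvccpb.PUT }
//	w.Watch(AutoWatchID, []byte("foo"), nil, 0, noPuts)
//
// If every event is filtered out of a non-progress response, send reports
// success without writing to the channel, so the watcher is not made a victim.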

218
server/mvcc/watchable_store_bench_test.go Normal file
View File

@@ -0,0 +1,218 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"math/rand"
"os"
"testing"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func BenchmarkWatchableStorePut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := New(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
s.Put(keys[i], vals[i], lease.NoLease)
}
}
// BenchmarkWatchableStoreTxnPut benchmarks the Put operation
// with transaction begin and end, where transaction involves
// some synchronization operations, such as mutex locking.
func BenchmarkWatchableStoreTxnPut(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := New(zap.NewExample(), be, &lease.FakeLessor{}, cindex.NewConsistentIndex(be.BatchTx()), StoreConfig{})
defer cleanup(s, be, tmpPath)
// arbitrary number of bytes
bytesN := 64
keys := createBytesSlice(bytesN, b.N)
vals := createBytesSlice(bytesN, b.N)
b.ResetTimer()
b.ReportAllocs()
for i := 0; i < b.N; i++ {
txn := s.Write(traceutil.TODO())
txn.Put(keys[i], vals[i], lease.NoLease)
txn.End()
}
}
// BenchmarkWatchableStoreWatchPutSync benchmarks the case of
// many synced watchers receiving a Put notification.
func BenchmarkWatchableStoreWatchPutSync(b *testing.B) {
benchmarkWatchableStoreWatchPut(b, true)
}
// BenchmarkWatchableStoreWatchPutUnsync benchmarks the case of
// many unsynced watchers receiving a Put notification.
func BenchmarkWatchableStoreWatchPutUnsync(b *testing.B) {
benchmarkWatchableStoreWatchPut(b, false)
}
func benchmarkWatchableStoreWatchPut(b *testing.B, synced bool) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(s, be, tmpPath)
k := []byte("testkey")
v := []byte("testval")
rev := int64(0)
if !synced {
// non-0 value to keep watchers in unsynced
rev = 1
}
w := s.NewWatchStream()
defer w.Close()
watchIDs := make([]WatchID, b.N)
for i := range watchIDs {
watchIDs[i], _ = w.Watch(0, k, nil, rev)
}
b.ResetTimer()
b.ReportAllocs()
// trigger watchers
s.Put(k, v, lease.NoLease)
for range watchIDs {
<-w.Chan()
}
select {
case wc := <-w.Chan():
b.Fatalf("unexpected data %v", wc)
default:
}
}
// Benchmarks the cancel function performance for unsynced watchers
// in a WatchableStore. It creates k*N watchers to populate unsynced
// with a reasonably large number of watchers, then measures the time it
// takes to cancel N watchers out of k*N watchers. The performance is
// expected to differ depending on the unsynced member implementation.
// TODO: k is an arbitrary constant. We need to figure out what factor
// we should put to simulate the real-world use cases.
func BenchmarkWatchableStoreUnsyncedCancel(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := NewStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
// manually create watchableStore instead of newWatchableStore
// because newWatchableStore periodically calls syncWatchersLoop
// method to sync watchers in unsynced map. We want to keep watchers
// in unsynced for this benchmark.
ws := &watchableStore{
store: s,
unsynced: newWatcherGroup(),
// to make the test not crash from assigning to nil map.
// 'synced' doesn't get populated in this test.
synced: newWatcherGroup(),
}
defer func() {
ws.store.Close()
os.Remove(tmpPath)
}()
// Put a key so that we can spawn watchers on that key
// (testKey in this test). This increases the rev to 1,
// and later we can set the watcher's startRev to 1,
// and force watchers to be in unsynced.
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := ws.NewWatchStream()
const k int = 2
benchSampleN := b.N
watcherN := k * benchSampleN
watchIDs := make([]WatchID, watcherN)
for i := 0; i < watcherN; i++ {
// non-0 value to keep watchers in unsynced
watchIDs[i], _ = w.Watch(0, testKey, nil, 1)
}
// random-cancel N watchers to make it not biased towards
// data structures with an order, such as slice.
ix := rand.Perm(watcherN)
b.ResetTimer()
b.ReportAllocs()
// cancel N watchers
for _, idx := range ix[:benchSampleN] {
if err := w.Cancel(watchIDs[idx]); err != nil {
b.Error(err)
}
}
}
func BenchmarkWatchableStoreSyncedCancel(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
// Put a key so that we can spawn watchers on that key
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
// put 1 million watchers on the same key
const watcherN = 1000000
watchIDs := make([]WatchID, watcherN)
for i := 0; i < watcherN; i++ {
// 0 for startRev to keep watchers in synced
watchIDs[i], _ = w.Watch(0, testKey, nil, 0)
}
// randomly cancel watchers to make it not biased towards
// data structures with an order, such as slice.
ix := rand.Perm(watcherN)
b.ResetTimer()
b.ReportAllocs()
for _, idx := range ix {
if err := w.Cancel(watchIDs[idx]); err != nil {
b.Error(err)
}
}
}

655
server/mvcc/watchable_store_test.go Normal file
View File

@@ -0,0 +1,655 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"fmt"
"os"
"reflect"
"sync"
"testing"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
"go.etcd.io/etcd/v3/etcdserver/cindex"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func TestWatch(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
w.Watch(0, testKey, nil, 0)
if !s.synced.contains(string(testKey)) {
// the key must have had an entry in synced
t.Errorf("existence = false, want true")
}
}
func TestNewWatcherCancel(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
wt, _ := w.Watch(0, testKey, nil, 0)
if err := w.Cancel(wt); err != nil {
t.Error(err)
}
if s.synced.contains(string(testKey)) {
// the key should have been deleted
t.Errorf("existence = true, want false")
}
}
// TestCancelUnsynced tests if running CancelFunc removes watchers from unsynced.
func TestCancelUnsynced(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
// manually create watchableStore instead of newWatchableStore
// because newWatchableStore automatically calls syncWatchers
// method to sync watchers in unsynced map. We want to keep watchers
// in unsynced to test if cancel works as expected.
s := &watchableStore{
store: NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}),
unsynced: newWatcherGroup(),
// to make the test not crash from assigning to nil map.
// 'synced' doesn't get populated in this test.
synced: newWatcherGroup(),
}
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
// Put a key so that we can spawn watchers on that key.
// (testKey in this test). This increases the rev to 1,
// and later we can set the watcher's startRev to 1,
// and force watchers to be in unsynced.
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
// arbitrary number for watchers
watcherN := 100
// create watcherN watch ids to cancel
watchIDs := make([]WatchID, watcherN)
for i := 0; i < watcherN; i++ {
// use 1 to keep watchers in unsynced
watchIDs[i], _ = w.Watch(0, testKey, nil, 1)
}
for _, idx := range watchIDs {
if err := w.Cancel(idx); err != nil {
t.Error(err)
}
}
// After running CancelFunc
//
// unsynced should be empty
// because cancel removes watcher from unsynced
if size := s.unsynced.size(); size != 0 {
t.Errorf("unsynced size = %d, want 0", size)
}
}
// TestSyncWatchers populates unsynced watcher map and tests syncWatchers
// method to see if it correctly sends events to channel of unsynced watchers
// and moves these watchers to synced.
func TestSyncWatchers(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := &watchableStore{
store: NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}),
unsynced: newWatcherGroup(),
synced: newWatcherGroup(),
}
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
// arbitrary number for watchers
watcherN := 100
for i := 0; i < watcherN; i++ {
// specify rev as 1 to keep watchers in unsynced
w.Watch(0, testKey, nil, 1)
}
// Before running s.syncWatchers() synced should be empty because we manually
// populate unsynced only
sws := s.synced.watcherSetByKey(string(testKey))
uws := s.unsynced.watcherSetByKey(string(testKey))
if len(sws) != 0 {
t.Fatalf("synced[string(testKey)] size = %d, want 0", len(sws))
}
// unsynced should not be empty because we manually populated unsynced only
if len(uws) != watcherN {
t.Errorf("unsynced size = %d, want %d", len(uws), watcherN)
}
// this should move all unsynced watchers to synced ones
s.syncWatchers()
sws = s.synced.watcherSetByKey(string(testKey))
uws = s.unsynced.watcherSetByKey(string(testKey))
// After running s.syncWatchers(), synced should not be empty because syncWatchers
// populates synced in this test case
if len(sws) != watcherN {
t.Errorf("synced[string(testKey)] size = %d, want %d", len(sws), watcherN)
}
// unsynced should be empty because syncWatchers is expected to move all watchers
// from unsynced to synced in this test case
if len(uws) != 0 {
t.Errorf("unsynced size = %d, want 0", len(uws))
}
for w := range sws {
if w.minRev != s.Rev()+1 {
t.Errorf("w.minRev = %d, want %d", w.minRev, s.Rev()+1)
}
}
if len(w.(*watchStream).ch) != watcherN {
t.Errorf("watched event size = %d, want %d", len(w.(*watchStream).ch), watcherN)
}
evs := (<-w.(*watchStream).ch).Events
if len(evs) != 1 {
t.Errorf("len(evs) got = %d, want = 1", len(evs))
}
if evs[0].Type != mvccpb.PUT {
t.Errorf("got = %v, want = %v", evs[0].Type, mvccpb.PUT)
}
if !bytes.Equal(evs[0].Kv.Key, testKey) {
t.Errorf("got = %s, want = %s", evs[0].Kv.Key, testKey)
}
if !bytes.Equal(evs[0].Kv.Value, testValue) {
t.Errorf("got = %s, want = %s", evs[0].Kv.Value, testValue)
}
}
// TestWatchCompacted tests a watcher that watches on a compacted revision.
func TestWatchCompacted(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
maxRev := 10
compactRev := int64(5)
for i := 0; i < maxRev; i++ {
s.Put(testKey, testValue, lease.NoLease)
}
_, err := s.Compact(traceutil.TODO(), compactRev)
if err != nil {
t.Fatalf("failed to compact kv (%v)", err)
}
w := s.NewWatchStream()
wt, _ := w.Watch(0, testKey, nil, compactRev-1)
select {
case resp := <-w.Chan():
if resp.WatchID != wt {
t.Errorf("resp.WatchID = %x, want %x", resp.WatchID, wt)
}
if resp.CompactRevision == 0 {
t.Errorf("resp.Compacted = %v, want %v", resp.CompactRevision, compactRev)
}
case <-time.After(1 * time.Second):
t.Fatalf("failed to receive response (timeout)")
}
}
func TestWatchFutureRev(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
testValue := []byte("bar")
w := s.NewWatchStream()
wrev := int64(10)
w.Watch(0, testKey, nil, wrev)
for i := 0; i < 10; i++ {
rev := s.Put(testKey, testValue, lease.NoLease)
if rev >= wrev {
break
}
}
select {
case resp := <-w.Chan():
if resp.Revision != wrev {
t.Fatalf("rev = %d, want %d", resp.Revision, wrev)
}
if len(resp.Events) != 1 {
t.Fatalf("failed to get events from the response")
}
if resp.Events[0].Kv.ModRevision != wrev {
t.Fatalf("kv.rev = %d, want %d", resp.Events[0].Kv.ModRevision, wrev)
}
case <-time.After(time.Second):
t.Fatal("failed to receive event in 1 second.")
}
}
func TestWatchRestore(t *testing.T) {
test := func(delay time.Duration) func(t *testing.T) {
return func(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, cindex.NewConsistentIndex(b.BatchTx()), StoreConfig{})
defer cleanup(s, b, tmpPath)
testKey := []byte("foo")
testValue := []byte("bar")
rev := s.Put(testKey, testValue, lease.NoLease)
newBackend, newPath := backend.NewDefaultTmpBackend()
newStore := newWatchableStore(zap.NewExample(), newBackend, &lease.FakeLessor{}, cindex.NewConsistentIndex(newBackend.BatchTx()), StoreConfig{})
defer cleanup(newStore, newBackend, newPath)
w := newStore.NewWatchStream()
w.Watch(0, testKey, nil, rev-1)
time.Sleep(delay)
newStore.Restore(b)
select {
case resp := <-w.Chan():
if resp.Revision != rev {
t.Fatalf("rev = %d, want %d", resp.Revision, rev)
}
if len(resp.Events) != 1 {
t.Fatalf("failed to get events from the response")
}
if resp.Events[0].Kv.ModRevision != rev {
t.Fatalf("kv.rev = %d, want %d", resp.Events[0].Kv.ModRevision, rev)
}
case <-time.After(time.Second):
t.Fatal("failed to receive event in 1 second.")
}
}
}
t.Run("Normal", test(0))
t.Run("RunSyncWatchLoopBeforeRestore", test(time.Millisecond*120)) // longer than default waitDuration
}
// TestWatchRestoreSyncedWatcher tests such a case that:
// 1. watcher is created with a future revision "math.MaxInt64 - 2"
// 2. watcher with a future revision is added to "synced" watcher group
// 3. restore/overwrite storage with snapshot of a higher last revision
// 4. restore operation moves "synced" to "unsynced" watcher group
// 5. choose the watcher from step 1, without panic
func TestWatchRestoreSyncedWatcher(t *testing.T) {
b1, b1Path := backend.NewDefaultTmpBackend()
s1 := newWatchableStore(zap.NewExample(), b1, &lease.FakeLessor{}, cindex.NewConsistentIndex(b1.BatchTx()), StoreConfig{})
defer cleanup(s1, b1, b1Path)
b2, b2Path := backend.NewDefaultTmpBackend()
s2 := newWatchableStore(zap.NewExample(), b2, &lease.FakeLessor{}, cindex.NewConsistentIndex(b2.BatchTx()), StoreConfig{})
defer cleanup(s2, b2, b2Path)
testKey, testValue := []byte("foo"), []byte("bar")
rev := s1.Put(testKey, testValue, lease.NoLease)
startRev := rev + 2
// create a watcher with a future revision
// add to "synced" watcher group (startRev > s.store.currentRev)
w1 := s1.NewWatchStream()
w1.Watch(0, testKey, nil, startRev)
// make "s2" ends up with a higher last revision
s2.Put(testKey, testValue, lease.NoLease)
s2.Put(testKey, testValue, lease.NoLease)
// overwrite storage with higher revisions
if err := s1.Restore(b2); err != nil {
t.Fatal(err)
}
// wait for next "syncWatchersLoop" iteration
// and the unsynced watcher should be chosen
time.Sleep(2 * time.Second)
// trigger events for "startRev"
s1.Put(testKey, testValue, lease.NoLease)
select {
case resp := <-w1.Chan():
if resp.Revision != startRev {
t.Fatalf("resp.Revision expect %d, got %d", startRev, resp.Revision)
}
if len(resp.Events) != 1 {
t.Fatalf("len(resp.Events) expect 1, got %d", len(resp.Events))
}
if resp.Events[0].Kv.ModRevision != startRev {
t.Fatalf("resp.Events[0].Kv.ModRevision expect %d, got %d", startRev, resp.Events[0].Kv.ModRevision)
}
case <-time.After(time.Second):
t.Fatal("failed to receive event in 1 second")
}
}
// TestWatchBatchUnsynced tests batching on unsynced watchers
func TestWatchBatchUnsynced(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
oldMaxRevs := watchBatchMaxRevs
defer func() {
watchBatchMaxRevs = oldMaxRevs
s.store.Close()
os.Remove(tmpPath)
}()
batches := 3
watchBatchMaxRevs = 4
v := []byte("foo")
for i := 0; i < watchBatchMaxRevs*batches; i++ {
s.Put(v, v, lease.NoLease)
}
w := s.NewWatchStream()
w.Watch(0, v, nil, 1)
for i := 0; i < batches; i++ {
if resp := <-w.Chan(); len(resp.Events) != watchBatchMaxRevs {
t.Fatalf("len(events) = %d, want %d", len(resp.Events), watchBatchMaxRevs)
}
}
s.store.revMu.Lock()
defer s.store.revMu.Unlock()
if size := s.synced.size(); size != 1 {
t.Errorf("synced size = %d, want 1", size)
}
}
func TestNewMapwatcherToEventMap(t *testing.T) {
k0, k1, k2 := []byte("foo0"), []byte("foo1"), []byte("foo2")
v0, v1, v2 := []byte("bar0"), []byte("bar1"), []byte("bar2")
ws := []*watcher{{key: k0}, {key: k1}, {key: k2}}
evs := []mvccpb.Event{
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{Key: k0, Value: v0},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{Key: k1, Value: v1},
},
{
Type: mvccpb.PUT,
Kv: &mvccpb.KeyValue{Key: k2, Value: v2},
},
}
tests := []struct {
sync []*watcher
evs []mvccpb.Event
wwe map[*watcher][]mvccpb.Event
}{
// no watcher in sync, some events should return empty wwe
{
nil,
evs,
map[*watcher][]mvccpb.Event{},
},
// one watcher in sync, one event that does not match the key of that
// watcher should return empty wwe
{
[]*watcher{ws[2]},
evs[:1],
map[*watcher][]mvccpb.Event{},
},
// one watcher in sync, one event that matches the key of that
// watcher should return wwe with that matching watcher
{
[]*watcher{ws[1]},
evs[1:2],
map[*watcher][]mvccpb.Event{
ws[1]: evs[1:2],
},
},
// two watchers in sync that watches two different keys, one event
// that matches the key of only one of the watcher should return wwe
// with the matching watcher
{
[]*watcher{ws[0], ws[2]},
evs[2:],
map[*watcher][]mvccpb.Event{
ws[2]: evs[2:],
},
},
// two watchers in sync that watches the same key, two events that
// match the keys should return wwe with those two watchers
{
[]*watcher{ws[0], ws[1]},
evs[:2],
map[*watcher][]mvccpb.Event{
ws[0]: evs[:1],
ws[1]: evs[1:2],
},
},
}
for i, tt := range tests {
wg := newWatcherGroup()
for _, w := range tt.sync {
wg.add(w)
}
gwe := newWatcherBatch(&wg, tt.evs)
if len(gwe) != len(tt.wwe) {
t.Errorf("#%d: len(gwe) got = %d, want = %d", i, len(gwe), len(tt.wwe))
}
// compare gwe and tt.wwe
for w, eb := range gwe {
if len(eb.evs) != len(tt.wwe[w]) {
t.Errorf("#%d: len(eb.evs) got = %d, want = %d", i, len(eb.evs), len(tt.wwe[w]))
}
if !reflect.DeepEqual(eb.evs, tt.wwe[w]) {
t.Errorf("#%d: reflect.DeepEqual events got = %v, want = true", i, false)
}
}
}
}
// TestWatchVictims tests that watchable store delivers watch events
// when the watch channel is temporarily clogged with too many events.
func TestWatchVictims(t *testing.T) {
oldChanBufLen, oldMaxWatchersPerSync := chanBufLen, maxWatchersPerSync
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
chanBufLen, maxWatchersPerSync = oldChanBufLen, oldMaxWatchersPerSync
}()
chanBufLen, maxWatchersPerSync = 1, 2
numPuts := chanBufLen * 64
testKey, testValue := []byte("foo"), []byte("bar")
var wg sync.WaitGroup
numWatches := maxWatchersPerSync * 128
errc := make(chan error, numWatches)
wg.Add(numWatches)
for i := 0; i < numWatches; i++ {
go func() {
w := s.NewWatchStream()
w.Watch(0, testKey, nil, 1)
defer func() {
w.Close()
wg.Done()
}()
tc := time.After(10 * time.Second)
evs, nextRev := 0, int64(2)
for evs < numPuts {
select {
case <-tc:
errc <- fmt.Errorf("time out")
return
case wr := <-w.Chan():
evs += len(wr.Events)
for _, ev := range wr.Events {
if ev.Kv.ModRevision != nextRev {
errc <- fmt.Errorf("expected rev=%d, got %d", nextRev, ev.Kv.ModRevision)
return
}
nextRev++
}
time.Sleep(time.Millisecond)
}
}
if evs != numPuts {
errc <- fmt.Errorf("expected %d events, got %d", numPuts, evs)
return
}
select {
case <-w.Chan():
errc <- fmt.Errorf("unexpected response")
default:
}
}()
time.Sleep(time.Millisecond)
}
var wgPut sync.WaitGroup
wgPut.Add(numPuts)
for i := 0; i < numPuts; i++ {
go func() {
defer wgPut.Done()
s.Put(testKey, testValue, lease.NoLease)
}()
}
wgPut.Wait()
wg.Wait()
select {
case err := <-errc:
t.Fatal(err)
default:
}
}
// TestStressWatchCancelClose tests closing a watch stream while
// canceling its watches.
func TestStressWatchCancelClose(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey, testValue := []byte("foo"), []byte("bar")
var wg sync.WaitGroup
readyc := make(chan struct{})
wg.Add(100)
for i := 0; i < 100; i++ {
go func() {
defer wg.Done()
w := s.NewWatchStream()
ids := make([]WatchID, 10)
for i := range ids {
ids[i], _ = w.Watch(0, testKey, nil, 0)
}
<-readyc
wg.Add(1 + len(ids)/2)
for i := range ids[:len(ids)/2] {
go func(n int) {
defer wg.Done()
w.Cancel(ids[n])
}(i)
}
go func() {
defer wg.Done()
w.Close()
}()
}()
}
close(readyc)
for i := 0; i < 100; i++ {
s.Put(testKey, testValue, lease.NoLease)
}
wg.Wait()
}

56
server/mvcc/watchable_store_txn.go Normal file
View File

@@ -0,0 +1,56 @@
// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/traceutil"
)
func (tw *watchableStoreTxnWrite) End() {
changes := tw.Changes()
if len(changes) == 0 {
tw.TxnWrite.End()
return
}
rev := tw.Rev() + 1
evs := make([]mvccpb.Event, len(changes))
for i, change := range changes {
evs[i].Kv = &changes[i]
if change.CreateRevision == 0 {
evs[i].Type = mvccpb.DELETE
evs[i].Kv.ModRevision = rev
} else {
evs[i].Type = mvccpb.PUT
}
}
// end write txn under watchable store lock so the updates are visible
// when asynchronous event posting checks the current store revision
tw.s.mu.Lock()
tw.s.notify(rev, evs)
tw.TxnWrite.End()
tw.s.mu.Unlock()
}
type watchableStoreTxnWrite struct {
TxnWrite
s *watchableStore
}
func (s *watchableStore) Write(trace *traceutil.Trace) TxnWrite {
return &watchableStoreTxnWrite{s.store.Write(trace), s}
}
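// Editor's note — a usage sketch, not part of the original file: a write
// transaction on the watchable store posts watch events when it ends.
//
//	tw := s.Write(traceutil.TODO()) // s is a *watchableStore
//	tw.Put([]byte("foo"), []byte("bar"), lease.NoLease)
//	tw.End() // builds events from Changes() and calls s.notify under s.mu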

193
server/mvcc/watcher.go Normal file
View File

@@ -0,0 +1,193 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"errors"
"sync"
"go.etcd.io/etcd/api/v3/mvccpb"
)
// AutoWatchID is the watcher ID passed in WatchStream.Watch when no
// user-provided ID is available. If passed, an ID will automatically be assigned.
const AutoWatchID WatchID = 0
var (
ErrWatcherNotExist = errors.New("mvcc: watcher does not exist")
ErrEmptyWatcherRange = errors.New("mvcc: watcher range is empty")
ErrWatcherDuplicateID = errors.New("mvcc: duplicate watch ID provided on the WatchStream")
)
type WatchID int64
// FilterFunc returns true if the given event should be filtered out.
type FilterFunc func(e mvccpb.Event) bool
type WatchStream interface {
// Watch creates a watcher. The watcher watches events that happen or
// have happened on the given key or range [key, end) from the given startRev.
//
// The whole event history can be watched unless compacted.
// If "startRev" <=0, watch observes events after currentRev.
//
// The returned "id" is the ID of this watcher. It appears as WatchID
// in events that are sent to the created watcher through stream channel.
// The watch ID is used when it's not equal to AutoWatchID. Otherwise,
// an auto-generated watch ID is returned.
Watch(id WatchID, key, end []byte, startRev int64, fcs ...FilterFunc) (WatchID, error)
// Chan returns a chan. All watch responses will be sent to the returned chan.
Chan() <-chan WatchResponse
// RequestProgress requests the progress of the watcher with given ID. The response
// will only be sent if the watcher is currently synced.
// The response will be sent through the WatchResponse Chan attached
// to this stream to ensure correct ordering.
// The response contains no events. The revision in the response is the progress
// of the watcher, since the watcher is currently synced.
RequestProgress(id WatchID)
// Cancel cancels a watcher by giving its ID. If the watcher does not exist,
// an error will be returned.
Cancel(id WatchID) error
// Close closes Chan and releases all related resources.
Close()
// Rev returns the current revision of the KV the stream watches on.
Rev() int64
}
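// Editor's note — a usage sketch assuming an existing WatchableKV "s"; not
// part of the original file:
//
//	w := s.NewWatchStream()
//	defer w.Close()
//	id, err := w.Watch(AutoWatchID, []byte("foo"), nil, 0) // from current rev
//	if err != nil {
//		// handle ErrEmptyWatcherRange / ErrWatcherDuplicateID
//	}
//	for resp := range w.Chan() {
//		_ = resp.Events // events tagged with WatchID == id
//	}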
type WatchResponse struct {
// WatchID is the WatchID of the watcher this response sent to.
WatchID WatchID
// Events contains all the events that need to be sent.
Events []mvccpb.Event
// Revision is the revision of the KV when the watchResponse is created.
// For a normal response, the revision should be the same as the last
// modified revision inside Events. For a delayed response to an unsynced
// watcher, the revision is greater than the last modified revision
// inside Events.
Revision int64
// CompactRevision is set when the watcher is cancelled due to compaction.
CompactRevision int64
}
// watchStream contains a collection of watchers that share
// one streaming chan to send out watched events and other control events.
type watchStream struct {
watchable watchable
ch chan WatchResponse
mu sync.Mutex // guards fields below it
// nextID is the ID pre-allocated for next new watcher in this stream
nextID WatchID
closed bool
cancels map[WatchID]cancelFunc
watchers map[WatchID]*watcher
}
// Watch creates a new watcher in the stream and returns its WatchID.
func (ws *watchStream) Watch(id WatchID, key, end []byte, startRev int64, fcs ...FilterFunc) (WatchID, error) {
// prevent wrong range where key >= end lexicographically
// watch request with 'WithFromKey' has empty-byte range end
if len(end) != 0 && bytes.Compare(key, end) != -1 {
return -1, ErrEmptyWatcherRange
}
ws.mu.Lock()
defer ws.mu.Unlock()
if ws.closed {
return -1, ErrEmptyWatcherRange
}
if id == AutoWatchID {
for ws.watchers[ws.nextID] != nil {
ws.nextID++
}
id = ws.nextID
ws.nextID++
} else if _, ok := ws.watchers[id]; ok {
return -1, ErrWatcherDuplicateID
}
w, c := ws.watchable.watch(key, end, startRev, id, ws.ch, fcs...)
ws.cancels[id] = c
ws.watchers[id] = w
return id, nil
}
func (ws *watchStream) Chan() <-chan WatchResponse {
return ws.ch
}
func (ws *watchStream) Cancel(id WatchID) error {
ws.mu.Lock()
cancel, ok := ws.cancels[id]
w := ws.watchers[id]
ok = ok && !ws.closed
ws.mu.Unlock()
if !ok {
return ErrWatcherNotExist
}
cancel()
ws.mu.Lock()
// The watch isn't removed until cancel so that if Close() is called,
// it will wait for the cancel. Otherwise, Close() could close the
// watch channel while the store is still posting events.
if ww := ws.watchers[id]; ww == w {
delete(ws.cancels, id)
delete(ws.watchers, id)
}
ws.mu.Unlock()
return nil
}
func (ws *watchStream) Close() {
ws.mu.Lock()
defer ws.mu.Unlock()
for _, cancel := range ws.cancels {
cancel()
}
ws.closed = true
close(ws.ch)
watchStreamGauge.Dec()
}
func (ws *watchStream) Rev() int64 {
ws.mu.Lock()
defer ws.mu.Unlock()
return ws.watchable.rev()
}
func (ws *watchStream) RequestProgress(id WatchID) {
ws.mu.Lock()
w, ok := ws.watchers[id]
ws.mu.Unlock()
if !ok {
return
}
ws.watchable.progress(w)
}

40
server/mvcc/watcher_bench_test.go Normal file
View File

@@ -0,0 +1,40 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"fmt"
"testing"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
func BenchmarkKVWatcherMemoryUsage(b *testing.B) {
be, tmpPath := backend.NewDefaultTmpBackend()
watchable := newWatchableStore(zap.NewExample(), be, &lease.FakeLessor{}, nil, StoreConfig{})
defer cleanup(watchable, be, tmpPath)
w := watchable.NewWatchStream()
b.ReportAllocs()
b.StartTimer()
for i := 0; i < b.N; i++ {
w.Watch(0, []byte(fmt.Sprint("foo", i)), nil, 0)
}
}

293
server/mvcc/watcher_group.go Normal file
View File

@@ -0,0 +1,293 @@
// Copyright 2016 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"fmt"
"math"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/pkg/v3/adt"
)
var (
// watchBatchMaxRevs is the maximum distinct revisions that
// may be sent to an unsynced watcher at a time. Declared as
// var instead of const for testing purposes.
watchBatchMaxRevs = 1000
)
type eventBatch struct {
// evs is a batch of revision-ordered events
evs []mvccpb.Event
// revs is the number of unique revisions observed in this batch
revs int
// moreRev is first revision with more events following this batch
moreRev int64
}
func (eb *eventBatch) add(ev mvccpb.Event) {
if eb.revs > watchBatchMaxRevs {
// maxed out batch size
return
}
if len(eb.evs) == 0 {
// base case
eb.revs = 1
eb.evs = append(eb.evs, ev)
return
}
// revision accounting
ebRev := eb.evs[len(eb.evs)-1].Kv.ModRevision
evRev := ev.Kv.ModRevision
if evRev > ebRev {
eb.revs++
if eb.revs > watchBatchMaxRevs {
eb.moreRev = evRev
return
}
}
eb.evs = append(eb.evs, ev)
}
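// Editor's worked example with hypothetical revisions, not part of the
// original file: with watchBatchMaxRevs = 2, adding events at ModRevisions
// 1, 1, 2, 3 leaves the batch as
//
//	eb.evs     // the three events at revs 1, 1, 2 (two unique revisions)
//	eb.revs    == 3 // incremented past the limit by the rev-3 event
//	eb.moreRev == 3 // the next batch for this watcher resumes at rev 3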
type watcherBatch map[*watcher]*eventBatch
func (wb watcherBatch) add(w *watcher, ev mvccpb.Event) {
eb := wb[w]
if eb == nil {
eb = &eventBatch{}
wb[w] = eb
}
eb.add(ev)
}
// newWatcherBatch maps watchers to their matched events. It enables quick
// event lookup by watcher.
func newWatcherBatch(wg *watcherGroup, evs []mvccpb.Event) watcherBatch {
if len(wg.watchers) == 0 {
return nil
}
wb := make(watcherBatch)
for _, ev := range evs {
for w := range wg.watcherSetByKey(string(ev.Kv.Key)) {
if ev.Kv.ModRevision >= w.minRev {
// don't double notify
wb.add(w, ev)
}
}
}
return wb
}
type watcherSet map[*watcher]struct{}
func (w watcherSet) add(wa *watcher) {
if _, ok := w[wa]; ok {
panic("add watcher twice!")
}
w[wa] = struct{}{}
}
func (w watcherSet) union(ws watcherSet) {
for wa := range ws {
w.add(wa)
}
}
func (w watcherSet) delete(wa *watcher) {
if _, ok := w[wa]; !ok {
panic("removing missing watcher!")
}
delete(w, wa)
}
type watcherSetByKey map[string]watcherSet
func (w watcherSetByKey) add(wa *watcher) {
set := w[string(wa.key)]
if set == nil {
set = make(watcherSet)
w[string(wa.key)] = set
}
set.add(wa)
}
func (w watcherSetByKey) delete(wa *watcher) bool {
k := string(wa.key)
if v, ok := w[k]; ok {
if _, ok := v[wa]; ok {
delete(v, wa)
if len(v) == 0 {
// remove the set; nothing left
delete(w, k)
}
return true
}
}
return false
}
// watcherGroup is a collection of watchers organized by their ranges
type watcherGroup struct {
// keyWatchers has the watchers that watch on a single key
keyWatchers watcherSetByKey
// ranges has the watchers that watch a range; it is sorted by interval
ranges adt.IntervalTree
// watchers is the set of all watchers
watchers watcherSet
}
func newWatcherGroup() watcherGroup {
return watcherGroup{
keyWatchers: make(watcherSetByKey),
ranges: adt.NewIntervalTree(),
watchers: make(watcherSet),
}
}
// add puts a watcher in the group.
func (wg *watcherGroup) add(wa *watcher) {
wg.watchers.add(wa)
if wa.end == nil {
wg.keyWatchers.add(wa)
return
}
// interval already registered?
ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
if iv := wg.ranges.Find(ivl); iv != nil {
iv.Val.(watcherSet).add(wa)
return
}
// not registered, put in interval tree
ws := make(watcherSet)
ws.add(wa)
wg.ranges.Insert(ivl, ws)
}
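// Editor's note — an illustrative sketch, not part of the original file: a
// hypothetical range watcher on ["foo", "fop") is registered under an affine
// interval, and watchers sharing the same interval share one watcherSet:
//
//	ivl := adt.NewStringAffineInterval("foo", "fop")
//	// wg.ranges.Find(ivl) then yields the shared watcherSet for that range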
// contains is whether the given key has a watcher in the group.
func (wg *watcherGroup) contains(key string) bool {
_, ok := wg.keyWatchers[key]
return ok || wg.ranges.Intersects(adt.NewStringAffinePoint(key))
}
// size gives the number of unique watchers in the group.
func (wg *watcherGroup) size() int { return len(wg.watchers) }
// delete removes a watcher from the group.
func (wg *watcherGroup) delete(wa *watcher) bool {
if _, ok := wg.watchers[wa]; !ok {
return false
}
wg.watchers.delete(wa)
if wa.end == nil {
wg.keyWatchers.delete(wa)
return true
}
ivl := adt.NewStringAffineInterval(string(wa.key), string(wa.end))
iv := wg.ranges.Find(ivl)
if iv == nil {
return false
}
ws := iv.Val.(watcherSet)
delete(ws, wa)
if len(ws) == 0 {
// remove the interval; no watchers remain
if ok := wg.ranges.Delete(ivl); !ok {
panic("could not remove watcher from interval tree")
}
}
return true
}
// choose selects watchers from the watcher group to update
func (wg *watcherGroup) choose(maxWatchers int, curRev, compactRev int64) (*watcherGroup, int64) {
if len(wg.watchers) < maxWatchers {
return wg, wg.chooseAll(curRev, compactRev)
}
ret := newWatcherGroup()
for w := range wg.watchers {
if maxWatchers <= 0 {
break
}
maxWatchers--
ret.add(w)
}
return &ret, ret.chooseAll(curRev, compactRev)
}
func (wg *watcherGroup) chooseAll(curRev, compactRev int64) int64 {
minRev := int64(math.MaxInt64)
for w := range wg.watchers {
if w.minRev > curRev {
// after network partition, possibly choosing future revision watcher from restore operation
// with watch key "proxy-namespace__lostleader" and revision "math.MaxInt64 - 2"
// do not panic when such watcher had been moved from "synced" watcher during restore operation
if !w.restore {
panic(fmt.Errorf("watcher minimum revision %d should not exceed current revision %d", w.minRev, curRev))
}
// mark 'restore' done, since it's chosen
w.restore = false
}
if w.minRev < compactRev {
select {
case w.ch <- WatchResponse{WatchID: w.id, CompactRevision: compactRev}:
w.compacted = true
wg.delete(w)
default:
// retry next time
}
continue
}
if minRev > w.minRev {
minRev = w.minRev
}
}
return minRev
}
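// Editor's note: a watcher whose minRev is older than compactRev is canceled
// by chooseAll with a compaction response instead of events, i.e.
//
//	WatchResponse{WatchID: w.id, CompactRevision: compactRev} // no Events
//
// and the client must re-create the watch from a live revision.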
// watcherSetByKey gets the set of watchers that receive events on the given key.
func (wg *watcherGroup) watcherSetByKey(key string) watcherSet {
wkeys := wg.keyWatchers[key]
wranges := wg.ranges.Stab(adt.NewStringAffinePoint(key))
// zero-copy cases
switch {
case len(wranges) == 0 && len(wkeys) == 0:
return nil
case len(wranges) == 0:
// no need to merge ranges or copy; reuse single-key set
return wkeys
case len(wranges) == 1 && len(wkeys) == 0:
return wranges[0].Val.(watcherSet)
}
// copy case
ret := make(watcherSet)
ret.union(wg.keyWatchers[key])
for _, item := range wranges {
ret.union(item.Val.(watcherSet))
}
return ret
}

380
server/mvcc/watcher_test.go Normal file
View File

@@ -0,0 +1,380 @@
// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mvcc
import (
"bytes"
"fmt"
"os"
"reflect"
"testing"
"time"
"go.etcd.io/etcd/api/v3/mvccpb"
"go.etcd.io/etcd/v3/lease"
"go.etcd.io/etcd/v3/mvcc/backend"
"go.uber.org/zap"
)
// TestWatcherWatchID tests that each watcher provides unique watchID,
// and the watched event attaches the correct watchID.
func TestWatcherWatchID(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
idm := make(map[WatchID]struct{})
for i := 0; i < 10; i++ {
id, _ := w.Watch(0, []byte("foo"), nil, 0)
if _, ok := idm[id]; ok {
t.Errorf("#%d: id %d exists", i, id)
}
idm[id] = struct{}{}
s.Put([]byte("foo"), []byte("bar"), lease.NoLease)
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Error(err)
}
}
s.Put([]byte("foo2"), []byte("bar"), lease.NoLease)
// unsynced watchers
for i := 10; i < 20; i++ {
id, _ := w.Watch(0, []byte("foo2"), nil, 1)
if _, ok := idm[id]; ok {
t.Errorf("#%d: id %d exists", i, id)
}
idm[id] = struct{}{}
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Error(err)
}
}
}
func TestWatcherRequestsCustomID(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
// - Request specifically ID #1
// - Try to duplicate it, get an error
// - Make sure the auto-assignment skips over things we manually assigned
tt := []struct {
givenID WatchID
expectedID WatchID
expectedErr error
}{
{1, 1, nil},
{1, 0, ErrWatcherDuplicateID},
{0, 0, nil},
{0, 2, nil},
}
for i, tcase := range tt {
id, err := w.Watch(tcase.givenID, []byte("foo"), nil, 0)
if tcase.expectedErr != nil || err != nil {
if err != tcase.expectedErr {
t.Errorf("expected get error %q in test case %q, got %q", tcase.expectedErr, i, err)
}
} else if tcase.expectedID != id {
t.Errorf("expected to create ID %d, got %d in test case %d", tcase.expectedID, id, i)
}
}
}
// TestWatcherWatchPrefix tests if Watch operation correctly watches
// and returns events with matching prefixes.
func TestWatcherWatchPrefix(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
idm := make(map[WatchID]struct{})
val := []byte("bar")
keyWatch, keyEnd, keyPut := []byte("foo"), []byte("fop"), []byte("foobar")
for i := 0; i < 10; i++ {
id, _ := w.Watch(0, keyWatch, keyEnd, 0)
if _, ok := idm[id]; ok {
t.Errorf("#%d: unexpected duplicated id %x", i, id)
}
idm[id] = struct{}{}
s.Put(keyPut, val, lease.NoLease)
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Errorf("#%d: unexpected cancel error %v", i, err)
}
if len(resp.Events) != 1 {
t.Errorf("#%d: len(resp.Events) got = %d, want = 1", i, len(resp.Events))
}
if len(resp.Events) == 1 {
if !bytes.Equal(resp.Events[0].Kv.Key, keyPut) {
t.Errorf("#%d: resp.Events got = %s, want = %s", i, resp.Events[0].Kv.Key, keyPut)
}
}
}
keyWatch1, keyEnd1, keyPut1 := []byte("foo1"), []byte("foo2"), []byte("foo1bar")
s.Put(keyPut1, val, lease.NoLease)
// unsynced watchers
for i := 10; i < 15; i++ {
id, _ := w.Watch(0, keyWatch1, keyEnd1, 1)
if _, ok := idm[id]; ok {
t.Errorf("#%d: id %d exists", i, id)
}
idm[id] = struct{}{}
resp := <-w.Chan()
if resp.WatchID != id {
t.Errorf("#%d: watch id in event = %d, want %d", i, resp.WatchID, id)
}
if err := w.Cancel(id); err != nil {
t.Error(err)
}
if len(resp.Events) != 1 {
t.Errorf("#%d: len(resp.Events) got = %d, want = 1", i, len(resp.Events))
}
if len(resp.Events) == 1 {
if !bytes.Equal(resp.Events[0].Kv.Key, keyPut1) {
t.Errorf("#%d: resp.Events got = %s, want = %s", i, resp.Events[0].Kv.Key, keyPut1)
}
}
}
}
// TestWatcherWatchWrongRange ensures that a watcher with a wrong 'end' range
// is not created, since it would panic when canceled in the range tree.
func TestWatcherWatchWrongRange(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
if _, err := w.Watch(0, []byte("foa"), []byte("foa"), 1); err != ErrEmptyWatcherRange {
t.Fatalf("key == end range given; expected ErrEmptyWatcherRange, got %+v", err)
}
if _, err := w.Watch(0, []byte("fob"), []byte("foa"), 1); err != ErrEmptyWatcherRange {
t.Fatalf("key > end range given; expected ErrEmptyWatcherRange, got %+v", err)
}
// watch request with 'WithFromKey' has empty-byte range end
if id, _ := w.Watch(0, []byte("foo"), []byte{}, 1); id != 0 {
t.Fatalf("\x00 is range given; id expected 0, got %d", id)
}
}
func TestWatchDeleteRange(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{})
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKeyPrefix := []byte("foo")
for i := 0; i < 3; i++ {
s.Put([]byte(fmt.Sprintf("%s_%d", testKeyPrefix, i)), []byte("bar"), lease.NoLease)
}
w := s.NewWatchStream()
from, to := testKeyPrefix, []byte(fmt.Sprintf("%s_%d", testKeyPrefix, 99))
w.Watch(0, from, to, 0)
s.DeleteRange(from, to)
we := []mvccpb.Event{
{Type: mvccpb.DELETE, Kv: &mvccpb.KeyValue{Key: []byte("foo_0"), ModRevision: 5}},
{Type: mvccpb.DELETE, Kv: &mvccpb.KeyValue{Key: []byte("foo_1"), ModRevision: 5}},
{Type: mvccpb.DELETE, Kv: &mvccpb.KeyValue{Key: []byte("foo_2"), ModRevision: 5}},
}
select {
case r := <-w.Chan():
if !reflect.DeepEqual(r.Events, we) {
t.Errorf("event = %v, want %v", r.Events, we)
}
case <-time.After(10 * time.Second):
t.Fatal("failed to receive event after 10 seconds!")
}
}
// TestWatchStreamCancelWatcherByID ensures cancel calls the cancel func of the watcher
// with given id inside watchStream.
func TestWatchStreamCancelWatcherByID(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
id, _ := w.Watch(0, []byte("foo"), nil, 0)
tests := []struct {
cancelID WatchID
werr error
}{
// no error should be returned when canceling the created watcher.
{id, nil},
// a not-exist error should be returned when canceling again.
{id, ErrWatcherNotExist},
// a not-exist error should be returned when canceling a bad id.
{id + 1, ErrWatcherNotExist},
}
for i, tt := range tests {
gerr := w.Cancel(tt.cancelID)
if gerr != tt.werr {
t.Errorf("#%d: err = %v, want %v", i, gerr, tt.werr)
}
}
if l := len(w.(*watchStream).cancels); l != 0 {
t.Errorf("cancels = %d, want 0", l)
}
}
// TestWatcherRequestProgress ensures a synced watcher can correctly
// report its progress.
func TestWatcherRequestProgress(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
// manually create watchableStore instead of newWatchableStore
// because newWatchableStore automatically calls syncWatchers
// method to sync watchers in unsynced map. We want to control
// when watchers are synced in this test.
s := &watchableStore{
store: NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}),
unsynced: newWatcherGroup(),
synced: newWatcherGroup(),
}
defer func() {
s.store.Close()
os.Remove(tmpPath)
}()
testKey := []byte("foo")
notTestKey := []byte("bad")
testValue := []byte("bar")
s.Put(testKey, testValue, lease.NoLease)
w := s.NewWatchStream()
badID := WatchID(1000)
w.RequestProgress(badID)
select {
case resp := <-w.Chan():
t.Fatalf("unexpected %+v", resp)
default:
}
id, _ := w.Watch(0, notTestKey, nil, 1)
w.RequestProgress(id)
select {
case resp := <-w.Chan():
t.Fatalf("unexpected %+v", resp)
default:
}
s.syncWatchers()
w.RequestProgress(id)
wrs := WatchResponse{WatchID: id, Revision: 2}
select {
case resp := <-w.Chan():
if !reflect.DeepEqual(resp, wrs) {
t.Fatalf("got %+v, expect %+v", resp, wrs)
}
case <-time.After(time.Second):
t.Fatal("failed to receive progress")
}
}
func TestWatcherWatchWithFilter(t *testing.T) {
b, tmpPath := backend.NewDefaultTmpBackend()
s := WatchableKV(newWatchableStore(zap.NewExample(), b, &lease.FakeLessor{}, nil, StoreConfig{}))
defer cleanup(s, b, tmpPath)
w := s.NewWatchStream()
defer w.Close()
filterPut := func(e mvccpb.Event) bool {
return e.Type == mvccpb.PUT
}
w.Watch(0, []byte("foo"), nil, 0, filterPut)
done := make(chan struct{}, 1)
go func() {
<-w.Chan()
done <- struct{}{}
}()
s.Put([]byte("foo"), []byte("bar"), 0)
select {
case <-done:
t.Fatal("failed to filter put request")
case <-time.After(100 * time.Millisecond):
}
s.DeleteRange([]byte("foo"), nil)
select {
case <-done:
case <-time.After(100 * time.Millisecond):
t.Fatal("failed to receive delete request")
}
}