Mirror of https://github.com/etcd-io/etcd.git, synced 2024-09-27 06:25:44 +00:00

This is for https://github.com/coreos/etcd/issues/3848. It replaces the RangeHistory method so that events can be sent out more efficiently.
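For context, here is a rough in-package sketch of how the store in this file is driven. It is illustrative only: the backend path is made up, and the Watcher interface's own methods are defined elsewhere in the package, so only calls defined in this file appear. Writes go through Put, DeleteRange, or a Txn; handle/notify fan the resulting events out to synced watchings; syncWatchingsLoop catches up any watching that has fallen behind.

// Illustrative sketch only; this would sit in a test file of package storage,
// and "watchable.db" is a hypothetical backend path.
func exampleWatchableStoreUsage() {
	s := newWatchableStore("watchable.db")
	defer s.Close()

	// NewWatcher returns a Watcher backed by a buffered event channel of
	// length chanBufLen; its methods are defined elsewhere in the package.
	w := s.NewWatcher()
	_ = w

	// Each write publishes an event through handle/notify to synced watchings
	// on the exact key and to prefix watchings on any prefix of it.
	s.Put([]byte("foo"), []byte("bar"))
	s.DeleteRange([]byte("foo"), nil)
}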
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"fmt"
	"log"
	"math"
	"sync"
	"time"

	"github.com/coreos/etcd/storage/storagepb"
)

const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)

type watchable interface {
	watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc)
}

type watchableStore struct {
	mu sync.Mutex

	*store

	// contains all unsynced watchings that need to sync with events that have already happened
	unsynced map[*watching]struct{}

	// contains all synced watchings that are tracking the events that will happen.
	// The map key is the key that the watching watches on.
	synced map[string]map[*watching]struct{}
	tx     *ongoingTx

	stopc chan struct{}
	wg    sync.WaitGroup
}

func newWatchableStore(path string) *watchableStore {
	s := &watchableStore{
		store:    newStore(path),
		unsynced: make(map[*watching]struct{}),
		synced:   make(map[string]map[*watching]struct{}),
		stopc:    make(chan struct{}),
	}
	s.wg.Add(1)
	go s.syncWatchingsLoop()
	return s
}

func (s *watchableStore) Put(key, value []byte) (rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rev = s.store.Put(key, value)
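	// The event carries the full KeyValue as stored, so read it back right
	// after the write.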
	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, nil, 0, rev)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	s.handle(rev, storagepb.Event{
		Type: storagepb.PUT,
		Kv:   &kvs[0],
	})
	return rev
}

func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	n, rev = s.store.DeleteRange(key, end)
	for _, kv := range kvs {
		s.handle(rev, storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: kv.Key,
			},
		})
	}
	return n, rev
}

func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	s.tx = newOngoingTx()
	return s.store.TxnBegin()
}

func (s *watchableStore) TxnPut(txnID int64, key, value []byte) (rev int64, err error) {
	rev, err = s.store.TxnPut(txnID, key, value)
	if err == nil {
		s.tx.put(string(key))
	}
	return rev, err
}

func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
	kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
	if err == nil {
		for _, kv := range kvs {
			s.tx.del(string(kv.Key))
		}
	}
	return n, rev, err
}

func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		return err
	}

	_, rev, _ := s.store.Range(nil, nil, 0, 0)
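	// Publish one event per key touched by the txn, all at the store's current
	// revision: PUT events for keys in putm, DELETE events for keys in delm.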
	for k := range s.tx.putm {
		kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
		if err != nil {
			log.Panicf("unexpected range error (%v)", err)
		}
		s.handle(rev, storagepb.Event{
			Type: storagepb.PUT,
			Kv:   &kvs[0],
		})
	}
	for k := range s.tx.delm {
		s.handle(rev, storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: []byte(k),
			},
		})
	}
	s.mu.Unlock()
	return nil
}

func (s *watchableStore) Close() error {
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}

func (s *watchableStore) NewWatcher() Watcher {
	watcherGauge.Inc()
	return &watcher{
		watchable: s,
		ch:        make(chan storagepb.Event, chanBufLen),
	}
}

func (s *watchableStore) watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watching{
		key:    key,
		prefix: prefix,
		cur:    startRev,
		id:     id,
		ch:     ch,
	}

	k := string(key)
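	// A watching starting at revision 0 only wants future events, so it can be
	// added to the synced map right away; a watching with a non-zero start
	// revision has history to catch up on, so it goes to the unsynced set and
	// is picked up by syncWatchingsLoop.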
	if startRev == 0 {
		if err := unsafeAddWatching(&s.synced, k, wa); err != nil {
			log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
		}
	} else {
		slowWatchingGauge.Inc()
		s.unsynced[wa] = struct{}{}
	}
	watchingGauge.Inc()

	cancel := CancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()
		// remove the global references to this watching
		if _, ok := s.unsynced[wa]; ok {
			delete(s.unsynced, wa)
			slowWatchingGauge.Dec()
			watchingGauge.Dec()
			return
		}

		if v, ok := s.synced[k]; ok {
			if _, ok := v[wa]; ok {
				delete(v, wa)
				// if there is nothing left in s.synced[k],
				// remove the key from the synced map
				if len(v) == 0 {
					delete(s.synced, k)
				}
				watchingGauge.Dec()
			}
		}
		// If we cannot find the watching, it has already finished watching.
	})

	return wa, cancel
}

// syncWatchingsLoop syncs the watchings in the unsynced map every 100ms.
func (s *watchableStore) syncWatchingsLoop() {
	defer s.wg.Done()

	for {
		s.mu.Lock()
		s.syncWatchings()
		s.mu.Unlock()

		select {
		case <-time.After(100 * time.Millisecond):
		case <-s.stopc:
			return
		}
	}
}

// syncWatchings syncs unsynced watchings by:
// 1. iterating over all unsynced watchings to find the minimum revision they
//    still need, skipping (and removing) any watching whose current revision
//    is behind the compacted revision of the store;
// 2. using this minimum revision to range over the backend for key-value pairs;
// 3. sending the resulting events to the matching watchings.
func (s *watchableStore) syncWatchings() {
	s.store.mu.Lock()
	defer s.store.mu.Unlock()

	if len(s.unsynced) == 0 {
		return
	}

	// to find the key-value pairs that unsynced watchings still need, we need
	// the minimum revision among them; that revision is then used to query the
	// backend store for key-value pairs
	minRev := int64(math.MaxInt64)

	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev

	// TODO: change the unsynced map to use the same structure as this one
	keyToUnsynced := make(map[string]map[*watching]struct{})

	for w := range s.unsynced {
		k := string(w.key)

		if w.cur > curRev {
			panic("watching current revision should not exceed current revision")
		}

		if w.cur < compactionRev {
			// TODO: return a compacted error to the watching instead of
			// just removing it silently from unsynced.
			delete(s.unsynced, w)
			continue
		}

		if minRev >= w.cur {
			minRev = w.cur
		}

		if _, ok := keyToUnsynced[k]; !ok {
			keyToUnsynced[k] = make(map[*watching]struct{})
		}
		keyToUnsynced[k][w] = struct{}{}
	}

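	// Scan the backend once over the revision range [minRev, curRev+1) so that
	// a single range read covers every revision any unsynced watching still
	// needs.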
	minBytes, maxBytes := newRevBytes(), newRevBytes()
	revToBytes(revision{main: minRev}, minBytes)
	revToBytes(revision{main: curRev + 1}, maxBytes)

	// UnsafeRange returns keys and values. In boltdb, the keys are revisions
	// and the values are the actual key-value pairs stored in the backend.
	tx := s.store.b.BatchTx()
	tx.Lock()
	ks, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
	tx.Unlock()

	for i, v := range vs {
		var kv storagepb.KeyValue
		if err := kv.Unmarshal(v); err != nil {
			log.Panicf("storage: cannot unmarshal event: %v", err)
		}

		k := string(kv.Key)
		wm, ok := keyToUnsynced[k]
		if !ok {
			continue
		}

		var ev storagepb.Event
		switch {
		case isTombstone(ks[i]):
			ev.Type = storagepb.DELETE
		default:
			ev.Type = storagepb.PUT
		}
		ev.Kv = &kv

		for w := range wm {
			ev.WatchID = w.id

			select {
			case w.ch <- ev:
				pendingEventsGauge.Inc()
			default:
				// TODO: handle watchings whose channels are full.
				// Continue to process the other watchings for now; the full
				// ones will be retried on the next sync and hopefully will
				// no longer be full.
				continue
			}
			if err := unsafeAddWatching(&s.synced, k, w); err != nil {
				log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
			}
			delete(s.unsynced, w)
		}
	}

	slowWatchingGauge.Set(float64(len(s.unsynced)))
}

// handle dispatches the event that happened at the given rev to all watchings
// interested in it.
func (s *watchableStore) handle(rev int64, ev storagepb.Event) {
	s.notify(rev, ev)
}

// notify sends the given event, which happened at the given rev, to the
// watchings that watch on the key of the event.
func (s *watchableStore) notify(rev int64, ev storagepb.Event) {
	// check all prefixes of the key to notify all corresponding watchings
	for i := 0; i <= len(ev.Kv.Key); i++ {
		k := string(ev.Kv.Key[:i])
		if wm, ok := s.synced[k]; ok {
			for w := range wm {
				// the watching needs to be notified only if it is a prefix
				// watching or the key matches exactly.
				if !w.prefix && i != len(ev.Kv.Key) {
					continue
				}
				ev.WatchID = w.id
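				// Try a non-blocking send first. If the watcher's buffered
				// channel is full, mark the watching unsynced at this rev and
				// let syncWatchingsLoop deliver the missed events later.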
				select {
				case w.ch <- ev:
					pendingEventsGauge.Inc()
				default:
					w.cur = rev
					s.unsynced[w] = struct{}{}
					delete(wm, w)
					slowWatchingGauge.Inc()
				}
			}
		}
	}
}

type ongoingTx struct {
	// keys put/deleted in the ongoing txn
	putm map[string]struct{}
	delm map[string]struct{}
}

func newOngoingTx() *ongoingTx {
	return &ongoingTx{
		putm: make(map[string]struct{}),
		delm: make(map[string]struct{}),
	}
}

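// put and del record the net effect of the ongoing txn on each key: a put
// cancels a pending delete of the same key and vice versa, so at TxnEnd every
// touched key yields exactly one event of the winning type.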
func (tx *ongoingTx) put(k string) {
	tx.putm[k] = struct{}{}
	if _, ok := tx.delm[k]; ok {
		delete(tx.delm, k)
	}
}

func (tx *ongoingTx) del(k string) {
	tx.delm[k] = struct{}{}
	if _, ok := tx.putm[k]; ok {
		delete(tx.putm, k)
	}
}

type watching struct {
	// the watched key
	key []byte
	// prefix indicates if the watching is on a key or a prefix.
	// If prefix is true, the watching is on a prefix.
	prefix bool
	// cur is the current watching revision.
	// If cur is behind the current revision of the KV,
	// the watching is unsynced and needs to catch up.
	cur int64
	id  int64

	// a chan to send out the watched events.
	// The chan might be shared with other watchings.
	ch chan<- storagepb.Event
}

// unsafeAddWatching puts the watching with key k into the watchableStore's
// synced map. The caller must hold the store's mutex to keep this thread-safe.
func unsafeAddWatching(synced *map[string]map[*watching]struct{}, k string, wa *watching) error {
	if wa == nil {
		return fmt.Errorf("nil watching received")
	}
	mp := *synced
	if v, ok := mp[k]; ok {
		if _, ok := v[wa]; ok {
			return fmt.Errorf("put the same watch twice: %+v", wa)
		}
		v[wa] = struct{}{}
		return nil
	}

	mp[k] = make(map[*watching]struct{})
	mp[k][wa] = struct{}{}
	return nil
}
|