etcd/storage/watchable_store.go

Gyu-Ho Lee 78b0b8a4a0 storage: range all unsynced at once
This is for https://github.com/coreos/etcd/issues/3848.
It replaces the RangeHistory method for more efficient event sending.
2015-12-28 11:37:26 -08:00

// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"fmt"
	"log"
	"math"
	"sync"
	"time"

	"github.com/coreos/etcd/storage/storagepb"
)

const (
	// chanBufLen is the length of the buffered chan
	// for sending out watched events.
	// TODO: find a good buf value. 1024 is just a random one that
	// seems to be reasonable.
	chanBufLen = 1024
)

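// watchable is the interface that wraps the watch method. watch registers
// interest in a key (or prefix) starting from startRev and delivers matching
// events on ch.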
type watchable interface {
	watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc)
}

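// watchableStore wraps a store so that updates to keys can be watched. It
// keeps slow watchings in unsynced until they catch up, and up-to-date ones
// in synced.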
type watchableStore struct {
	mu sync.Mutex

	*store

	// contains all unsynced watchings that need to sync events that have already happened
	unsynced map[*watching]struct{}

	// contains all synced watchings that are tracking the events that will happen
	// The key of the map is the key that the watching is watching on.
	synced map[string]map[*watching]struct{}
	tx     *ongoingTx

	stopc chan struct{}
	wg    sync.WaitGroup
}

func newWatchableStore(path string) *watchableStore {
	s := &watchableStore{
		store:    newStore(path),
		unsynced: make(map[*watching]struct{}),
		synced:   make(map[string]map[*watching]struct{}),
		stopc:    make(chan struct{}),
	}
	s.wg.Add(1)
	go s.syncWatchingsLoop()
	return s
}

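// Put stores the given key-value pair and notifies all watchings that watch
// on the key (or on a prefix of it) of the resulting PUT event.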
func (s *watchableStore) Put(key, value []byte) (rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	rev = s.store.Put(key, value)
	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, nil, 0, rev)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	s.handle(rev, storagepb.Event{
		Type: storagepb.PUT,
		Kv:   &kvs[0],
	})
	return rev
}

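// DeleteRange deletes the keys in the given range and notifies watchings with
// one DELETE event per deleted key.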
func (s *watchableStore) DeleteRange(key, end []byte) (n, rev int64) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// TODO: avoid this range
	kvs, _, err := s.store.Range(key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}
	n, rev = s.store.DeleteRange(key, end)
	for _, kv := range kvs {
		s.handle(rev, storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: kv.Key,
			},
		})
	}
	return n, rev
}

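// TxnBegin starts a txn and records the keys it modifies in an ongoingTx.
// Note that s.mu stays locked for the duration of the txn: it is acquired
// here and only released by TxnEnd, so no other writes or watch
// registrations can interleave with the txn.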
func (s *watchableStore) TxnBegin() int64 {
	s.mu.Lock()
	s.tx = newOngoingTx()
	return s.store.TxnBegin()
}

func (s *watchableStore) TxnPut(txnID int64, key, value []byte) (rev int64, err error) {
	rev, err = s.store.TxnPut(txnID, key, value)
	if err == nil {
		s.tx.put(string(key))
	}
	return rev, err
}

func (s *watchableStore) TxnDeleteRange(txnID int64, key, end []byte) (n, rev int64, err error) {
	kvs, _, err := s.store.TxnRange(txnID, key, end, 0, 0)
	if err != nil {
		log.Panicf("unexpected range error (%v)", err)
	}

	n, rev, err = s.store.TxnDeleteRange(txnID, key, end)
	if err == nil {
		for _, kv := range kvs {
			s.tx.del(string(kv.Key))
		}
	}
	return n, rev, err
}

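// TxnEnd commits the txn, then replays the puts and deletes recorded in the
// ongoingTx as events at the txn's resulting revision, and releases s.mu.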
func (s *watchableStore) TxnEnd(txnID int64) error {
	err := s.store.TxnEnd(txnID)
	if err != nil {
		return err
	}

	_, rev, _ := s.store.Range(nil, nil, 0, 0)
	for k := range s.tx.putm {
		kvs, _, err := s.store.Range([]byte(k), nil, 0, 0)
		if err != nil {
			log.Panicf("unexpected range error (%v)", err)
		}
		s.handle(rev, storagepb.Event{
			Type: storagepb.PUT,
			Kv:   &kvs[0],
		})
	}
	for k := range s.tx.delm {
		s.handle(rev, storagepb.Event{
			Type: storagepb.DELETE,
			Kv: &storagepb.KeyValue{
				Key: []byte(k),
			},
		})
	}
	s.mu.Unlock()
	return nil
}

func (s *watchableStore) Close() error {
	close(s.stopc)
	s.wg.Wait()
	return s.store.Close()
}

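// NewWatcher returns a Watcher that receives watched events over its own
// buffered channel.
//
// A minimal usage sketch (hypothetical; the Watcher interface is defined
// elsewhere in this package and its exact methods may differ):
//
//	s := newWatchableStore("test")
//	w := s.NewWatcher()
//	// register a watch on a key via the Watcher, consume events from
//	// its channel, and cancel the watch when done.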
func (s *watchableStore) NewWatcher() Watcher {
	watcherGauge.Inc()
	return &watcher{
		watchable: s,
		ch:        make(chan storagepb.Event, chanBufLen),
	}
}

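// watch creates a watching on the given key (or prefix). A watching starting
// from revision 0 goes straight into synced; one starting from an older
// revision goes into unsynced so that syncWatchings can catch it up. The
// returned CancelFunc removes the watching from whichever map holds it.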
func (s *watchableStore) watch(key []byte, prefix bool, startRev, id int64, ch chan<- storagepb.Event) (*watching, CancelFunc) {
	s.mu.Lock()
	defer s.mu.Unlock()

	wa := &watching{
		key:    key,
		prefix: prefix,
		cur:    startRev,
		id:     id,
		ch:     ch,
	}

	k := string(key)
	if startRev == 0 {
		if err := unsafeAddWatching(&s.synced, k, wa); err != nil {
			log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
		}
	} else {
		slowWatchingGauge.Inc()
		s.unsynced[wa] = struct{}{}
	}
	watchingGauge.Inc()

	cancel := CancelFunc(func() {
		s.mu.Lock()
		defer s.mu.Unlock()

		// remove global references of the watching
		if _, ok := s.unsynced[wa]; ok {
			delete(s.unsynced, wa)
			slowWatchingGauge.Dec()
			watchingGauge.Dec()
			return
		}

		if v, ok := s.synced[k]; ok {
			if _, ok := v[wa]; ok {
				delete(v, wa)
				// if there is nothing in s.synced[k],
				// remove the key from the synced map
				if len(v) == 0 {
					delete(s.synced, k)
				}
				watchingGauge.Dec()
			}
		}
		// If the watching is in neither map, its watch has already finished.
	})

	return wa, cancel
}

// syncWatchingsLoop syncs the watchings in the unsynced map every 100ms.
func (s *watchableStore) syncWatchingsLoop() {
	defer s.wg.Done()

	for {
		s.mu.Lock()
		s.syncWatchings()
		s.mu.Unlock()

		select {
		case <-time.After(100 * time.Millisecond):
		case <-s.stopc:
			return
		}
	}
}

// syncWatchings syncs the unsynced watchings in one pass. It iterates all
// unsynced watchings to find the minimum revision among them (dropping any
// watching whose current revision is behind the compact revision of the
// store), ranges the backend once starting from that minimum revision, and
// sends the resulting events to the watchings that want them.
func (s *watchableStore) syncWatchings() {
	s.store.mu.Lock()
	defer s.store.mu.Unlock()

	if len(s.unsynced) == 0 {
		return
	}

	// in order to find key-value pairs from unsynced watchings, we need to
	// find the min revision index, and these revisions can be used to
	// query the backend store of key-value pairs
	minRev := int64(math.MaxInt64)

	curRev := s.store.currentRev.main
	compactionRev := s.store.compactMainRev

	// TODO: change the type of unsynced to be the same as this
	keyToUnsynced := make(map[string]map[*watching]struct{})

	for w := range s.unsynced {
		k := string(w.key)

		if w.cur > curRev {
			panic("watching current revision should not exceed current revision")
		}

		if w.cur < compactionRev {
			// TODO: return a compacted error to that watching instead of
			// just silently removing it from unsynced.
			delete(s.unsynced, w)
			continue
		}

		if minRev >= w.cur {
			minRev = w.cur
		}

		if _, ok := keyToUnsynced[k]; !ok {
			keyToUnsynced[k] = make(map[*watching]struct{})
		}
		keyToUnsynced[k][w] = struct{}{}
	}

	minBytes, maxBytes := newRevBytes(), newRevBytes()
	revToBytes(revision{main: minRev}, minBytes)
	revToBytes(revision{main: curRev + 1}, maxBytes)

	// UnsafeRange returns keys and values. In boltdb, the keys are revisions
	// and the values are the actual key-value pairs in the backend.
	tx := s.store.b.BatchTx()
	tx.Lock()
	ks, vs := tx.UnsafeRange(keyBucketName, minBytes, maxBytes, 0)
	tx.Unlock()

	for i, v := range vs {
		var kv storagepb.KeyValue
		if err := kv.Unmarshal(v); err != nil {
			log.Panicf("storage: cannot unmarshal event: %v", err)
		}

		k := string(kv.Key)
		wm, ok := keyToUnsynced[k]
		if !ok {
			continue
		}

		var ev storagepb.Event
		switch {
		case isTombstone(ks[i]):
			ev.Type = storagepb.DELETE
		default:
			ev.Type = storagepb.PUT
		}
		ev.Kv = &kv

		for w := range wm {
			ev.WatchID = w.id
			select {
			case w.ch <- ev:
				pendingEventsGauge.Inc()
			default:
				// TODO: handle the full unsynced watchings.
				// continue to process other watchings for now, the full ones
				// will be processed next time and hopefully it will not be full.
				continue
			}
			if err := unsafeAddWatching(&s.synced, k, w); err != nil {
				log.Panicf("error unsafeAddWatching (%v) for key %s", err, k)
			}
			delete(s.unsynced, w)
		}
	}

	slowWatchingGauge.Set(float64(len(s.unsynced)))
}

// handle dispatches the given event at the given rev to all watchings
// interested in it.
func (s *watchableStore) handle(rev int64, ev storagepb.Event) {
	s.notify(rev, ev)
}

// notify sends the given event at the given rev to the synced watchings that
// watch on the event's key, either exactly or by prefix.
func (s *watchableStore) notify(rev int64, ev storagepb.Event) {
	// check all prefixes of the key to notify all corresponding watchings
	for i := 0; i <= len(ev.Kv.Key); i++ {
		k := string(ev.Kv.Key[:i])
		if wm, ok := s.synced[k]; ok {
			for w := range wm {
				// the watching is notified only when it watches a prefix or
				// the key matches exactly.
				if !w.prefix && i != len(ev.Kv.Key) {
					continue
				}
				ev.WatchID = w.id
				select {
				case w.ch <- ev:
					pendingEventsGauge.Inc()
				default:
					// the channel is full; move the watching to unsynced
					// so that it can catch up later.
					w.cur = rev
					s.unsynced[w] = struct{}{}
					delete(wm, w)
					slowWatchingGauge.Inc()
				}
			}
		}
	}
}

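// ongoingTx tracks which keys an in-flight txn has put or deleted so that
// TxnEnd can emit the corresponding events after the txn commits.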
type ongoingTx struct {
	// keys put/deleted in the ongoing txn
	putm map[string]struct{}
	delm map[string]struct{}
}

func newOngoingTx() *ongoingTx {
	return &ongoingTx{
		putm: make(map[string]struct{}),
		delm: make(map[string]struct{}),
	}
}

func (tx *ongoingTx) put(k string) {
	tx.putm[k] = struct{}{}
	if _, ok := tx.delm[k]; ok {
		delete(tx.delm, k)
	}
}

func (tx *ongoingTx) del(k string) {
	tx.delm[k] = struct{}{}
	if _, ok := tx.putm[k]; ok {
		delete(tx.putm, k)
	}
}

type watching struct {
	// the watching key
	key []byte

	// prefix indicates if watching is on a key or a prefix.
	// If prefix is true, the watching is on a prefix.
	prefix bool

	// cur is the current watching revision.
	// If cur is behind the current revision of the KV,
	// watching is unsynced and needs to catch up.
	cur int64

	id int64

	// a chan to send out the watched events.
	// The chan might be shared with other watchings.
	ch chan<- storagepb.Event
}

// unsafeAddWatching puts a watching with key k into the watchableStore's
// synced map. It is not thread-safe; the caller must hold the store's mutex.
func unsafeAddWatching(synced *map[string]map[*watching]struct{}, k string, wa *watching) error {
	if wa == nil {
		return fmt.Errorf("nil watching received")
	}
	mp := *synced
	if v, ok := mp[k]; ok {
		if _, ok := v[wa]; ok {
			return fmt.Errorf("put the same watch twice: %+v", wa)
		}
		v[wa] = struct{}{}
		return nil
	}

	mp[k] = make(map[*watching]struct{})
	mp[k][wa] = struct{}{}
	return nil
}