mvcc: Fix races between metrics gathering and mvcc.Restore
The race was manifesting as the following flakes (see https://github.com/etcd-io/etcd/issues/12336):

```
{"level":"warn","ts":"2020-09-26T13:33:13.010Z","caller":"clientv3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"endpoint://client-c9c21e47-2013-4776-8e83-e331b2caa9ae/localhost:14422410081761184170","attempt":0,"error":"rpc error: code = Unavailable desc = all SubConns are in TransientFailure, latest connection error: connection error: desc = \"transport: Error while dialing dial unix localhost:14422410081761184170: connect: no such file or directory\""}
{"level":"warn","ts":"2020-09-26T13:33:13.011Z","caller":"clientv3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"endpoint://client-c9c21e47-2013-4776-8e83-e331b2caa9ae/localhost:14422410081761184170","attempt":0,"error":"rpc error: code = Unavailable desc = all SubConns are in TransientFailure, latest connection error: connection error: desc = \"transport: Error while dialing dial unix localhost:14422410081761184170: connect: no such file or directory\""}
{"level":"warn","ts":"2020-09-26T13:33:16.285Z","caller":"clientv3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"endpoint://client-b504e954-e000-42a4-aa4f-70ded8dbef39/localhost:55672762955698614610","attempt":0,"error":"rpc error: code = NotFound desc = etcdserver: requested lease not found"}
{"level":"warn","ts":"2020-09-26T13:33:21.434Z","caller":"clientv3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"endpoint://client-7945004b-f67e-42aa-af11-a7b40fbbe6fc/localhost:49623072144007561240","attempt":0,"error":"rpc error: code = Canceled desc = context canceled"}
==================
WARNING: DATA RACE
Write at 0x00c000905f78 by goroutine 764:
  go.etcd.io/etcd/v3/mvcc.(*store).restore()
      /go/src/go.etcd.io/etcd/mvcc/kvstore.go:397 +0x773
  go.etcd.io/etcd/v3/mvcc.(*store).Restore()
      /go/src/go.etcd.io/etcd/mvcc/kvstore.go:343 +0x5f1
  go.etcd.io/etcd/v3/mvcc.(*watchableStore).Restore()
      /go/src/go.etcd.io/etcd/mvcc/watchable_store.go:199 +0xe2
  go.etcd.io/etcd/v3/etcdserver.(*EtcdServer).applySnapshot()
      /go/src/go.etcd.io/etcd/etcdserver/server.go:1107 +0xa49
  go.etcd.io/etcd/v3/etcdserver.(*EtcdServer).applyAll()
      /go/src/go.etcd.io/etcd/etcdserver/server.go:1031 +0x6d
  go.etcd.io/etcd/v3/etcdserver.(*EtcdServer).run.func8()
      /go/src/go.etcd.io/etcd/etcdserver/server.go:986 +0x53
  go.etcd.io/etcd/v3/pkg/schedule.(*fifo).run()
      /go/src/go.etcd.io/etcd/pkg/schedule/schedule.go:157 +0x11e

Previous read at 0x00c000905f78 by goroutine 180:
  [failed to restore the stack]

Goroutine 764 (running) created at:
  go.etcd.io/etcd/v3/pkg/schedule.NewFIFOScheduler()
      /go/src/go.etcd.io/etcd/pkg/schedule/schedule.go:70 +0x2b1
  go.etcd.io/etcd/v3/etcdserver.(*EtcdServer).run()
      /go/src/go.etcd.io/etcd/etcdserver/server.go:871 +0x32c

Goroutine 180 (running) created at:
  net/http.(*Server).Serve()
      /usr/local/go/src/net/http/server.go:2933 +0x5b6
  net/http/httptest.(*Server).goServe.func1()
      /usr/local/go/src/net/http/httptest/server.go:308 +0xd3
==================
--- FAIL: TestV3WatchRestoreSnapshotUnsync (6.74s)
    testing.go:906: race detected during execution of test
FAIL
coverage: 83.5% of statements
FAIL	go.etcd.io/etcd/v3/integration	231.272s
FAIL
Command 'go test -timeout=30m -cpu=1 --race --cover=true go.etcd.io/etcd/v3/integration' failed.
```

I'm taking the locks for a short duration (instead of for the whole duration of Restore) so that metrics can still be gathered while the server restoration is in progress.
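The fix itself is the standard short-critical-section pattern. Below is a minimal, self-contained sketch of that pattern, assuming a concurrent metrics reader of the same fields; the store struct and the currentRevision helper here are simplified stand-ins invented for illustration, not etcd's actual mvcc types:

```go
package main

import (
	"fmt"
	"sync"
)

// store mimics the two revision fields that mvcc.Restore resets and
// that metrics gathering reads concurrently; revMu guards both.
type store struct {
	revMu          sync.RWMutex
	currentRev     int64
	compactMainRev int64
}

// restore resets the revision fields the way Restore does, but holds
// revMu only while the fields are written, not for the whole restore.
func (s *store) restore() {
	s.revMu.Lock()
	s.currentRev = 1
	s.compactMainRev = -1
	s.revMu.Unlock()

	// ... the long-running index rebuild would run here, outside the lock ...
}

// currentRevision is a stand-in for a metrics read; it takes the same
// lock, so there is a happens-before edge with the writes in restore.
func (s *store) currentRevision() int64 {
	s.revMu.RLock()
	defer s.revMu.RUnlock()
	return s.currentRev
}

func main() {
	s := &store{}
	var wg sync.WaitGroup
	wg.Add(2)
	go func() { defer wg.Done(); s.restore() }()
	go func() { defer wg.Done(); fmt.Println(s.currentRevision()) }()
	wg.Wait()
}
```

Run under the race detector (`go run -race` / `go test -race`), this pattern no longer trips it; the trade-off, called out in the diff comment below, is that a reader may briefly observe the 'special' placeholder values currentRev = 1 and compactMainRev = -1 while the restore is still in progress.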
This commit is contained in:
parent 220f711a2a
commit 98b123f034
```diff
--- a/mvcc/kvstore.go
+++ b/mvcc/kvstore.go
@@ -333,8 +333,15 @@ func (s *store) Restore(b backend.Backend) error {
 	s.b = b
 	s.kvindex = newTreeIndex(s.lg)
-	s.currentRev = 1
-	s.compactMainRev = -1
+
+	{
+		// During restore the metrics might report 'special' values
+		s.revMu.Lock()
+		s.currentRev = 1
+		s.compactMainRev = -1
+		s.revMu.Unlock()
+	}
+
 	s.fifoSched = schedule.NewFIFOScheduler()
 	s.stopc = make(chan struct{})
 	s.ci.SetBatchTx(b.BatchTx())
@@ -358,6 +365,7 @@ func (s *store) restore() error {
 
 	_, finishedCompactBytes := tx.UnsafeRange(metaBucketName, finishedCompactKeyName, nil, 0)
 	if len(finishedCompactBytes) != 0 {
+		s.revMu.Lock()
 		s.compactMainRev = bytesToRev(finishedCompactBytes[0]).main
 
 		s.lg.Info(
@@ -366,6 +374,7 @@ func (s *store) restore() error {
 			zap.String("meta-bucket-name-key", string(finishedCompactKeyName)),
 			zap.Int64("restored-compact-revision", s.compactMainRev),
 		)
+		s.revMu.Unlock()
 	}
 	_, scheduledCompactBytes := tx.UnsafeRange(metaBucketName, scheduledCompactKeyName, nil, 0)
 	scheduledCompact := int64(0)
@@ -394,14 +403,20 @@ func (s *store) restore() error {
 		revToBytes(newMin, min)
 	}
 	close(rkvc)
-	s.currentRev = <-revc
-
-	// keys in the range [compacted revision -N, compaction] might all be deleted due to compaction.
-	// the correct revision should be set to compaction revision in the case, not the largest revision
-	// we have seen.
-	if s.currentRev < s.compactMainRev {
-		s.currentRev = s.compactMainRev
-	}
+
+	{
+		s.revMu.Lock()
+		s.currentRev = <-revc
+
+		// keys in the range [compacted revision -N, compaction] might all be deleted due to compaction.
+		// the correct revision should be set to compaction revision in the case, not the largest revision
+		// we have seen.
+		if s.currentRev < s.compactMainRev {
+			s.currentRev = s.compactMainRev
+		}
+		s.revMu.Unlock()
+	}
+
 	if scheduledCompact <= s.compactMainRev {
 		scheduledCompact = 0
 	}
```