From 91f6aee4f2a8e00e31c24b2a0c1a1f61b8ae2e72 Mon Sep 17 00:00:00 2001 From: Gyu-Ho Lee Date: Tue, 25 Apr 2017 06:19:52 -0700 Subject: [PATCH] etcdserver: ensure waitForApply sync with applyAll Problem is: `Step1`: `etcdserver/raft.go`'s `Ready` process routine sends config-change entries via `r.applyc <- ap` (https://github.com/coreos/etcd/blob/master/etcdserver/raft.go#L193-L203) `Step2`: `etcdserver/server.go`'s `*EtcdServer.run` routine receives this via `ap := <-s.r.apply()` (https://github.com/coreos/etcd/blob/master/etcdserver/server.go#L735-L738) `StepA`: `Step1` proceeds without sync, right after sending `r.applyc <- ap`. `StepB`: `Step2` proceeds without sync, right after `sched.Schedule(s.applyAll(&ep,&ap))`. `StepC`: `etcdserver` tries to sync with `s.applyAll(&ep,&ap)` by calling `rh.waitForApply()`. `rh.waitForApply()` waits for all pending jobs to finish in `pkg/schedule` side. However, the order of `StepA`,`StepB`,`StepC` is not guaranteed. It is possible that `StepC` happens first, and proceeds without waiting on apply. And the restarting member comes back as a leader in single-node cluster, when there is no synchronization between apply-layer and config-change Raft entry apply. Confirmed with more debugging lines below, only reproducible with slow CPU VM (~2 vCPU). ``` ~:24.005397 I | etcdserver: starting server... [version: 3.2.0+git, cluster version: to_be_decided] ~:24.011136 I | etcdserver: [DEBUG] 29b2d24047a277df waitForApply before ~:24.011194 I | etcdserver: [DEBUG] 29b2d24047a277df starts wait for 0 pending jobs ~:24.011234 I | etcdserver: [DEBUG] 29b2d24047a277df finished wait for 0 pending jobs (current pending 0) ~:24.011268 I | etcdserver: [DEBUG] 29b2d24047a277df waitForApply after ~:24.011348 I | etcdserver: [DEBUG] [0] 29b2d24047a277df is scheduling conf change on 29b2d24047a277df ~:24.011396 I | etcdserver: [DEBUG] [1] 29b2d24047a277df is scheduling conf change on 5edf80e32a334cf0 ~:24.011437 I | etcdserver: [DEBUG] [2] 29b2d24047a277df is scheduling conf change on e32e31e76c8d2678 ~:24.011477 I | etcdserver: [DEBUG] 29b2d24047a277df scheduled conf change on 29b2d24047a277df ~:24.011509 I | etcdserver: [DEBUG] 29b2d24047a277df scheduled conf change on 5edf80e32a334cf0 ~:24.011545 I | etcdserver: [DEBUG] 29b2d24047a277df scheduled conf change on e32e31e76c8d2678 ~:24.012500 I | etcdserver: [DEBUG] 29b2d24047a277df applyConfChange on 29b2d24047a277df before ~:24.013014 I | etcdserver/membership: added member 29b2d24047a277df [unix://127.0.0.1:2100515039] to cluster 9250d4ae34216949 ~:24.013066 I | etcdserver: [DEBUG] 29b2d24047a277df applyConfChange on 29b2d24047a277df after ~:24.013113 I | etcdserver: [DEBUG] 29b2d24047a277df applyConfChange on 29b2d24047a277df after trigger ~:24.013158 I | etcdserver: [DEBUG] 29b2d24047a277df applyConfChange on 5edf80e32a334cf0 before ~:24.013666 W | etcdserver: failed to send out heartbeat on time (exceeded the 10ms timeout for 11.964739ms) ~:24.013709 W | etcdserver: server is likely overloaded ~:24.013750 W | etcdserver: failed to send out heartbeat on time (exceeded the 10ms timeout for 12.057265ms) ~:24.013775 W | etcdserver: server is likely overloaded ~:24.013950 I | raft: 29b2d24047a277df is starting a new election at term 4 ~:24.014012 I | raft: 29b2d24047a277df became candidate at term 5 ~:24.014051 I | raft: 29b2d24047a277df received MsgVoteResp from 29b2d24047a277df at term 5 ~:24.014107 I | raft: 29b2d24047a277df became leader at term 5 ~:24.014146 I | raft: raft.node: 29b2d24047a277df elected leader 29b2d24047a277df at term 5 ``` I am printing out the number of pending jobs before we call `sched.WaitFinish(0)`, and there was no pending jobs, so it returned immediately (before we schedule `applyAll`). This is the root cause to: - https://github.com/coreos/etcd/issues/7595 - https://github.com/coreos/etcd/issues/7739 - https://github.com/coreos/etcd/issues/7802 `sched.WaitFinish(0)` doesn't work when `len(f.pendings)==0` and `f.finished==0`. Config-change is the first job to apply, so `f.finished` is 0 in this case. `f.finished` monotonically increases, so we need `WaitFinish(finished+1)`. And `finished` must be the one before calling `Schedule`. This is safe because `Schedule(applyAll)` is the only place adding jobs to `sched`. Then scheduler waits on the single job of `applyAll`, by getting the current number of finished jobs before sending `Schedule`. Or just make it be blocked until `applyAll` routine triggers on the config-change job. This patch just removes `waitForApply`, and signal `raftDone` to wait until `applyAll` finishes applying entries. Confirmed that it fixes the issue, as below: ``` ~:43.198354 I | rafthttp: started streaming with peer 36cda5222aba364b (stream MsgApp v2 reader) ~:43.198740 I | etcdserver: [DEBUG] 3988bc20c2b2e40c waitForApply before ~:43.198836 I | etcdserver: [DEBUG] 3988bc20c2b2e40c starts wait for 0 pending jobs, 1 finished jobs ~:43.200696 I | integration: launched 3169361310155633349 () ~:43.201784 I | etcdserver: [DEBUG] [0] 3988bc20c2b2e40c is scheduling conf change on 36cda5222aba364b ~:43.201884 I | etcdserver: [DEBUG] [1] 3988bc20c2b2e40c is scheduling conf change on 3988bc20c2b2e40c ~:43.201965 I | etcdserver: [DEBUG] [2] 3988bc20c2b2e40c is scheduling conf change on cf5d6cbc2a121727 ~:43.202070 I | etcdserver: [DEBUG] 3988bc20c2b2e40c scheduled conf change on 36cda5222aba364b ~:43.202139 I | etcdserver: [DEBUG] 3988bc20c2b2e40c scheduled conf change on 3988bc20c2b2e40c ~:43.202204 I | etcdserver: [DEBUG] 3988bc20c2b2e40c scheduled conf change on cf5d6cbc2a121727 ~:43.202444 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on 36cda5222aba364b (request ID: 0) before ~:43.204486 I | etcdserver/membership: added member 36cda5222aba364b [unix://127.0.0.1:2100913646] to cluster 425d73f1b7b01674 ~:43.204588 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on 36cda5222aba364b (request ID: 0) after ~:43.204703 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on 36cda5222aba364b (request ID: 0) after trigger ~:43.204791 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on 3988bc20c2b2e40c (request ID: 0) before ~:43.205689 I | etcdserver/membership: added member 3988bc20c2b2e40c [unix://127.0.0.1:2101113646] to cluster 425d73f1b7b01674 ~:43.205783 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on 3988bc20c2b2e40c (request ID: 0) after ~:43.205929 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on 3988bc20c2b2e40c (request ID: 0) after trigger ~:43.206056 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on cf5d6cbc2a121727 (request ID: 0) before ~:43.207353 I | etcdserver/membership: added member cf5d6cbc2a121727 [unix://127.0.0.1:2100713646] to cluster 425d73f1b7b01674 ~:43.207516 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on cf5d6cbc2a121727 (request ID: 0) after ~:43.207619 I | etcdserver: [DEBUG] 3988bc20c2b2e40c applyConfChange on cf5d6cbc2a121727 (request ID: 0) after trigger ~:43.207710 I | etcdserver: [DEBUG] 3988bc20c2b2e40c finished scheduled conf change on 36cda5222aba364b ~:43.207781 I | etcdserver: [DEBUG] 3988bc20c2b2e40c finished scheduled conf change on 3988bc20c2b2e40c ~:43.207843 I | etcdserver: [DEBUG] 3988bc20c2b2e40c finished scheduled conf change on cf5d6cbc2a121727 ~:43.207951 I | etcdserver: [DEBUG] 3988bc20c2b2e40c finished wait for 0 pending jobs (current pending 0, finished 1) ~:43.208029 I | rafthttp: started HTTP pipelining with peer cf5d6cbc2a121727 ~:43.210339 I | rafthttp: peer 3988bc20c2b2e40c became active ~:43.210435 I | rafthttp: established a TCP streaming connection with peer 3988bc20c2b2e40c (stream MsgApp v2 reader) ~:43.210861 I | rafthttp: started streaming with peer 3988bc20c2b2e40c (writer) ~:43.211732 I | etcdserver: [DEBUG] 3988bc20c2b2e40c waitForApply after ``` Signed-off-by: Gyu-Ho Lee --- etcdserver/raft.go | 5 ++++- etcdserver/raft_test.go | 15 +++------------ etcdserver/server.go | 4 ---- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/etcdserver/raft.go b/etcdserver/raft.go index b603c459c..4528d7ba3 100644 --- a/etcdserver/raft.go +++ b/etcdserver/raft.go @@ -257,7 +257,10 @@ func (r *raftNode) start(rh *raftReadyHandler) { } } if waitApply { - rh.waitForApply() + // blocks until 'applyAll' calls 'applyWait.Trigger' + // to be in sync with scheduled config-change job + // (assume raftDone has cap of 1) + raftDone <- struct{}{} } // gofail: var raftBeforeFollowerSend struct{} diff --git a/etcdserver/raft_test.go b/etcdserver/raft_test.go index fa5c0174b..3396e60f0 100644 --- a/etcdserver/raft_test.go +++ b/etcdserver/raft_test.go @@ -180,8 +180,6 @@ func TestStopRaftWhenWaitingForApplyDone(t *testing.T) { func TestConfgChangeBlocksApply(t *testing.T) { n := newNopReadyNode() - waitApplyc := make(chan struct{}) - r := newRaftNode(raftNodeConfig{ Node: n, storage: mockstorage.NewStorageRecorder(""), @@ -190,21 +188,14 @@ func TestConfgChangeBlocksApply(t *testing.T) { }) srv := &EtcdServer{r: *r} - rh := &raftReadyHandler{ - updateLeadership: func(bool) {}, - waitForApply: func() { - <-waitApplyc - }, - } - - srv.r.start(rh) + srv.r.start(&raftReadyHandler{updateLeadership: func(bool) {}}) defer srv.r.Stop() n.readyc <- raft.Ready{ SoftState: &raft.SoftState{RaftState: raft.StateFollower}, CommittedEntries: []raftpb.Entry{{Type: raftpb.EntryConfChange}}, } - <-srv.r.applyc + ap := <-srv.r.applyc continueC := make(chan struct{}) go func() { @@ -220,7 +211,7 @@ func TestConfgChangeBlocksApply(t *testing.T) { } // finish apply, unblock raft routine - close(waitApplyc) + <-ap.raftDone select { case <-continueC: diff --git a/etcdserver/server.go b/etcdserver/server.go index aa4a17359..f5fc36952 100644 --- a/etcdserver/server.go +++ b/etcdserver/server.go @@ -614,7 +614,6 @@ type etcdProgress struct { type raftReadyHandler struct { updateLeadership func(newLeader bool) updateCommittedIndex func(uint64) - waitForApply func() } func (s *EtcdServer) run() { @@ -676,9 +675,6 @@ func (s *EtcdServer) run() { s.setCommittedIndex(ci) } }, - waitForApply: func() { - sched.WaitFinish(0) - }, } s.r.start(rh)