From dad8208a4ddd46bcbab937d0c2cac5fedf5a3113 Mon Sep 17 00:00:00 2001
From: Tobias Grieger <tobias.b.grieger@gmail.com>
Date: Mon, 5 Sep 2022 17:29:07 +0200
Subject: [PATCH] raft: avoid panics during *node tests

`StartNode` runs a naked goroutine, so it's impossible to test against
it in a way that will reliably produce contained test failures when
assertions are hit on the `(*node).run` goroutine.

This commit introduces a harness that we can use in tests to wrap
this goroutine and allow it to defer errors to `*testing.T`.

Note that tests of `Node` still need to be architected carefully
since it's easy to produce a deadlock in them should things not
go exactly as planned.

Signed-off-by: Tobias Grieger <tobias.b.grieger@gmail.com>
---
 raft/node.go           |  18 ++++---
 raft/node_test.go      |   7 +--
 raft/node_util_test.go | 109 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+), 12 deletions(-)
 create mode 100644 raft/node_util_test.go

diff --git a/raft/node.go b/raft/node.go
index d374b6c0c..381000621 100644
--- a/raft/node.go
+++ b/raft/node.go
@@ -211,11 +211,7 @@ type Peer struct {
 	Context []byte
 }
 
-// StartNode returns a new Node given configuration and a list of raft peers.
-// It appends a ConfChangeAddNode entry for each given peer to the initial log.
-//
-// Peers must not be zero length; call RestartNode in that case.
-func StartNode(c *Config, peers []Peer) Node {
+func setupNode(c *Config, peers []Peer) *node {
 	if len(peers) == 0 {
 		panic("no peers given; use RestartNode instead")
 	}
@@ -229,11 +225,19 @@ func StartNode(c *Config, peers []Peer) Node {
 	}
 
 	n := newNode(rn)
-
-	go n.run()
 	return &n
 }
 
+// StartNode returns a new Node given configuration and a list of raft peers.
+// It appends a ConfChangeAddNode entry for each given peer to the initial log.
+//
+// Peers must not be zero length; call RestartNode in that case.
+func StartNode(c *Config, peers []Peer) Node {
+	n := setupNode(c, peers)
+	go n.run()
+	return n
+}
+
 // RestartNode is similar to StartNode but does not take a list of peers.
 // The current membership of the cluster will be restored from the Storage.
 // If the caller has an existing state machine, pass in the last log index that
diff --git a/raft/node_test.go b/raft/node_test.go
index 2c2ff8018..b732174f1 100644
--- a/raft/node_test.go
+++ b/raft/node_test.go
@@ -740,9 +740,6 @@ func TestNodeRestartFromSnapshot(t *testing.T) {
 }
 
 func TestNodeAdvance(t *testing.T) {
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
 	storage := NewMemoryStorage()
 	c := &Config{
 		ID:              1,
@@ -752,8 +749,8 @@ func TestNodeAdvance(t *testing.T) {
 		MaxSizePerMsg:   noLimit,
 		MaxInflightMsgs: 256,
 	}
-	n := StartNode(c, []Peer{{ID: 1}})
-	defer n.Stop()
+	ctx, cancel, n := newNodeTestHarness(t, context.Background(), c, Peer{ID: 1})
+	defer cancel()
 	rd := <-n.Ready()
 	storage.Append(rd.Entries)
 	n.Advance()
diff --git a/raft/node_util_test.go b/raft/node_util_test.go
new file mode 100644
index 000000000..fb3473d12
--- /dev/null
+++ b/raft/node_util_test.go
@@ -0,0 +1,109 @@
+// Copyright 2015 The etcd Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package raft
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+)
+
+type nodeTestHarness struct {
+	*node
+	t *testing.T
+}
+
+func (l *nodeTestHarness) Debug(v ...interface{}) {
+	l.t.Log(v...)
+}
+
+func (l *nodeTestHarness) Debugf(format string, v ...interface{}) {
+	l.t.Logf(format, v...)
+}
+
+func (l *nodeTestHarness) Error(v ...interface{}) {
+	l.t.Error(v...)
+}
+
+func (l *nodeTestHarness) Errorf(format string, v ...interface{}) {
+	l.t.Errorf(format, v...)
+}
+
+func (l *nodeTestHarness) Info(v ...interface{}) {
+	l.t.Log(v...)
+}
+
+func (l *nodeTestHarness) Infof(format string, v ...interface{}) {
+	l.t.Logf(format, v...)
+}
+
+func (l *nodeTestHarness) Warning(v ...interface{}) {
+	l.t.Log(v...)
+}
+
+func (l *nodeTestHarness) Warningf(format string, v ...interface{}) {
+	l.t.Logf(format, v...)
+}
+
+func (l *nodeTestHarness) Fatal(v ...interface{}) {
+	l.t.Error(v...)
+	panic(v)
+}
+
+func (l *nodeTestHarness) Fatalf(format string, v ...interface{}) {
+	l.t.Errorf(format, v...)
+	panic(fmt.Sprintf(format, v...))
+}
+
+func (l *nodeTestHarness) Panic(v ...interface{}) {
+	l.t.Log(v...)
+	panic(v)
+}
+
+func (l *nodeTestHarness) Panicf(format string, v ...interface{}) {
+	l.t.Errorf(format, v...)
+	panic(fmt.Sprintf(format, v...))
+}
+
+func newNodeTestHarness(t *testing.T, ctx context.Context, cfg *Config, peers ...Peer) (_ context.Context, cancel func(), _ *nodeTestHarness) {
+	// Wrap context in a 10s timeout to make tests more robust. Otherwise,
+	// it's likely that deadlock will occur unless Node behaves exactly as
+	// expected - when you expect a Ready and start waiting on the channel
+	// but no Ready ever shows up, for example.
+	ctx, cancel = context.WithTimeout(ctx, 10*time.Second)
+	var n *node
+	if len(peers) > 0 {
+		n = setupNode(cfg, peers)
+	} else {
+		rn, err := NewRawNode(cfg)
+		if err != nil {
+			t.Fatal(err)
+		}
+		nn := newNode(rn)
+		n = &nn
+	}
+	go func() {
+		defer func() {
+			if r := recover(); r != nil {
+				t.Error(r)
+			}
+		}()
+		defer cancel()
+		defer n.Stop()
+		n.run()
+	}()
+	return ctx, cancel, &nodeTestHarness{node: n, t: t}
+}