From 5367c1c99890d0ca37d92dca6fd0c48ca4f689bd Mon Sep 17 00:00:00 2001 From: Yicheng Qin Date: Fri, 9 May 2014 15:38:03 -0700 Subject: [PATCH] chore(standby): minor changes based on comments --- Documentation/design/standbys.md | 12 ++++++------ etcd/etcd.go | 4 ++-- server/cluster_config.go | 4 ++-- server/peer_server.go | 6 +++--- server/peer_server_handlers.go | 1 - tests/functional/kill_leader_test.go | 4 ++-- tests/functional/standby_test.go | 16 ++++++++-------- 7 files changed, 23 insertions(+), 24 deletions(-) diff --git a/Documentation/design/standbys.md b/Documentation/design/standbys.md index 8bf201b81..1e024ad52 100644 --- a/Documentation/design/standbys.md +++ b/Documentation/design/standbys.md @@ -11,13 +11,13 @@ Standbys also act as standby nodes in the event that a peer node in the cluster ## Configuration Parameters -There are three configuration parameters used by standbys: active size, promotion delay and standby sync interval. +There are three configuration parameters used by standbys: active size, remove delay and standby sync interval. The active size specifies a target size for the number of peers in the cluster. If there are not enough peers to meet the active size then, standbys will send join requests until the peer count is equal to the active size. -If there are more peers than the target active size then peers are demoted to standbys by the leader. +If there are more peers than the target active size then peers are removed by the leader and will become standbys. -The promotion delay specifies how long the cluster should wait before removing a dead peer. +The remove delay specifies how long the cluster should wait before removing a dead peer. By default this is 30 minutes. If a peer is inactive for 30 minutes then the peer is removed. 
@@ -169,7 +169,7 @@ Loop: Sleep for some time For each peer: - If peer last activity time > promote delay: + If peer last activity time > remove delay: Remove the peer Goto Loop ``` @@ -200,7 +200,7 @@ Machines in standby mode always sync the cluster. If sync fails, it uses the fir Leader of the cluster lose the connection with the peer. -When the time exceeds promotion delay, it removes the peer from the cluster. +When the time exceeds remove delay, it removes the peer from the cluster. Machine in standby mode finds one available place of the cluster. It sends join request and joins the cluster. @@ -224,7 +224,7 @@ No change for the cluster. ## Future Attack Plans -1. Based on heartbeat miss and promotion delay, standby could adjust its next check time. +1. Based on heartbeat miss and remove delay, standby could adjust its next check time. 2. Preregister the promotion target when heartbeat miss happens. diff --git a/etcd/etcd.go b/etcd/etcd.go index 2095afa01..77177454a 100644 --- a/etcd/etcd.go +++ b/etcd/etcd.go @@ -237,11 +237,11 @@ func (e *Etcd) Run() { peerTLSConfig := server.TLSServerConfig(e.Config.PeerTLSInfo()) etcdTLSConfig := server.TLSServerConfig(e.Config.EtcdTLSInfo()) - toStartPeerServer, possiblePeers, err := e.PeerServer.FindCluster(e.Config.Discovery, e.Config.Peers) + startPeerServer, possiblePeers, err := e.PeerServer.FindCluster(e.Config.Discovery, e.Config.Peers) if err != nil { log.Fatal(err) } - if toStartPeerServer { + if startPeerServer { e.setMode(PeerMode) } else { e.StandbyServer.SyncCluster(possiblePeers) diff --git a/server/cluster_config.go b/server/cluster_config.go index 5875f461b..44c955fce 100644 --- a/server/cluster_config.go +++ b/server/cluster_config.go @@ -11,10 +11,10 @@ const ( // MinActiveSize is the minimum active size allowed. MinActiveSize = 3 - // DefaultRemoveDelay is the default elapsed time before promotion. + // DefaultRemoveDelay is the default elapsed time before removal. 
DefaultRemoveDelay = int((30 * time.Minute) / time.Second) - // MinRemoveDelay is the minimum promote delay allowed. + // MinRemoveDelay is the minimum remove delay allowed. MinRemoveDelay = int((2 * time.Second) / time.Second) // DefaultSyncInterval is the default interval for cluster sync. diff --git a/server/peer_server.go b/server/peer_server.go index 4b4e8fc94..d65e1573b 100644 --- a/server/peer_server.go +++ b/server/peer_server.go @@ -372,8 +372,8 @@ func (s *PeerServer) ClusterConfig() *ClusterConfig { } // SetClusterConfig updates the current cluster configuration. -// Adjusting the active size will cause the PeerServer to demote peers or -// promote standbys to match the new size. +// Adjusting the active size will cause the cluster to add or remove machines +// to match the new size. func (s *PeerServer) SetClusterConfig(c *ClusterConfig) { // Set minimums. if c.ActiveSize < MinActiveSize { @@ -820,7 +820,7 @@ func (s *PeerServer) monitorPeerActivity() { removeDelay := time.Duration(s.ClusterConfig().RemoveDelay) * time.Second peers := s.raftServer.Peers() for _, peer := range peers { - // If the last response from the peer is longer than the promote delay + // If the last response from the peer is longer than the remove delay // then automatically demote the peer. if !peer.LastActivity().IsZero() && now.Sub(peer.LastActivity()) > removeDelay { log.Infof("%s: removing node: %v; last activity %v ago", s.Config.Name, peer.Name, now.Sub(peer.LastActivity())) diff --git a/server/peer_server_handlers.go b/server/peer_server_handlers.go index d909fb01e..ebeaa182d 100644 --- a/server/peer_server_handlers.go +++ b/server/peer_server_handlers.go @@ -221,7 +221,6 @@ func (ps *PeerServer) setClusterConfigHttpHandler(w http.ResponseWriter, req *ht } // Retrieves a list of peers and standbys. -// If leader exists, it is at the first place. 
func (ps *PeerServer) getMachinesHttpHandler(w http.ResponseWriter, req *http.Request) { machines := make([]*machineMessage, 0) leader := ps.raftServer.Leader() diff --git a/tests/functional/kill_leader_test.go b/tests/functional/kill_leader_test.go index 25fc3f996..7c18d46be 100644 --- a/tests/functional/kill_leader_test.go +++ b/tests/functional/kill_leader_test.go @@ -114,10 +114,10 @@ func TestKillLeaderWithStandbys(t *testing.T) { leader := "http://127.0.0.1:7001" for i := 0; i < clusterSize; i++ { - fmt.Println("leader is ", leader) + t.Log("leader is ", leader) port, _ := strconv.Atoi(strings.Split(leader, ":")[2]) num := port - 7001 - fmt.Println("kill server ", num) + t.Log("kill server ", num) etcds[num].Kill() etcds[num].Release() diff --git a/tests/functional/standby_test.go b/tests/functional/standby_test.go index cee9a71d9..acc666bbd 100644 --- a/tests/functional/standby_test.go +++ b/tests/functional/standby_test.go @@ -36,7 +36,7 @@ func TestStandby(t *testing.T) { assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 9) - fmt.Println("Reconfigure with a smaller active size") + t.Log("Reconfigure with a smaller active size") resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7, "syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() @@ -50,7 +50,7 @@ func TestStandby(t *testing.T) { assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 7) - fmt.Println("Test the functionality of all servers") + t.Log("Test the functionality of all servers") // Set key. 
time.Sleep(time.Second) if _, err := c.Set("foo", "bar", 0); err != nil { @@ -69,7 +69,7 @@ func TestStandby(t *testing.T) { } } - fmt.Println("Reconfigure with larger active size and wait for join") + t.Log("Reconfigure with larger active size and wait for join") resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":8, "syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() @@ -106,7 +106,7 @@ func TestStandbyAutoJoin(t *testing.T) { assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), 5) - // Reconfigure with a short promote delay (2 second). + // Reconfigure with a short remove delay (2 seconds). resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":4, "removeDelay":2, "syncInterval":1}`)) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() @@ -173,7 +173,7 @@ func TestStandbyGradualChange(t *testing.T) { num++ } - fmt.Println("Reconfigure with active size", num) + t.Log("Reconfigure with active size", num) resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(fmt.Sprintf(`{"activeSize":%d, "syncInterval":1}`, num))) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() @@ -191,7 +191,7 @@ func TestStandbyGradualChange(t *testing.T) { assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), num) - fmt.Println("Test the functionality of all servers") + t.Log("Test the functionality of all servers") // Set key. 
if _, err := c.Set("foo", "bar", 0); err != nil { panic(err) @@ -241,7 +241,7 @@ func TestStandbyDramaticChange(t *testing.T) { num += 6 } - fmt.Println("Reconfigure with active size", num) + t.Log("Reconfigure with active size", num) resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(fmt.Sprintf(`{"activeSize":%d, "syncInterval":1}`, num))) if !assert.Equal(t, resp.StatusCode, 200) { t.FailNow() @@ -259,7 +259,7 @@ func TestStandbyDramaticChange(t *testing.T) { assert.NoError(t, err) assert.Equal(t, len(result.Node.Nodes), num) - fmt.Println("Test the functionality of all servers") + t.Log("Test the functionality of all servers") // Set key. if _, err := c.Set("foo", "bar", 0); err != nil { panic(err)