chore(standby): minor changes based on comments

This commit is contained in:
Yicheng Qin
2014-05-09 15:38:03 -07:00
parent c6b1a738c3
commit 5367c1c998
7 changed files with 23 additions and 24 deletions

View File

@@ -11,13 +11,13 @@ Standbys also act as standby nodes in the event that a peer node in the cluster
## Configuration Parameters
There are three configuration parameters used by standbys: active size, promotion delay and standby sync interval.
There are three configuration parameters used by standbys: active size, remove delay and standby sync interval.
The active size specifies a target size for the number of peers in the cluster.
If there are not enough peers to meet the active size, then standbys will send join requests until the peer count is equal to the active size.
If there are more peers than the target active size then peers are demoted to standbys by the leader.
If there are more peers than the target active size then peers are removed by the leader and will become standbys.
The promotion delay specifies how long the cluster should wait before removing a dead peer.
The remove delay specifies how long the cluster should wait before removing a dead peer.
By default this is 30 minutes.
If a peer is inactive for 30 minutes then the peer is removed.
@@ -169,7 +169,7 @@ Loop:
Sleep for some time
For each peer:
If peer last activity time > promote delay:
If peer last activity time > remove delay:
Remove the peer
Goto Loop
```
@@ -200,7 +200,7 @@ Machines in standby mode always sync the cluster. If sync fails, it uses the fir
The leader of the cluster loses the connection with the peer.
When the time exceeds promotion delay, it removes the peer from the cluster.
When the time exceeds remove delay, it removes the peer from the cluster.
A machine in standby mode finds an available place in the cluster. It sends a join request and joins the cluster.
@@ -224,7 +224,7 @@ No change for the cluster.
## Future Attack Plans
1. Based on heartbeat miss and promotion delay, standby could adjust its next check time.
1. Based on heartbeat miss and remove delay, standby could adjust its next check time.
2. Preregister the promotion target when heartbeat miss happens.

View File

@@ -237,11 +237,11 @@ func (e *Etcd) Run() {
peerTLSConfig := server.TLSServerConfig(e.Config.PeerTLSInfo())
etcdTLSConfig := server.TLSServerConfig(e.Config.EtcdTLSInfo())
toStartPeerServer, possiblePeers, err := e.PeerServer.FindCluster(e.Config.Discovery, e.Config.Peers)
startPeerServer, possiblePeers, err := e.PeerServer.FindCluster(e.Config.Discovery, e.Config.Peers)
if err != nil {
log.Fatal(err)
}
if toStartPeerServer {
if startPeerServer {
e.setMode(PeerMode)
} else {
e.StandbyServer.SyncCluster(possiblePeers)

View File

@@ -11,10 +11,10 @@ const (
// MinActiveSize is the minimum active size allowed.
MinActiveSize = 3
// DefaultRemoveDelay is the default elapsed time before promotion.
// DefaultRemoveDelay is the default elapsed time before removal.
DefaultRemoveDelay = int((30 * time.Minute) / time.Second)
// MinRemoveDelay is the minimum promote delay allowed.
// MinRemoveDelay is the minimum remove delay allowed.
MinRemoveDelay = int((2 * time.Second) / time.Second)
// DefaultSyncInterval is the default interval for cluster sync.

View File

@@ -372,8 +372,8 @@ func (s *PeerServer) ClusterConfig() *ClusterConfig {
}
// SetClusterConfig updates the current cluster configuration.
// Adjusting the active size will cause the PeerServer to demote peers or
// promote standbys to match the new size.
// Adjusting the active size will cause cluster to add or remove machines
// to match the new size.
func (s *PeerServer) SetClusterConfig(c *ClusterConfig) {
// Set minimums.
if c.ActiveSize < MinActiveSize {
@@ -820,7 +820,7 @@ func (s *PeerServer) monitorPeerActivity() {
removeDelay := time.Duration(s.ClusterConfig().RemoveDelay) * time.Second
peers := s.raftServer.Peers()
for _, peer := range peers {
// If the last response from the peer is longer than the promote delay
// If the last response from the peer is longer than the remove delay
// then automatically demote the peer.
if !peer.LastActivity().IsZero() && now.Sub(peer.LastActivity()) > removeDelay {
log.Infof("%s: removing node: %v; last activity %v ago", s.Config.Name, peer.Name, now.Sub(peer.LastActivity()))

View File

@@ -221,7 +221,6 @@ func (ps *PeerServer) setClusterConfigHttpHandler(w http.ResponseWriter, req *ht
}
// Retrieves a list of peers and standbys.
// If leader exists, it is at the first place.
func (ps *PeerServer) getMachinesHttpHandler(w http.ResponseWriter, req *http.Request) {
machines := make([]*machineMessage, 0)
leader := ps.raftServer.Leader()

View File

@@ -114,10 +114,10 @@ func TestKillLeaderWithStandbys(t *testing.T) {
leader := "http://127.0.0.1:7001"
for i := 0; i < clusterSize; i++ {
fmt.Println("leader is ", leader)
t.Log("leader is ", leader)
port, _ := strconv.Atoi(strings.Split(leader, ":")[2])
num := port - 7001
fmt.Println("kill server ", num)
t.Log("kill server ", num)
etcds[num].Kill()
etcds[num].Release()

View File

@@ -36,7 +36,7 @@ func TestStandby(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, len(result.Node.Nodes), 9)
fmt.Println("Reconfigure with a smaller active size")
t.Log("Reconfigure with a smaller active size")
resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7, "syncInterval":1}`))
if !assert.Equal(t, resp.StatusCode, 200) {
t.FailNow()
@@ -50,7 +50,7 @@ func TestStandby(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, len(result.Node.Nodes), 7)
fmt.Println("Test the functionality of all servers")
t.Log("Test the functionality of all servers")
// Set key.
time.Sleep(time.Second)
if _, err := c.Set("foo", "bar", 0); err != nil {
@@ -69,7 +69,7 @@ func TestStandby(t *testing.T) {
}
}
fmt.Println("Reconfigure with larger active size and wait for join")
t.Log("Reconfigure with larger active size and wait for join")
resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":8, "syncInterval":1}`))
if !assert.Equal(t, resp.StatusCode, 200) {
t.FailNow()
@@ -106,7 +106,7 @@ func TestStandbyAutoJoin(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, len(result.Node.Nodes), 5)
// Reconfigure with a short promote delay (2 second).
// Reconfigure with a short remove delay (2 second).
resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":4, "removeDelay":2, "syncInterval":1}`))
if !assert.Equal(t, resp.StatusCode, 200) {
t.FailNow()
@@ -173,7 +173,7 @@ func TestStandbyGradualChange(t *testing.T) {
num++
}
fmt.Println("Reconfigure with active size", num)
t.Log("Reconfigure with active size", num)
resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(fmt.Sprintf(`{"activeSize":%d, "syncInterval":1}`, num)))
if !assert.Equal(t, resp.StatusCode, 200) {
t.FailNow()
@@ -191,7 +191,7 @@ func TestStandbyGradualChange(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, len(result.Node.Nodes), num)
fmt.Println("Test the functionality of all servers")
t.Log("Test the functionality of all servers")
// Set key.
if _, err := c.Set("foo", "bar", 0); err != nil {
panic(err)
@@ -241,7 +241,7 @@ func TestStandbyDramaticChange(t *testing.T) {
num += 6
}
fmt.Println("Reconfigure with active size", num)
t.Log("Reconfigure with active size", num)
resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(fmt.Sprintf(`{"activeSize":%d, "syncInterval":1}`, num)))
if !assert.Equal(t, resp.StatusCode, 200) {
t.FailNow()
@@ -259,7 +259,7 @@ func TestStandbyDramaticChange(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, len(result.Node.Nodes), num)
fmt.Println("Test the functionality of all servers")
t.Log("Test the functionality of all servers")
// Set key.
if _, err := c.Set("foo", "bar", 0); err != nil {
panic(err)