mirror of
https://github.com/etcd-io/etcd.git
synced 2024-09-27 06:25:44 +00:00
chore(standby): minor changes based on comments
This commit is contained in:
@@ -11,13 +11,13 @@ Standbys also act as standby nodes in the event that a peer node in the cluster
|
||||
|
||||
## Configuration Parameters
|
||||
|
||||
There are three configuration parameters used by standbys: active size, promotion delay and standby sync interval.
|
||||
There are three configuration parameters used by standbys: active size, remove delay and standby sync interval.
|
||||
|
||||
The active size specifies a target size for the number of peers in the cluster.
|
||||
If there are not enough peers to meet the active size, then standbys will send join requests until the peer count is equal to the active size.
|
||||
If there are more peers than the target active size then peers are demoted to standbys by the leader.
|
||||
If there are more peers than the target active size then peers are removed by the leader and will become standbys.
|
||||
|
||||
The promotion delay specifies how long the cluster should wait before removing a dead peer.
|
||||
The remove delay specifies how long the cluster should wait before removing a dead peer.
|
||||
By default this is 30 minutes.
|
||||
If a peer is inactive for 30 minutes then the peer is removed.
|
||||
|
||||
@@ -169,7 +169,7 @@ Loop:
|
||||
Sleep for some time
|
||||
|
||||
For each peer:
|
||||
If peer last activity time > promote delay:
|
||||
If peer last activity time > remove delay:
|
||||
Remove the peer
|
||||
Goto Loop
|
||||
```
|
||||
@@ -200,7 +200,7 @@ Machines in standby mode always sync the cluster. If sync fails, it uses the fir
|
||||
|
||||
The leader of the cluster loses its connection with the peer.
|
||||
|
||||
When the time exceeds promotion delay, it removes the peer from the cluster.
|
||||
When the time exceeds remove delay, it removes the peer from the cluster.
|
||||
|
||||
A machine in standby mode finds an available place in the cluster. It sends a join request and joins the cluster.
|
||||
|
||||
@@ -224,7 +224,7 @@ No change for the cluster.
|
||||
|
||||
## Future Attack Plans
|
||||
|
||||
1. Based on heartbeat miss and promotion delay, standby could adjust its next check time.
|
||||
1. Based on heartbeat miss and remove delay, standby could adjust its next check time.
|
||||
|
||||
2. Preregister the promotion target when heartbeat miss happens.
|
||||
|
||||
|
||||
@@ -237,11 +237,11 @@ func (e *Etcd) Run() {
|
||||
peerTLSConfig := server.TLSServerConfig(e.Config.PeerTLSInfo())
|
||||
etcdTLSConfig := server.TLSServerConfig(e.Config.EtcdTLSInfo())
|
||||
|
||||
toStartPeerServer, possiblePeers, err := e.PeerServer.FindCluster(e.Config.Discovery, e.Config.Peers)
|
||||
startPeerServer, possiblePeers, err := e.PeerServer.FindCluster(e.Config.Discovery, e.Config.Peers)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
if toStartPeerServer {
|
||||
if startPeerServer {
|
||||
e.setMode(PeerMode)
|
||||
} else {
|
||||
e.StandbyServer.SyncCluster(possiblePeers)
|
||||
|
||||
@@ -11,10 +11,10 @@ const (
|
||||
// MinActiveSize is the minimum active size allowed.
|
||||
MinActiveSize = 3
|
||||
|
||||
// DefaultRemoveDelay is the default elapsed time before promotion.
|
||||
// DefaultRemoveDelay is the default elapsed time before removal.
|
||||
DefaultRemoveDelay = int((30 * time.Minute) / time.Second)
|
||||
|
||||
// MinRemoveDelay is the minimum promote delay allowed.
|
||||
// MinRemoveDelay is the minimum remove delay allowed.
|
||||
MinRemoveDelay = int((2 * time.Second) / time.Second)
|
||||
|
||||
// DefaultSyncInterval is the default interval for cluster sync.
|
||||
|
||||
@@ -372,8 +372,8 @@ func (s *PeerServer) ClusterConfig() *ClusterConfig {
|
||||
}
|
||||
|
||||
// SetClusterConfig updates the current cluster configuration.
|
||||
// Adjusting the active size will cause the PeerServer to demote peers or
|
||||
// promote standbys to match the new size.
|
||||
// Adjusting the active size will cause the cluster to add or remove machines
|
||||
// to match the new size.
|
||||
func (s *PeerServer) SetClusterConfig(c *ClusterConfig) {
|
||||
// Set minimums.
|
||||
if c.ActiveSize < MinActiveSize {
|
||||
@@ -820,7 +820,7 @@ func (s *PeerServer) monitorPeerActivity() {
|
||||
removeDelay := time.Duration(s.ClusterConfig().RemoveDelay) * time.Second
|
||||
peers := s.raftServer.Peers()
|
||||
for _, peer := range peers {
|
||||
// If the last response from the peer is longer than the promote delay
|
||||
// If the last response from the peer is longer than the remove delay
|
||||
// then automatically remove the peer.
|
||||
if !peer.LastActivity().IsZero() && now.Sub(peer.LastActivity()) > removeDelay {
|
||||
log.Infof("%s: removing node: %v; last activity %v ago", s.Config.Name, peer.Name, now.Sub(peer.LastActivity()))
|
||||
|
||||
@@ -221,7 +221,6 @@ func (ps *PeerServer) setClusterConfigHttpHandler(w http.ResponseWriter, req *ht
|
||||
}
|
||||
|
||||
// Retrieves a list of peers and standbys.
|
||||
// If leader exists, it is at the first place.
|
||||
func (ps *PeerServer) getMachinesHttpHandler(w http.ResponseWriter, req *http.Request) {
|
||||
machines := make([]*machineMessage, 0)
|
||||
leader := ps.raftServer.Leader()
|
||||
|
||||
@@ -114,10 +114,10 @@ func TestKillLeaderWithStandbys(t *testing.T) {
|
||||
leader := "http://127.0.0.1:7001"
|
||||
|
||||
for i := 0; i < clusterSize; i++ {
|
||||
fmt.Println("leader is ", leader)
|
||||
t.Log("leader is ", leader)
|
||||
port, _ := strconv.Atoi(strings.Split(leader, ":")[2])
|
||||
num := port - 7001
|
||||
fmt.Println("kill server ", num)
|
||||
t.Log("kill server ", num)
|
||||
etcds[num].Kill()
|
||||
etcds[num].Release()
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ func TestStandby(t *testing.T) {
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, len(result.Node.Nodes), 9)
|
||||
|
||||
fmt.Println("Reconfigure with a smaller active size")
|
||||
t.Log("Reconfigure with a smaller active size")
|
||||
resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":7, "syncInterval":1}`))
|
||||
if !assert.Equal(t, resp.StatusCode, 200) {
|
||||
t.FailNow()
|
||||
@@ -50,7 +50,7 @@ func TestStandby(t *testing.T) {
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, len(result.Node.Nodes), 7)
|
||||
|
||||
fmt.Println("Test the functionality of all servers")
|
||||
t.Log("Test the functionality of all servers")
|
||||
// Set key.
|
||||
time.Sleep(time.Second)
|
||||
if _, err := c.Set("foo", "bar", 0); err != nil {
|
||||
@@ -69,7 +69,7 @@ func TestStandby(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Println("Reconfigure with larger active size and wait for join")
|
||||
t.Log("Reconfigure with larger active size and wait for join")
|
||||
resp, _ = tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":8, "syncInterval":1}`))
|
||||
if !assert.Equal(t, resp.StatusCode, 200) {
|
||||
t.FailNow()
|
||||
@@ -106,7 +106,7 @@ func TestStandbyAutoJoin(t *testing.T) {
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, len(result.Node.Nodes), 5)
|
||||
|
||||
// Reconfigure with a short promote delay (2 second).
|
||||
// Reconfigure with a short remove delay (2 seconds).
|
||||
resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(`{"activeSize":4, "removeDelay":2, "syncInterval":1}`))
|
||||
if !assert.Equal(t, resp.StatusCode, 200) {
|
||||
t.FailNow()
|
||||
@@ -173,7 +173,7 @@ func TestStandbyGradualChange(t *testing.T) {
|
||||
num++
|
||||
}
|
||||
|
||||
fmt.Println("Reconfigure with active size", num)
|
||||
t.Log("Reconfigure with active size", num)
|
||||
resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(fmt.Sprintf(`{"activeSize":%d, "syncInterval":1}`, num)))
|
||||
if !assert.Equal(t, resp.StatusCode, 200) {
|
||||
t.FailNow()
|
||||
@@ -191,7 +191,7 @@ func TestStandbyGradualChange(t *testing.T) {
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, len(result.Node.Nodes), num)
|
||||
|
||||
fmt.Println("Test the functionality of all servers")
|
||||
t.Log("Test the functionality of all servers")
|
||||
// Set key.
|
||||
if _, err := c.Set("foo", "bar", 0); err != nil {
|
||||
panic(err)
|
||||
@@ -241,7 +241,7 @@ func TestStandbyDramaticChange(t *testing.T) {
|
||||
num += 6
|
||||
}
|
||||
|
||||
fmt.Println("Reconfigure with active size", num)
|
||||
t.Log("Reconfigure with active size", num)
|
||||
resp, _ := tests.Put("http://localhost:7001/v2/admin/config", "application/json", bytes.NewBufferString(fmt.Sprintf(`{"activeSize":%d, "syncInterval":1}`, num)))
|
||||
if !assert.Equal(t, resp.StatusCode, 200) {
|
||||
t.FailNow()
|
||||
@@ -259,7 +259,7 @@ func TestStandbyDramaticChange(t *testing.T) {
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, len(result.Node.Nodes), num)
|
||||
|
||||
fmt.Println("Test the functionality of all servers")
|
||||
t.Log("Test the functionality of all servers")
|
||||
// Set key.
|
||||
if _, err := c.Set("foo", "bar", 0); err != nil {
|
||||
panic(err)
|
||||
|
||||
Reference in New Issue
Block a user