diff --git a/CHANGELOG-3.3.md b/CHANGELOG-3.3.md index ed8e15963..02198c5bd 100644 --- a/CHANGELOG-3.3.md +++ b/CHANGELOG-3.3.md @@ -6,6 +6,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.3...v3.3.4) and [ ### Metrics, Monitoring +- Add [`etcd_server_is_leader`](https://github.com/coreos/etcd/pull/9587) Prometheus metric. - Fix [`etcd_debugging_server_lease_expired_total`](https://github.com/coreos/etcd/pull/9557) Prometheus metric. - Fix [race conditions in v2 server stat collecting](https://github.com/coreos/etcd/pull/9562). @@ -16,6 +17,23 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.3...v3.3.4) and [ - However, a certificate whose SAN field does [not include any domain names but only IP addresses](https://github.com/coreos/etcd/issues/9541) would request `*tls.ClientHelloInfo` with an empty `ServerName` field, thus failing to trigger the TLS reload on initial TLS handshake; this becomes a problem when expired certificates need to be replaced online. - Now, `(*tls.Config).Certificates` is created empty on initial TLS client handshake, first to trigger `(*tls.Config).GetCertificate`, and then to populate rest of the certificates on every new TLS connection, even when client SNI is empty (e.g. cert only includes IPs). +### Added: `etcd` + +- Add [`--initial-election-tick-advance`](https://github.com/coreos/etcd/pull/9591) flag to configure initial election tick fast-forward. + - By default, `--initial-election-tick-advance=true`, then local member fast-forwards election ticks to speed up "initial" leader election trigger. + - This benefits the case of larger election ticks. For instance, cross datacenter deployment may require longer election timeout of 10-second. If true, local node does not need wait up to 10-second. Instead, forwards its election ticks to 8-second, and have only 2-second left before leader election. + - Major assumptions are that: cluster has no active leader thus advancing ticks enables faster leader election. Or cluster already has an established leader, and rejoining follower is likely to receive heartbeats from the leader after tick advance and before election timeout. + - However, when network from leader to rejoining follower is congested, and the follower does not receive leader heartbeat within left election ticks, disruptive election has to happen thus affecting cluster availabilities. + - Now, this can be disabled by setting `--initial-election-tick-advance=false`. + - Disabling this would slow down initial bootstrap process for cross datacenter deployments. Make tradeoffs by configuring `--initial-election-tick-advance` at the cost of slow initial bootstrap. + - If single-node, it advances ticks regardless. + - Address [disruptive rejoining follower node](https://github.com/coreos/etcd/issues/9333). + +### Added: `embed` + +- Add [`embed.Config.InitialElectionTickAdvance`](https://github.com/coreos/etcd/pull/9591) to enable/disable initial election tick fast-forward. + - `embed.NewConfig()` would return `*embed.Config` with `InitialElectionTickAdvance` as true by default. + ## [v3.3.3](https://github.com/coreos/etcd/releases/tag/v3.3.3) (2018-03-29) diff --git a/CHANGELOG-3.4.md b/CHANGELOG-3.4.md index dc8d0d6ab..26411e7ac 100644 --- a/CHANGELOG-3.4.md +++ b/CHANGELOG-3.4.md @@ -94,6 +94,7 @@ See [code changes](https://github.com/coreos/etcd/compare/v3.3.0...v3.4.0) and [ ### Metrics, Monitoring +- Add [`etcd_server_is_leader`](https://github.com/coreos/etcd/pull/9587) Prometheus metric. - Add [`etcd_debugging_mvcc_db_total_size_in_use_in_bytes`](https://github.com/coreos/etcd/pull/9256) Prometheus metric. - Add missing [`etcd_network_peer_sent_failures_total` count](https://github.com/coreos/etcd/pull/9437). - Fix [`etcd_debugging_server_lease_expired_total`](https://github.com/coreos/etcd/pull/9557) Prometheus metric. @@ -124,6 +125,15 @@ See [security doc](https://github.com/coreos/etcd/blob/master/Documentation/op-g ### Added: `etcd` +- Add [`--initial-election-tick-advance`](https://github.com/coreos/etcd/pull/9591) flag to configure initial election tick fast-forward. + - By default, `--initial-election-tick-advance=true`, then local member fast-forwards election ticks to speed up "initial" leader election trigger. + - This benefits the case of larger election ticks. For instance, cross datacenter deployment may require longer election timeout of 10-second. If true, local node does not need wait up to 10-second. Instead, forwards its election ticks to 8-second, and have only 2-second left before leader election. + - Major assumptions are that: cluster has no active leader thus advancing ticks enables faster leader election. Or cluster already has an established leader, and rejoining follower is likely to receive heartbeats from the leader after tick advance and before election timeout. + - However, when network from leader to rejoining follower is congested, and the follower does not receive leader heartbeat within left election ticks, disruptive election has to happen thus affecting cluster availabilities. + - Now, this can be disabled by setting `--initial-election-tick-advance=false`. + - Disabling this would slow down initial bootstrap process for cross datacenter deployments. Make tradeoffs by configuring `--initial-election-tick-advance` at the cost of slow initial bootstrap. + - If single-node, it advances ticks regardless. + - Address [disruptive rejoining follower node](https://github.com/coreos/etcd/issues/9333). - Add [`--pre-vote`](https://github.com/coreos/etcd/pull/9352) flag to enable to run an additional Raft election phase. - For instance, a flaky(or rejoining) member may drop in and out, and start campaign. This member will end up with a higher term, and ignore all incoming messages with lower term. In this case, a new leader eventually need to get elected, thus disruptive to cluster availability. Raft implements Pre-Vote phase to prevent this kind of disruptions. If enabled, Raft runs an additional phase of election to check if pre-candidate can get enough votes to win an election. - `--pre-vote=false` by default. @@ -155,6 +165,8 @@ See [security doc](https://github.com/coreos/etcd/blob/master/Documentation/op-g ### Added: `embed` +- Add [`embed.Config.InitialElectionTickAdvance`](https://github.com/coreos/etcd/pull/9591) to enable/disable initial election tick fast-forward. + - `embed.NewConfig()` would return `*embed.Config` with `InitialElectionTickAdvance` as true by default. - Add [`embed.Config.Logger`](https://github.com/coreos/etcd/pull/9518) to support [structured logger `zap`](https://github.com/uber-go/zap) in server-side. - Define [`embed.CompactorModePeriodic`](https://godoc.org/github.com/coreos/etcd/embed#pkg-variables) for `compactor.ModePeriodic`. - Define [`embed.CompactorModeRevision`](https://godoc.org/github.com/coreos/etcd/embed#pkg-variables) for `compactor.ModeRevision`. diff --git a/embed/config.go b/embed/config.go index 5b361ab86..28c3a0d78 100644 --- a/embed/config.go +++ b/embed/config.go @@ -121,8 +121,38 @@ type Config struct { // TickMs is the number of milliseconds between heartbeat ticks. // TODO: decouple tickMs and heartbeat tick (current heartbeat tick = 1). // make ticks a cluster wide configuration. - TickMs uint `json:"heartbeat-interval"` - ElectionMs uint `json:"election-timeout"` + TickMs uint `json:"heartbeat-interval"` + ElectionMs uint `json:"election-timeout"` + + // InitialElectionTickAdvance is true, then local member fast-forwards + // election ticks to speed up "initial" leader election trigger. This + // benefits the case of larger election ticks. For instance, cross + // datacenter deployment may require longer election timeout of 10-second. + // If true, local node does not need wait up to 10-second. Instead, + // forwards its election ticks to 8-second, and have only 2-second left + // before leader election. + // + // Major assumptions are that: + // - cluster has no active leader thus advancing ticks enables faster + // leader election, or + // - cluster already has an established leader, and rejoining follower + // is likely to receive heartbeats from the leader after tick advance + // and before election timeout. + // + // However, when network from leader to rejoining follower is congested, + // and the follower does not receive leader heartbeat within left election + // ticks, disruptive election has to happen thus affecting cluster + // availabilities. + // + // Disabling this would slow down initial bootstrap process for cross + // datacenter deployments. Make your own tradeoffs by configuring + // --initial-election-tick-advance at the cost of slow initial bootstrap. + // + // If single-node, it advances ticks regardless. + // + // See https://github.com/coreos/etcd/issues/9333 for more detail. + InitialElectionTickAdvance bool `json:"initial-election-tick-advance"` + QuotaBackendBytes int64 `json:"quota-backend-bytes"` MaxTxnOps uint `json:"max-txn-ops"` MaxRequestBytes uint `json:"max-request-bytes"` @@ -305,8 +335,9 @@ func NewConfig() *Config { GRPCKeepAliveInterval: DefaultGRPCKeepAliveInterval, GRPCKeepAliveTimeout: DefaultGRPCKeepAliveTimeout, - TickMs: 100, - ElectionMs: 1000, + TickMs: 100, + ElectionMs: 1000, + InitialElectionTickAdvance: true, LPUrls: []url.URL{*lpurl}, LCUrls: []url.URL{*lcurl}, diff --git a/embed/etcd.go b/embed/etcd.go index 74033e29a..c0958e49a 100644 --- a/embed/etcd.go +++ b/embed/etcd.go @@ -158,39 +158,40 @@ func StartEtcd(inCfg *Config) (e *Etcd, err error) { } srvcfg := etcdserver.ServerConfig{ - Name: cfg.Name, - ClientURLs: cfg.ACUrls, - PeerURLs: cfg.APUrls, - DataDir: cfg.Dir, - DedicatedWALDir: cfg.WalDir, - SnapCount: cfg.SnapCount, - MaxSnapFiles: cfg.MaxSnapFiles, - MaxWALFiles: cfg.MaxWalFiles, - InitialPeerURLsMap: urlsmap, - InitialClusterToken: token, - DiscoveryURL: cfg.Durl, - DiscoveryProxy: cfg.Dproxy, - NewCluster: cfg.IsNewCluster(), - PeerTLSInfo: cfg.PeerTLSInfo, - TickMs: cfg.TickMs, - ElectionTicks: cfg.ElectionTicks(), - AutoCompactionRetention: autoCompactionRetention, - AutoCompactionMode: cfg.AutoCompactionMode, - QuotaBackendBytes: cfg.QuotaBackendBytes, - MaxTxnOps: cfg.MaxTxnOps, - MaxRequestBytes: cfg.MaxRequestBytes, - StrictReconfigCheck: cfg.StrictReconfigCheck, - ClientCertAuthEnabled: cfg.ClientTLSInfo.ClientCertAuth, - AuthToken: cfg.AuthToken, - CORS: cfg.CORS, - HostWhitelist: cfg.HostWhitelist, - InitialCorruptCheck: cfg.ExperimentalInitialCorruptCheck, - CorruptCheckTime: cfg.ExperimentalCorruptCheckTime, - PreVote: cfg.PreVote, - Logger: cfg.logger, - LoggerConfig: cfg.loggerConfig, - Debug: cfg.Debug, - ForceNewCluster: cfg.ForceNewCluster, + Name: cfg.Name, + ClientURLs: cfg.ACUrls, + PeerURLs: cfg.APUrls, + DataDir: cfg.Dir, + DedicatedWALDir: cfg.WalDir, + SnapCount: cfg.SnapCount, + MaxSnapFiles: cfg.MaxSnapFiles, + MaxWALFiles: cfg.MaxWalFiles, + InitialPeerURLsMap: urlsmap, + InitialClusterToken: token, + DiscoveryURL: cfg.Durl, + DiscoveryProxy: cfg.Dproxy, + NewCluster: cfg.IsNewCluster(), + PeerTLSInfo: cfg.PeerTLSInfo, + TickMs: cfg.TickMs, + ElectionTicks: cfg.ElectionTicks(), + InitialElectionTickAdvance: cfg.InitialElectionTickAdvance, + AutoCompactionRetention: autoCompactionRetention, + AutoCompactionMode: cfg.AutoCompactionMode, + QuotaBackendBytes: cfg.QuotaBackendBytes, + MaxTxnOps: cfg.MaxTxnOps, + MaxRequestBytes: cfg.MaxRequestBytes, + StrictReconfigCheck: cfg.StrictReconfigCheck, + ClientCertAuthEnabled: cfg.ClientTLSInfo.ClientCertAuth, + AuthToken: cfg.AuthToken, + CORS: cfg.CORS, + HostWhitelist: cfg.HostWhitelist, + InitialCorruptCheck: cfg.ExperimentalInitialCorruptCheck, + CorruptCheckTime: cfg.ExperimentalCorruptCheckTime, + PreVote: cfg.PreVote, + Logger: cfg.logger, + LoggerConfig: cfg.loggerConfig, + Debug: cfg.Debug, + ForceNewCluster: cfg.ForceNewCluster, } if e.Server, err = etcdserver.NewServer(srvcfg); err != nil { return e, err diff --git a/etcdmain/config.go b/etcdmain/config.go index c4a6d0a48..b884a1113 100644 --- a/etcdmain/config.go +++ b/etcdmain/config.go @@ -153,6 +153,7 @@ func newConfig() *config { fs.Uint64Var(&cfg.ec.SnapCount, "snapshot-count", cfg.ec.SnapCount, "Number of committed transactions to trigger a snapshot to disk.") fs.UintVar(&cfg.ec.TickMs, "heartbeat-interval", cfg.ec.TickMs, "Time (in milliseconds) of a heartbeat interval.") fs.UintVar(&cfg.ec.ElectionMs, "election-timeout", cfg.ec.ElectionMs, "Time (in milliseconds) for an election to timeout.") + fs.BoolVar(&cfg.ec.InitialElectionTickAdvance, "initial-election-tick-advance", cfg.ec.InitialElectionTickAdvance, "Whether to fast-forward initial election ticks on boot for faster election.") fs.Int64Var(&cfg.ec.QuotaBackendBytes, "quota-backend-bytes", cfg.ec.QuotaBackendBytes, "Raise alarms when backend size exceeds the given quota. 0 means use the default quota.") fs.UintVar(&cfg.ec.MaxTxnOps, "max-txn-ops", cfg.ec.MaxTxnOps, "Maximum number of operations permitted in a transaction.") fs.UintVar(&cfg.ec.MaxRequestBytes, "max-request-bytes", cfg.ec.MaxRequestBytes, "Maximum client request size in bytes the server will accept.") diff --git a/etcdmain/help.go b/etcdmain/help.go index 72e50043a..55334f3fb 100644 --- a/etcdmain/help.go +++ b/etcdmain/help.go @@ -55,6 +55,8 @@ Member: Time (in milliseconds) of a heartbeat interval. --election-timeout '1000' Time (in milliseconds) for an election to timeout. See tuning documentation for details. + --initial-election-tick-advance 'true' + Whether to fast-forward initial election ticks on boot for faster election. --listen-peer-urls 'http://localhost:2380' List of URLs to listen on for peer traffic. --listen-client-urls 'http://localhost:2379' diff --git a/etcdserver/config.go b/etcdserver/config.go index 19eef8aaa..4ea8e128b 100644 --- a/etcdserver/config.go +++ b/etcdserver/config.go @@ -55,8 +55,38 @@ type ServerConfig struct { // whose Host header value exists in this white list. HostWhitelist map[string]struct{} - TickMs uint - ElectionTicks int + TickMs uint + ElectionTicks int + + // InitialElectionTickAdvance is true, then local member fast-forwards + // election ticks to speed up "initial" leader election trigger. This + // benefits the case of larger election ticks. For instance, cross + // datacenter deployment may require longer election timeout of 10-second. + // If true, local node does not need wait up to 10-second. Instead, + // forwards its election ticks to 8-second, and have only 2-second left + // before leader election. + // + // Major assumptions are that: + // - cluster has no active leader thus advancing ticks enables faster + // leader election, or + // - cluster already has an established leader, and rejoining follower + // is likely to receive heartbeats from the leader after tick advance + // and before election timeout. + // + // However, when network from leader to rejoining follower is congested, + // and the follower does not receive leader heartbeat within left election + // ticks, disruptive election has to happen thus affecting cluster + // availabilities. + // + // Disabling this would slow down initial bootstrap process for cross + // datacenter deployments. Make your own tradeoffs by configuring + // --initial-election-tick-advance at the cost of slow initial bootstrap. + // + // If single-node, it advances ticks regardless. + // + // See https://github.com/coreos/etcd/issues/9333 for more detail. + InitialElectionTickAdvance bool + BootstrapTimeout time.Duration AutoCompactionRetention time.Duration @@ -263,6 +293,7 @@ func (c *ServerConfig) print(initial bool) { zap.String("heartbeat-interval", fmt.Sprintf("%v", time.Duration(c.TickMs)*time.Millisecond)), zap.Int("election-tick-ms", c.ElectionTicks), zap.String("election-timeout", fmt.Sprintf("%v", time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond)), + zap.Bool("initial-election-tick-advance", c.InitialElectionTickAdvance), zap.Uint64("snapshot-count", c.SnapCount), zap.Strings("advertise-client-urls", c.getACURLs()), zap.Strings("initial-advertise-peer-urls", c.getAPURLs()), diff --git a/etcdserver/server.go b/etcdserver/server.go index 0b259363d..987eeb7f7 100644 --- a/etcdserver/server.go +++ b/etcdserver/server.go @@ -635,6 +635,16 @@ func (s *EtcdServer) adjustTicks() { return } + if !s.Cfg.InitialElectionTickAdvance { + if lg != nil { + lg.Info("skipping initial election tick advance", zap.Int("election-ticks", s.Cfg.ElectionTicks)) + } + return + } + if lg != nil { + lg.Info("starting initial election tick advance", zap.Int("election-ticks", s.Cfg.ElectionTicks)) + } + // retry up to "rafthttp.ConnReadTimeout", which is 5-sec // until peer connection reports; otherwise: // 1. all connections failed, or diff --git a/integration/cluster.go b/integration/cluster.go index 38b140eb0..e872e5521 100644 --- a/integration/cluster.go +++ b/integration/cluster.go @@ -593,6 +593,7 @@ func mustNewMember(t *testing.T, mcfg memberConfig) *member { m.ServerConfig.PeerTLSInfo = *m.PeerTLSInfo } m.ElectionTicks = electionTicks + m.InitialElectionTickAdvance = true m.TickMs = uint(tickDuration / time.Millisecond) m.QuotaBackendBytes = mcfg.quotaBackendBytes m.MaxTxnOps = mcfg.maxTxnOps