etcd/etcdserver/config.go
Marek Siarkowicz 8f4735dfd4 server: Require either cluster version v3.6 or --experimental-enable-lease-checkpoint-persist to persist lease remainingTTL
To avoid inconsistent behavior during cluster upgrades, we are
feature-gating persistence behind the cluster version. This should
ensure that all cluster members are upgraded to v3.6 before the
behavior changes.

To allow backporting this fix to v3.5, we also introduce the
--experimental-enable-lease-checkpoint-persist flag, which allows a
smooth upgrade of v3.5 clusters that have this feature enabled.

Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>
2022-07-22 10:28:29 +02:00

// Copyright 2015 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package etcdserver

import (
	"context"
	"fmt"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"go.etcd.io/etcd/pkg/netutil"
	"go.etcd.io/etcd/pkg/transport"
	"go.etcd.io/etcd/pkg/types"

	bolt "go.etcd.io/bbolt"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

// ServerConfig holds the configuration of etcd as taken from the command line or discovery.
type ServerConfig struct {
	Name           string
	DiscoveryURL   string
	DiscoveryProxy string
	ClientURLs     types.URLs
	PeerURLs       types.URLs
	DataDir        string
	// DedicatedWALDir makes etcd write the WAL to WALDir
	// rather than dataDir/member/wal.
	DedicatedWALDir string

	SnapshotCount uint64

	// SnapshotCatchUpEntries is the number of entries kept for a slow follower
	// to catch up after the raft storage entries are compacted.
	// We expect the follower to have millisecond-level latency to the leader.
	// The max throughput is around 10K, so keeping 5K entries is enough to
	// help a follower catch up.
	// WARNING: only change this for tests. Always use "DefaultSnapshotCatchUpEntries".
	SnapshotCatchUpEntries uint64

	MaxSnapFiles uint
	MaxWALFiles  uint

	// BackendBatchInterval is the maximum time before committing the backend transaction.
	BackendBatchInterval time.Duration
	// BackendBatchLimit is the maximum number of operations before committing the backend transaction.
	BackendBatchLimit int
	// BackendFreelistType is the type of the backend boltdb freelist.
	BackendFreelistType bolt.FreelistType

	InitialPeerURLsMap  types.URLsMap
	InitialClusterToken string
	NewCluster          bool
	PeerTLSInfo         transport.TLSInfo

	CORS map[string]struct{}

	// HostWhitelist lists acceptable hostnames from client requests.
	// If the server is insecure (no TLS), it only accepts requests
	// whose Host header value exists in this whitelist.
	HostWhitelist map[string]struct{}

	TickMs        uint
	ElectionTicks int

	// InitialElectionTickAdvance, when true, makes the local member
	// fast-forward its election ticks to speed up the "initial" leader
	// election trigger. This benefits the case of larger election ticks.
	// For instance, a cross-datacenter deployment may require an election
	// timeout as long as 10 seconds. If true, the local node does not need
	// to wait up to 10 seconds; instead, it forwards its election ticks to
	// 8 seconds, leaving only 2 seconds before leader election.
	//
	// The major assumptions are that:
	//  - the cluster has no active leader, so advancing ticks enables
	//    faster leader election; or
	//  - the cluster already has an established leader, and the rejoining
	//    follower is likely to receive heartbeats from the leader after the
	//    tick advance and before the election timeout.
	//
	// However, when the network from the leader to a rejoining follower is
	// congested and the follower does not receive a leader heartbeat within
	// the remaining election ticks, a disruptive election has to happen,
	// affecting cluster availability.
	//
	// Disabling this would slow down the initial bootstrap process for
	// cross-datacenter deployments. Make your own tradeoffs by configuring
	// --initial-election-tick-advance, at the cost of slow initial bootstrap.
	//
	// If single-node, it advances ticks regardless.
	//
	// See https://github.com/etcd-io/etcd/issues/9333 for more detail.
	InitialElectionTickAdvance bool

	BootstrapTimeout time.Duration

	AutoCompactionRetention time.Duration
	AutoCompactionMode      string
	CompactionBatchLimit    int
	QuotaBackendBytes       int64
	MaxTxnOps               uint

	// MaxRequestBytes is the maximum request size to send over raft.
	MaxRequestBytes uint

	// MaxConcurrentStreams specifies the maximum number of concurrent
	// streams that each client can open at a time.
	MaxConcurrentStreams uint32

	WarningApplyDuration time.Duration

	StrictReconfigCheck bool

	// ClientCertAuthEnabled is true when the client cert has been signed by the client CA.
	ClientCertAuthEnabled bool

	AuthToken  string
	BcryptCost uint
	TokenTTL   uint

	// InitialCorruptCheck is true to check data corruption on boot
	// before serving any peer/client traffic.
	InitialCorruptCheck bool
	CorruptCheckTime    time.Duration

	// PreVote is true to enable Raft Pre-Vote.
	PreVote bool

	// Logger logs server-side operations.
	// If not nil, it disables "capnslog" and uses the given logger.
	Logger *zap.Logger
	// LoggerConfig is the server logger configuration for the Raft logger.
	// Must be either: "LoggerConfig != nil" or "LoggerCore != nil && LoggerWriteSyncer != nil".
	LoggerConfig *zap.Config
	// LoggerCore is the "zapcore.Core" for the Raft logger.
	// Must be either: "LoggerConfig != nil" or "LoggerCore != nil && LoggerWriteSyncer != nil".
	LoggerCore        zapcore.Core
	LoggerWriteSyncer zapcore.WriteSyncer

	Debug bool

	ForceNewCluster bool

	// EnableLeaseCheckpoint enables the leader to send regular checkpoints to
	// other members to prevent the remaining TTL from being reset on leader change.
	EnableLeaseCheckpoint bool
	// LeaseCheckpointInterval is the wait duration between lease checkpoints.
	LeaseCheckpointInterval time.Duration
	// LeaseCheckpointPersist enables persisting the remainingTTL to prevent
	// indefinite auto-renewal of long-lived leases. Always enabled in v3.6.
	// Should be used to ensure a smooth upgrade from v3.5 clusters with this
	// feature enabled.
	LeaseCheckpointPersist bool
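
	// A minimal sketch (hypothetical values; the interval shown is not a
	// documented default): enabling checkpoints together with persistence,
	// the combination the --experimental-enable-lease-checkpoint-persist
	// flag exists to support on v3.5:
	//
	//	cfg.EnableLeaseCheckpoint = true
	//	cfg.LeaseCheckpointInterval = 5 * time.Minute
	//	cfg.LeaseCheckpointPersist = true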

	EnableGRPCGateway bool

	WatchProgressNotifyInterval time.Duration

	// UnsafeNoFsync disables all uses of fsync.
	// Setting this is unsafe and will cause data loss.
	UnsafeNoFsync bool `json:"unsafe-no-fsync"`
}

// VerifyBootstrap sanity-checks the initial config for the bootstrap case
// and returns an error for things that should never happen.
func (c *ServerConfig) VerifyBootstrap() error {
	if err := c.hasLocalMember(); err != nil {
		return err
	}
	if err := c.advertiseMatchesCluster(); err != nil {
		return err
	}
	if checkDuplicateURL(c.InitialPeerURLsMap) {
		return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap)
	}
	if c.InitialPeerURLsMap.String() == "" && c.DiscoveryURL == "" {
		return fmt.Errorf("initial cluster unset and no discovery URL found")
	}
	return nil
}
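
// A minimal sketch (hypothetical URLs): a single-member bootstrap config that
// passes VerifyBootstrap, assuming the advertised peer URL matches the
// initial cluster entry exactly:
//
//	urlsmap, _ := types.NewURLsMap("infra0=http://10.0.1.10:2380")
//	peerURLs, _ := types.NewURLs([]string{"http://10.0.1.10:2380"})
//	cfg := &ServerConfig{
//		Name:               "infra0",
//		InitialPeerURLsMap: urlsmap,
//		PeerURLs:           peerURLs,
//		Logger:             zap.NewExample(),
//	}
//	err := cfg.VerifyBootstrap() // nil for this configuration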

// VerifyJoinExisting sanity-checks the initial config for the case of joining
// an existing cluster, and returns an error for things that should never happen.
func (c *ServerConfig) VerifyJoinExisting() error {
	// The member has announced its peer urls to the cluster before starting; no need to
	// set the configuration again.
	if err := c.hasLocalMember(); err != nil {
		return err
	}
	if checkDuplicateURL(c.InitialPeerURLsMap) {
		return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap)
	}
	if c.DiscoveryURL != "" {
		return fmt.Errorf("discovery URL should not be set when joining existing initial cluster")
	}
	return nil
}

// hasLocalMember checks that the cluster at least contains the local server.
func (c *ServerConfig) hasLocalMember() error {
	if urls := c.InitialPeerURLsMap[c.Name]; urls == nil {
		return fmt.Errorf("couldn't find local name %q in the initial cluster configuration", c.Name)
	}
	return nil
}

// advertiseMatchesCluster confirms peer URLs match those in the cluster peer list.
func (c *ServerConfig) advertiseMatchesCluster() error {
	urls, apurls := c.InitialPeerURLsMap[c.Name], c.PeerURLs.StringSlice()
	urls.Sort()
	sort.Strings(apurls)

	ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
	defer cancel()
	ok, err := netutil.URLStringsEqual(ctx, c.Logger, apurls, urls.StringSlice())
	if ok {
		return nil
	}

	initMap, apMap := make(map[string]struct{}), make(map[string]struct{})
	for _, url := range c.PeerURLs {
		apMap[url.String()] = struct{}{}
	}
	for _, url := range c.InitialPeerURLsMap[c.Name] {
		initMap[url.String()] = struct{}{}
	}

	missing := []string{}
	for url := range initMap {
		if _, ok := apMap[url]; !ok {
			missing = append(missing, url)
		}
	}
	if len(missing) > 0 {
		for i := range missing {
			missing[i] = c.Name + "=" + missing[i]
		}
		mstr := strings.Join(missing, ",")
		apStr := strings.Join(apurls, ",")
		return fmt.Errorf("--initial-cluster has %s but missing from --initial-advertise-peer-urls=%s (%v)", mstr, apStr, err)
	}

	for url := range apMap {
		if _, ok := initMap[url]; !ok {
			missing = append(missing, url)
		}
	}
	if len(missing) > 0 {
		mstr := strings.Join(missing, ",")
		umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs})
		return fmt.Errorf("--initial-advertise-peer-urls has %s but missing from --initial-cluster=%s", mstr, umap.String())
	}

	// resolved URLs from "--initial-advertise-peer-urls" and "--initial-cluster" did not match or failed
	apStr := strings.Join(apurls, ",")
	umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs})
	return fmt.Errorf("failed to resolve %s to match --initial-cluster=%s (%v)", apStr, umap.String(), err)
}
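
// Example (hypothetical values): advertising a peer URL that is absent from
// the member's --initial-cluster entry fails with an error along the lines of:
//
//	--initial-cluster has infra0=http://10.0.1.10:2380 but missing from
//	--initial-advertise-peer-urls=http://10.0.1.99:2380 (...)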

func (c *ServerConfig) MemberDir() string { return filepath.Join(c.DataDir, "member") }

func (c *ServerConfig) WALDir() string {
	if c.DedicatedWALDir != "" {
		return c.DedicatedWALDir
	}
	return filepath.Join(c.MemberDir(), "wal")
}

func (c *ServerConfig) SnapDir() string { return filepath.Join(c.MemberDir(), "snap") }
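
// Example (hypothetical DataDir): with DataDir "/var/lib/etcd" and no
// DedicatedWALDir, the derived on-disk layout is:
//
//	MemberDir()   -> /var/lib/etcd/member
//	WALDir()      -> /var/lib/etcd/member/wal
//	SnapDir()     -> /var/lib/etcd/member/snap
//	backendPath() -> /var/lib/etcd/member/snap/db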

func (c *ServerConfig) ShouldDiscover() bool { return c.DiscoveryURL != "" }

// ReqTimeout returns the timeout for a request to finish.
func (c *ServerConfig) ReqTimeout() time.Duration {
	// 5s for queue waiting, computation and disk IO delay
	// + 2 * election timeout for possible leader election
	return 5*time.Second + 2*time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond
}
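
// For example, with the common defaults TickMs=100 (a 100ms heartbeat tick)
// and ElectionTicks=10 (a 1s election timeout), ReqTimeout() is
// 5s + 2*1s = 7s.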

func (c *ServerConfig) electionTimeout() time.Duration {
	return time.Duration(c.ElectionTicks*int(c.TickMs)) * time.Millisecond
}

func (c *ServerConfig) peerDialTimeout() time.Duration {
	// 1s for queue wait and election timeout
	return time.Second + time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond
}
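
// With the same defaults (TickMs=100, ElectionTicks=10), electionTimeout()
// is 10*100ms = 1s and peerDialTimeout() is 1s + 1s = 2s.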

func checkDuplicateURL(urlsmap types.URLsMap) bool {
	um := make(map[string]bool)
	for _, urls := range urlsmap {
		for _, url := range urls {
			u := url.String()
			if um[u] {
				return true
			}
			um[u] = true
		}
	}
	return false
}
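
// A minimal sketch (hypothetical URLs): two members claiming the same peer
// URL are reported as duplicates:
//
//	m, _ := types.NewURLsMap("infra0=http://10.0.1.10:2380,infra1=http://10.0.1.10:2380")
//	checkDuplicateURL(m) // true: the URL appears under both members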

func (c *ServerConfig) bootstrapTimeout() time.Duration {
	if c.BootstrapTimeout != 0 {
		return c.BootstrapTimeout
	}
	return time.Second
}

func (c *ServerConfig) backendPath() string { return filepath.Join(c.SnapDir(), "db") }