mirror of
				https://github.com/etcd-io/etcd.git
				synced 2024-09-27 06:25:44 +00:00 
			
		
		
		
	 8f4735dfd4
			
		
	
	
		8f4735dfd4
		
	
	
	
	
		
			
			To avoid inconsistent behavior during cluster upgrade we are feature gating persistence behind cluster version. This should ensure that all cluster members are upgraded to v3.6 before changing behavior. To allow backporting this fix to v3.5 we are also introducing flag --experimental-enable-lease-checkpoint-persist that will allow for smooth upgrade in v3.5 clusters with this feature enabled. Signed-off-by: Marek Siarkowicz <siarkowicz@google.com>
		
			
				
	
	
		
			323 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			323 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // Copyright 2015 The etcd Authors
 | |
| //
 | |
| // Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // you may not use this file except in compliance with the License.
 | |
| // You may obtain a copy of the License at
 | |
| //
 | |
| //     http://www.apache.org/licenses/LICENSE-2.0
 | |
| //
 | |
| // Unless required by applicable law or agreed to in writing, software
 | |
| // distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // See the License for the specific language governing permissions and
 | |
| // limitations under the License.
 | |
| 
 | |
| package etcdserver
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"fmt"
 | |
| 	"path/filepath"
 | |
| 	"sort"
 | |
| 	"strings"
 | |
| 	"time"
 | |
| 
 | |
| 	"go.etcd.io/etcd/pkg/netutil"
 | |
| 	"go.etcd.io/etcd/pkg/transport"
 | |
| 	"go.etcd.io/etcd/pkg/types"
 | |
| 
 | |
| 	bolt "go.etcd.io/bbolt"
 | |
| 	"go.uber.org/zap"
 | |
| 	"go.uber.org/zap/zapcore"
 | |
| )
 | |
| 
 | |
| // ServerConfig holds the configuration of etcd as taken from the command line or discovery.
 | |
| type ServerConfig struct {
 | |
| 	Name           string
 | |
| 	DiscoveryURL   string
 | |
| 	DiscoveryProxy string
 | |
| 	ClientURLs     types.URLs
 | |
| 	PeerURLs       types.URLs
 | |
| 	DataDir        string
 | |
| 	// DedicatedWALDir config will make the etcd to write the WAL to the WALDir
 | |
| 	// rather than the dataDir/member/wal.
 | |
| 	DedicatedWALDir string
 | |
| 
 | |
| 	SnapshotCount uint64
 | |
| 
 | |
| 	// SnapshotCatchUpEntries is the number of entries for a slow follower
 | |
| 	// to catch-up after compacting the raft storage entries.
 | |
| 	// We expect the follower has a millisecond level latency with the leader.
 | |
| 	// The max throughput is around 10K. Keep a 5K entries is enough for helping
 | |
| 	// follower to catch up.
 | |
| 	// WARNING: only change this for tests. Always use "DefaultSnapshotCatchUpEntries"
 | |
| 	SnapshotCatchUpEntries uint64
 | |
| 
 | |
| 	MaxSnapFiles uint
 | |
| 	MaxWALFiles  uint
 | |
| 
 | |
| 	// BackendBatchInterval is the maximum time before commit the backend transaction.
 | |
| 	BackendBatchInterval time.Duration
 | |
| 	// BackendBatchLimit is the maximum operations before commit the backend transaction.
 | |
| 	BackendBatchLimit int
 | |
| 
 | |
| 	// BackendFreelistType is the type of the backend boltdb freelist.
 | |
| 	BackendFreelistType bolt.FreelistType
 | |
| 
 | |
| 	InitialPeerURLsMap  types.URLsMap
 | |
| 	InitialClusterToken string
 | |
| 	NewCluster          bool
 | |
| 	PeerTLSInfo         transport.TLSInfo
 | |
| 
 | |
| 	CORS map[string]struct{}
 | |
| 
 | |
| 	// HostWhitelist lists acceptable hostnames from client requests.
 | |
| 	// If server is insecure (no TLS), server only accepts requests
 | |
| 	// whose Host header value exists in this white list.
 | |
| 	HostWhitelist map[string]struct{}
 | |
| 
 | |
| 	TickMs        uint
 | |
| 	ElectionTicks int
 | |
| 
 | |
| 	// InitialElectionTickAdvance is true, then local member fast-forwards
 | |
| 	// election ticks to speed up "initial" leader election trigger. This
 | |
| 	// benefits the case of larger election ticks. For instance, cross
 | |
| 	// datacenter deployment may require longer election timeout of 10-second.
 | |
| 	// If true, local node does not need wait up to 10-second. Instead,
 | |
| 	// forwards its election ticks to 8-second, and have only 2-second left
 | |
| 	// before leader election.
 | |
| 	//
 | |
| 	// Major assumptions are that:
 | |
| 	//  - cluster has no active leader thus advancing ticks enables faster
 | |
| 	//    leader election, or
 | |
| 	//  - cluster already has an established leader, and rejoining follower
 | |
| 	//    is likely to receive heartbeats from the leader after tick advance
 | |
| 	//    and before election timeout.
 | |
| 	//
 | |
| 	// However, when network from leader to rejoining follower is congested,
 | |
| 	// and the follower does not receive leader heartbeat within left election
 | |
| 	// ticks, disruptive election has to happen thus affecting cluster
 | |
| 	// availabilities.
 | |
| 	//
 | |
| 	// Disabling this would slow down initial bootstrap process for cross
 | |
| 	// datacenter deployments. Make your own tradeoffs by configuring
 | |
| 	// --initial-election-tick-advance at the cost of slow initial bootstrap.
 | |
| 	//
 | |
| 	// If single-node, it advances ticks regardless.
 | |
| 	//
 | |
| 	// See https://github.com/etcd-io/etcd/issues/9333 for more detail.
 | |
| 	InitialElectionTickAdvance bool
 | |
| 
 | |
| 	BootstrapTimeout time.Duration
 | |
| 
 | |
| 	AutoCompactionRetention time.Duration
 | |
| 	AutoCompactionMode      string
 | |
| 	CompactionBatchLimit    int
 | |
| 	QuotaBackendBytes       int64
 | |
| 	MaxTxnOps               uint
 | |
| 
 | |
| 	// MaxRequestBytes is the maximum request size to send over raft.
 | |
| 	MaxRequestBytes uint
 | |
| 
 | |
| 	// MaxConcurrentStreams specifies the maximum number of concurrent
 | |
| 	// streams that each client can open at a time.
 | |
| 	MaxConcurrentStreams uint32
 | |
| 
 | |
| 	WarningApplyDuration time.Duration
 | |
| 
 | |
| 	StrictReconfigCheck bool
 | |
| 
 | |
| 	// ClientCertAuthEnabled is true when cert has been signed by the client CA.
 | |
| 	ClientCertAuthEnabled bool
 | |
| 
 | |
| 	AuthToken  string
 | |
| 	BcryptCost uint
 | |
| 	TokenTTL   uint
 | |
| 
 | |
| 	// InitialCorruptCheck is true to check data corruption on boot
 | |
| 	// before serving any peer/client traffic.
 | |
| 	InitialCorruptCheck bool
 | |
| 	CorruptCheckTime    time.Duration
 | |
| 
 | |
| 	// PreVote is true to enable Raft Pre-Vote.
 | |
| 	PreVote bool
 | |
| 
 | |
| 	// Logger logs server-side operations.
 | |
| 	// If not nil, it disables "capnslog" and uses the given logger.
 | |
| 	Logger *zap.Logger
 | |
| 
 | |
| 	// LoggerConfig is server logger configuration for Raft logger.
 | |
| 	// Must be either: "LoggerConfig != nil" or "LoggerCore != nil && LoggerWriteSyncer != nil".
 | |
| 	LoggerConfig *zap.Config
 | |
| 	// LoggerCore is "zapcore.Core" for raft logger.
 | |
| 	// Must be either: "LoggerConfig != nil" or "LoggerCore != nil && LoggerWriteSyncer != nil".
 | |
| 	LoggerCore        zapcore.Core
 | |
| 	LoggerWriteSyncer zapcore.WriteSyncer
 | |
| 
 | |
| 	Debug bool
 | |
| 
 | |
| 	ForceNewCluster bool
 | |
| 
 | |
| 	// EnableLeaseCheckpoint enables leader to send regular checkpoints to other members to prevent reset of remaining TTL on leader change.
 | |
| 	EnableLeaseCheckpoint bool
 | |
| 	// LeaseCheckpointInterval time.Duration is the wait duration between lease checkpoints.
 | |
| 	LeaseCheckpointInterval time.Duration
 | |
| 	// LeaseCheckpointPersist enables persisting remainingTTL to prevent indefinite auto-renewal of long lived leases. Always enabled in v3.6. Should be used to ensure smooth upgrade from v3.5 clusters with this feature enabled.
 | |
| 	LeaseCheckpointPersist bool
 | |
| 
 | |
| 	EnableGRPCGateway bool
 | |
| 
 | |
| 	WatchProgressNotifyInterval time.Duration
 | |
| 
 | |
| 	// UnsafeNoFsync disables all uses of fsync.
 | |
| 	// Setting this is unsafe and will cause data loss.
 | |
| 	UnsafeNoFsync bool `json:"unsafe-no-fsync"`
 | |
| }
 | |
| 
 | |
| // VerifyBootstrap sanity-checks the initial config for bootstrap case
 | |
| // and returns an error for things that should never happen.
 | |
| func (c *ServerConfig) VerifyBootstrap() error {
 | |
| 	if err := c.hasLocalMember(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if err := c.advertiseMatchesCluster(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if checkDuplicateURL(c.InitialPeerURLsMap) {
 | |
| 		return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap)
 | |
| 	}
 | |
| 	if c.InitialPeerURLsMap.String() == "" && c.DiscoveryURL == "" {
 | |
| 		return fmt.Errorf("initial cluster unset and no discovery URL found")
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // VerifyJoinExisting sanity-checks the initial config for join existing cluster
 | |
| // case and returns an error for things that should never happen.
 | |
| func (c *ServerConfig) VerifyJoinExisting() error {
 | |
| 	// The member has announced its peer urls to the cluster before starting; no need to
 | |
| 	// set the configuration again.
 | |
| 	if err := c.hasLocalMember(); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if checkDuplicateURL(c.InitialPeerURLsMap) {
 | |
| 		return fmt.Errorf("initial cluster %s has duplicate url", c.InitialPeerURLsMap)
 | |
| 	}
 | |
| 	if c.DiscoveryURL != "" {
 | |
| 		return fmt.Errorf("discovery URL should not be set when joining existing initial cluster")
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // hasLocalMember checks that the cluster at least contains the local server.
 | |
| func (c *ServerConfig) hasLocalMember() error {
 | |
| 	if urls := c.InitialPeerURLsMap[c.Name]; urls == nil {
 | |
| 		return fmt.Errorf("couldn't find local name %q in the initial cluster configuration", c.Name)
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // advertiseMatchesCluster confirms peer URLs match those in the cluster peer list.
 | |
| func (c *ServerConfig) advertiseMatchesCluster() error {
 | |
| 	urls, apurls := c.InitialPeerURLsMap[c.Name], c.PeerURLs.StringSlice()
 | |
| 	urls.Sort()
 | |
| 	sort.Strings(apurls)
 | |
| 	ctx, cancel := context.WithTimeout(context.TODO(), 30*time.Second)
 | |
| 	defer cancel()
 | |
| 	ok, err := netutil.URLStringsEqual(ctx, c.Logger, apurls, urls.StringSlice())
 | |
| 	if ok {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	initMap, apMap := make(map[string]struct{}), make(map[string]struct{})
 | |
| 	for _, url := range c.PeerURLs {
 | |
| 		apMap[url.String()] = struct{}{}
 | |
| 	}
 | |
| 	for _, url := range c.InitialPeerURLsMap[c.Name] {
 | |
| 		initMap[url.String()] = struct{}{}
 | |
| 	}
 | |
| 
 | |
| 	missing := []string{}
 | |
| 	for url := range initMap {
 | |
| 		if _, ok := apMap[url]; !ok {
 | |
| 			missing = append(missing, url)
 | |
| 		}
 | |
| 	}
 | |
| 	if len(missing) > 0 {
 | |
| 		for i := range missing {
 | |
| 			missing[i] = c.Name + "=" + missing[i]
 | |
| 		}
 | |
| 		mstr := strings.Join(missing, ",")
 | |
| 		apStr := strings.Join(apurls, ",")
 | |
| 		return fmt.Errorf("--initial-cluster has %s but missing from --initial-advertise-peer-urls=%s (%v)", mstr, apStr, err)
 | |
| 	}
 | |
| 
 | |
| 	for url := range apMap {
 | |
| 		if _, ok := initMap[url]; !ok {
 | |
| 			missing = append(missing, url)
 | |
| 		}
 | |
| 	}
 | |
| 	if len(missing) > 0 {
 | |
| 		mstr := strings.Join(missing, ",")
 | |
| 		umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs})
 | |
| 		return fmt.Errorf("--initial-advertise-peer-urls has %s but missing from --initial-cluster=%s", mstr, umap.String())
 | |
| 	}
 | |
| 
 | |
| 	// resolved URLs from "--initial-advertise-peer-urls" and "--initial-cluster" did not match or failed
 | |
| 	apStr := strings.Join(apurls, ",")
 | |
| 	umap := types.URLsMap(map[string]types.URLs{c.Name: c.PeerURLs})
 | |
| 	return fmt.Errorf("failed to resolve %s to match --initial-cluster=%s (%v)", apStr, umap.String(), err)
 | |
| }
 | |
| 
 | |
| func (c *ServerConfig) MemberDir() string { return filepath.Join(c.DataDir, "member") }
 | |
| 
 | |
| func (c *ServerConfig) WALDir() string {
 | |
| 	if c.DedicatedWALDir != "" {
 | |
| 		return c.DedicatedWALDir
 | |
| 	}
 | |
| 	return filepath.Join(c.MemberDir(), "wal")
 | |
| }
 | |
| 
 | |
| func (c *ServerConfig) SnapDir() string { return filepath.Join(c.MemberDir(), "snap") }
 | |
| 
 | |
| func (c *ServerConfig) ShouldDiscover() bool { return c.DiscoveryURL != "" }
 | |
| 
 | |
| // ReqTimeout returns timeout for request to finish.
 | |
| func (c *ServerConfig) ReqTimeout() time.Duration {
 | |
| 	// 5s for queue waiting, computation and disk IO delay
 | |
| 	// + 2 * election timeout for possible leader election
 | |
| 	return 5*time.Second + 2*time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond
 | |
| }
 | |
| 
 | |
| func (c *ServerConfig) electionTimeout() time.Duration {
 | |
| 	return time.Duration(c.ElectionTicks*int(c.TickMs)) * time.Millisecond
 | |
| }
 | |
| 
 | |
| func (c *ServerConfig) peerDialTimeout() time.Duration {
 | |
| 	// 1s for queue wait and election timeout
 | |
| 	return time.Second + time.Duration(c.ElectionTicks*int(c.TickMs))*time.Millisecond
 | |
| }
 | |
| 
 | |
| func checkDuplicateURL(urlsmap types.URLsMap) bool {
 | |
| 	um := make(map[string]bool)
 | |
| 	for _, urls := range urlsmap {
 | |
| 		for _, url := range urls {
 | |
| 			u := url.String()
 | |
| 			if um[u] {
 | |
| 				return true
 | |
| 			}
 | |
| 			um[u] = true
 | |
| 		}
 | |
| 	}
 | |
| 	return false
 | |
| }
 | |
| 
 | |
| func (c *ServerConfig) bootstrapTimeout() time.Duration {
 | |
| 	if c.BootstrapTimeout != 0 {
 | |
| 		return c.BootstrapTimeout
 | |
| 	}
 | |
| 	return time.Second
 | |
| }
 | |
| 
 | |
| func (c *ServerConfig) backendPath() string { return filepath.Join(c.SnapDir(), "db") }
 |