From a7e443d6412ec405948571d1f758ab7f1c078bf9 Mon Sep 17 00:00:00 2001
From: Gyu-Ho Lee
Date: Mon, 28 Dec 2015 21:22:20 -0800
Subject: [PATCH] etcdserver: always remove member directory when bootstrap fails

This removes the member directory when bootstrap fails, whether the
server was joining an existing cluster or forming a new one.

This fixes https://github.com/coreos/etcd/issues/3827.
---
 etcdserver/server.go | 65 ++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/etcdserver/server.go b/etcdserver/server.go
index d9731dbcc..72190a970 100644
--- a/etcdserver/server.go
+++ b/etcdserver/server.go
@@ -194,7 +194,7 @@ type EtcdServer struct {
 
 // NewServer creates a new EtcdServer from the supplied configuration. The
 // configuration is considered static for the lifetime of the EtcdServer.
-func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
+func NewServer(cfg *ServerConfig) (srv *EtcdServer, err error) {
 	st := store.New(StoreClusterPrefix, StoreKeysPrefix)
 	var (
 		w *wal.WAL
@@ -213,7 +213,7 @@ func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
 	if err != nil {
 		return nil, err
 	}
-	if err := upgradeDataDir(cfg.DataDir, cfg.Name, dataVer); err != nil {
+	if err = upgradeDataDir(cfg.DataDir, cfg.Name, dataVer); err != nil {
 		return nil, err
 	}
 
@@ -223,27 +223,36 @@ func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
 	}
 
 	haveWAL := wal.Exist(cfg.WALDir())
+	if !haveWAL {
+		defer func() {
+			if err != nil {
+				// cleans up member directory if bootstrap fails (including forming or joining a new cluster)
+				os.RemoveAll(cfg.MemberDir())
+			}
+		}()
+	}
 	ss := snap.New(cfg.SnapDir())
 
-	prt, err := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout())
-	if err != nil {
-		return nil, err
+	prt, rterr := rafthttp.NewRoundTripper(cfg.PeerTLSInfo, cfg.peerDialTimeout())
+	if rterr != nil {
+		return nil, rterr
 	}
+
 	var remotes []*Member
 	switch {
 	case !haveWAL && !cfg.NewCluster:
-		if err := cfg.VerifyJoinExisting(); err != nil {
+		if err = cfg.VerifyJoinExisting(); err != nil {
 			return nil, err
 		}
 		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
 		if err != nil {
 			return nil, err
 		}
-		existingCluster, err := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt)
-		if err != nil {
-			return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", err)
+		existingCluster, gcerr := GetClusterFromRemotePeers(getRemotePeerURLs(cl, cfg.Name), prt)
+		if gcerr != nil {
+			return nil, fmt.Errorf("cannot fetch cluster info from peer urls: %v", gcerr)
 		}
-		if err := ValidateClusterAndAssignIDs(cl, existingCluster); err != nil {
+		if err = ValidateClusterAndAssignIDs(cl, existingCluster); err != nil {
 			return nil, fmt.Errorf("error validating peerURLs %s: %v", existingCluster, err)
 		}
 		if !isCompatibleWithCluster(cl, cl.MemberByName(cfg.Name).ID, prt) {
@@ -256,7 +265,7 @@ func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
 		cfg.Print()
 		id, n, s, w = startNode(cfg, cl, nil)
 	case !haveWAL && cfg.NewCluster:
-		if err := cfg.VerifyBootstrap(); err != nil {
+		if err = cfg.VerifyBootstrap(); err != nil {
 			return nil, err
 		}
 		cl, err = newClusterFromURLsMap(cfg.InitialClusterToken, cfg.InitialPeerURLsMap)
@@ -268,15 +277,13 @@ func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
 			return nil, fmt.Errorf("member %s has already been bootstrapped", m.ID)
 		}
 		if cfg.ShouldDiscover() {
-			var str string
-			var err error
-			str, err = discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String())
-			if err != nil {
-				return nil, &DiscoveryError{Op: "join", Err: err}
+			str, jerr := discovery.JoinCluster(cfg.DiscoveryURL, cfg.DiscoveryProxy, m.ID, cfg.InitialPeerURLsMap.String())
+			if jerr != nil {
+				return nil, &DiscoveryError{Op: "join", Err: jerr}
 			}
-			urlsmap, err := types.NewURLsMap(str)
-			if err != nil {
-				return nil, err
+			urlsmap, uerr := types.NewURLsMap(str)
+			if uerr != nil {
+				return nil, uerr
 			}
 			if checkDuplicateURL(urlsmap) {
 				return nil, fmt.Errorf("discovery cluster %s has duplicate url", urlsmap)
@@ -289,29 +296,27 @@ func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
 		cfg.PrintWithInitial()
 		id, n, s, w = startNode(cfg, cl, cl.MemberIDs())
 	case haveWAL:
-		if err := fileutil.IsDirWriteable(cfg.DataDir); err != nil {
+		if err = fileutil.IsDirWriteable(cfg.DataDir); err != nil {
 			return nil, fmt.Errorf("cannot write to data directory: %v", err)
 		}
-		if err := fileutil.IsDirWriteable(cfg.MemberDir()); err != nil {
+		if err = fileutil.IsDirWriteable(cfg.MemberDir()); err != nil {
 			return nil, fmt.Errorf("cannot write to member directory: %v", err)
 		}
-		if err := fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
+		if err = fileutil.IsDirWriteable(cfg.WALDir()); err != nil {
 			return nil, fmt.Errorf("cannot write to WAL directory: %v", err)
 		}
 		if cfg.ShouldDiscover() {
 			plog.Warningf("discovery token ignored since a cluster has already been initialized. Valid log found at %q", cfg.WALDir())
 		}
-		var snapshot *raftpb.Snapshot
-		var err error
-		snapshot, err = ss.Load()
-		if err != nil && err != snap.ErrNoSnapshot {
-			return nil, err
+		snapshot, lerr := ss.Load()
+		if lerr != nil && lerr != snap.ErrNoSnapshot {
+			return nil, lerr
 		}
 		if snapshot != nil {
-			if err := st.Recovery(snapshot.Data); err != nil {
+			if err = st.Recovery(snapshot.Data); err != nil {
 				plog.Panicf("recovered store from snapshot error: %v", err)
 			}
 			plog.Infof("recovered store from snapshot at index %d", snapshot.Metadata.Index)
@@ -335,7 +340,7 @@ func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
 	sstats.Initialize()
 	lstats := stats.NewLeaderStats(id.String())
 
-	srv := &EtcdServer{
+	srv = &EtcdServer{
 		cfg:       cfg,
 		snapCount: cfg.SnapCount,
 		errorc:    make(chan error, 1),
@@ -378,7 +383,7 @@ func NewServer(cfg *ServerConfig) (*EtcdServer, error) {
 		ErrorC: srv.errorc,
 		V3demo: cfg.V3demo,
 	}
-	if err := tr.Start(); err != nil {
+	if err = tr.Start(); err != nil {
 		return nil, err
 	}
 	// add all remotes into transport
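
For reviewers, a minimal, self-contained sketch of the cleanup pattern the patch relies on: a named error result plus a deferred closure that inspects it. `newServer`, `bootstrapCluster`, and `memberDir` below are hypothetical stand-ins for illustration, not etcd APIs; only the `defer` mechanics and `os.RemoveAll` mirror the patch itself.

```go
package main

import (
	"errors"
	"fmt"
	"os"
)

// bootstrapCluster stands in for the join/discovery work NewServer does.
// It is a hypothetical helper for illustration, not an etcd API.
func bootstrapCluster() error {
	return errors.New("cannot fetch cluster info from peer urls")
}

// newServer mirrors the shape of the patched NewServer: a named return
// value `err` plus a deferred closure that removes the member directory
// only when this was a fresh start (no WAL yet) and bootstrap failed.
func newServer(memberDir string, haveWAL bool) (err error) {
	if !haveWAL {
		defer func() {
			// `err` is the named result, already set by whichever return
			// statement fired, so this check sees the final error.
			if err != nil {
				os.RemoveAll(memberDir)
			}
		}()
	}
	if err = os.MkdirAll(memberDir, 0700); err != nil {
		return err
	}
	// Any failure from here on leaves a half-initialized member
	// directory behind unless the deferred cleanup removes it.
	return bootstrapCluster()
}

func main() {
	err := newServer("member", false)
	fmt.Println("bootstrap error:", err)
	if _, statErr := os.Stat("member"); os.IsNotExist(statErr) {
		fmt.Println("member directory was cleaned up")
	}
}
```

This also explains the renames in the diff (`rterr`, `gcerr`, `jerr`, `uerr`, `lerr`) and the switch from `if err := ...` to `if err = ...`: they keep inner results from shadowing the named `err`, making the error flow explicit. Go assigns the operands of a `return` statement to the named results before deferred functions run, so every `return nil, someErr` path in the patched function is visible to the cleanup closure.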