From ca1bbee7374837c4d906a9cad63b2745baf293a7 Mon Sep 17 00:00:00 2001
From: Barak Michener
Date: Tue, 7 Oct 2014 16:58:58 -0400
Subject: [PATCH] add logging and backoff and simple retry logic

---
Review notes (commentary only; nothing in this section is part of the diff):

 * client.ErrTimeout is declared as an alias for context.DeadlineExceeded.
 * Every retry path (Discover's createSelf and waitNodes branches, and
   checkClusterRetry) shares the single discovery.retries counter. None of
   the hunks shown here reset it, so nRetries (3) bounds the total number of
   retries across all steps rather than the retries per step.
 * logAndBackoffForRetry increments the counter before computing the delay,
   so successive retries sleep 2s, 4s and then 8s.
 * Once the counter reaches nRetries, checkClusterRetry returns
   ErrTooManyRetries, whereas Discover returns the timeout error it
   received from createSelf or waitNodes.

 client/http.go         |  3 +-
 discovery/discovery.go | 62 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/client/http.go b/client/http.go
index 3f0386f76..e43a996bc 100644
--- a/client/http.go
+++ b/client/http.go
@@ -15,7 +15,8 @@ import (
 )
 
 var (
-	v2Prefix = "/v2/keys"
+	v2Prefix   = "/v2/keys"
+	ErrTimeout = context.DeadlineExceeded
 )
 
 // transport mimics http.Transport to provide an interface which can be
diff --git a/discovery/discovery.go b/discovery/discovery.go
index 8416d6376..744437376 100644
--- a/discovery/discovery.go
+++ b/discovery/discovery.go
@@ -3,6 +3,7 @@ package discovery
 import (
 	"errors"
 	"fmt"
+	"log"
 	"net/http"
 	"net/url"
 	"path"
@@ -15,12 +16,16 @@ import (
 )
 
 var (
-	ErrInvalidURL    = errors.New("discovery: invalid URL")
-	ErrBadSizeKey    = errors.New("discovery: size key is bad")
-	ErrSizeNotFound  = errors.New("discovery: size key not found")
-	ErrTokenNotFound = errors.New("discovery: token not found")
-	ErrDuplicateID   = errors.New("discovery: found duplicate id")
-	ErrFullCluster   = errors.New("discovery: cluster is full")
+	ErrInvalidURL     = errors.New("discovery: invalid URL")
+	ErrBadSizeKey     = errors.New("discovery: size key is bad")
+	ErrSizeNotFound   = errors.New("discovery: size key not found")
+	ErrTokenNotFound  = errors.New("discovery: token not found")
+	ErrDuplicateID    = errors.New("discovery: found duplicate id")
+	ErrFullCluster    = errors.New("discovery: cluster is full")
+	ErrTooManyRetries = errors.New("discovery: too many retries")
+
+	// nRetries is the maximum number of timeout retries discovery attempts before giving up.
+	nRetries = uint(3)
 )
 
 type Discoverer interface {
@@ -32,6 +37,8 @@ type discovery struct {
 	id      int64
 	config  string
 	c       client.Client
+	retries uint
+	url     *url.URL
 }
 
 func New(durl string, id int64, config string) (Discoverer, error) {
@@ -41,18 +48,19 @@ func New(durl string, id int64, config string) (Discoverer, error) {
 	}
 	token := u.Path
 	u.Path = ""
-	client, err := client.NewHTTPClient(&http.Transport{}, u.String(), time.Second*5)
+	c, err := client.NewHTTPClient(&http.Transport{}, u.String(), time.Second*5)
 	if err != nil {
 		return nil, err
 	}
 	// discovery service redirects /[key] to /v2/keys/[key]
 	// set the prefix of client to "" to handle this
-	client.SetPrefix("")
+	c.SetPrefix("")
 	return &discovery{
 		cluster: token,
 		id:      id,
 		config:  config,
-		c:       client,
+		c:       c,
+		url:     u,
 	}, nil
 }
 
@@ -65,6 +73,12 @@ func (d *discovery) Discover() (string, error) {
 	}
 
 	if err := d.createSelf(); err != nil {
+		if err == client.ErrTimeout {
+			if d.retries < nRetries {
+				d.logAndBackoffForRetry("registering self")
+				return d.Discover()
+			}
+		}
 		return "", err
 	}
 
@@ -75,6 +89,15 @@ func (d *discovery) Discover() (string, error) {
 
 	all, err := d.waitNodes(nodes, size)
 	if err != nil {
+		if err == client.ErrTimeout {
+			// The connection itself timed out (waiting for nodes can take a while, but
+			// the discovery server stopped responding). Back off, bump the retry counter,
+			// and start from scratch. Calling createSelf() again should be idempotent.
+			if d.retries < nRetries {
+				d.logAndBackoffForRetry("waiting for other nodes")
+				return d.Discover()
+			}
+		}
 		return "", err
 	}
 
@@ -101,6 +124,9 @@ func (d *discovery) checkCluster() (client.Nodes, int, error) {
 		if err == client.ErrKeyNoExist {
 			return nil, 0, ErrSizeNotFound
 		}
+		if err == client.ErrTimeout {
+			return d.checkClusterRetry()
+		}
 		return nil, 0, err
 	}
 	size, err := strconv.Atoi(resp.Node.Value)
@@ -110,6 +136,9 @@ func (d *discovery) checkCluster() (client.Nodes, int, error) {
 
 	resp, err = d.c.Get(d.cluster)
 	if err != nil {
+		if err == client.ErrTimeout {
+			return d.checkClusterRetry()
+		}
 		return nil, 0, err
 	}
 	nodes := make(client.Nodes, 0)
@@ -135,6 +164,21 @@ func (d *discovery) checkCluster() (client.Nodes, int, error) {
 	return nodes, size, nil
 }
 
+func (d *discovery) logAndBackoffForRetry(step string) {
+	d.retries++
+	retryTime := time.Second * (0x1 << d.retries)
+	log.Println("discovery: during", step, "connection to", d.url, "timed out, retrying in", retryTime)
+	time.Sleep(retryTime)
+}
+
+func (d *discovery) checkClusterRetry() (client.Nodes, int, error) {
+	if d.retries < nRetries {
+		d.logAndBackoffForRetry("cluster status check")
+		return d.checkCluster()
+	}
+	return nil, 0, ErrTooManyRetries
+}
+
 func (d *discovery) waitNodes(nodes client.Nodes, size int) (client.Nodes, error) {
 	if len(nodes) > size {
 		nodes = nodes[:size]