From 324d2383b88968b3703c8d281e7a4bca8d805fa7 Mon Sep 17 00:00:00 2001 From: fanmin shi Date: Fri, 24 Feb 2017 15:29:28 -0800 Subject: [PATCH] integration: ensure leader is up in waitLeader() and clusterMustProgress() The failure is caused by leader loss after waitLeader() has already returned. This can happen when the test machine is flaky, which triggers a leader loss, or when the killed node was the leader: in TestRestartMember(), waitLeader() only scans the followers, and they can all still report the same older leader. In those cases, clusterMustProgress() proceeds with no leader, which triggers the "no leader" error. To get around that, use a linearizable get in waitLeader() to ensure the leader is up, and retry kapi.Create() in clusterMustProgress() to ensure it proceeds with a leader. FIX #7258 --- integration/cluster.go | 18 +++++++++++++++++- integration/cluster_test.go | 17 ++++++++++++++--- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/integration/cluster.go b/integration/cluster.go index a132ece0c..a64f024ee 100644 --- a/integration/cluster.go +++ b/integration/cluster.go @@ -175,8 +175,12 @@ func (c *cluster) URL(i int) string { // URLs returns a list of all active client URLs in the cluster func (c *cluster) URLs() []string { + return getMembersURLs(c.Members) +} + +func getMembersURLs(members []*member) []string { urls := make([]string, 0) - for _, m := range c.Members { + for _, m := range members { select { case <-m.s.StopNotify(): continue @@ -343,6 +347,18 @@ func (c *cluster) waitLeader(t *testing.T, membs []*member) int { for _, m := range membs { possibleLead[uint64(m.s.ID())] = true } + cc := MustNewHTTPClient(t, getMembersURLs(membs), nil) + kapi := client.NewKeysAPI(cc) + + // ensure leader is up via linearizable get + for { + ctx, cancel := context.WithTimeout(context.Background(), 10*tickDuration) + _, err := kapi.Get(ctx, "0", &client.GetOptions{Quorum: true}) + cancel() + if err == nil || strings.Contains(err.Error(), "Key not found") { + break + } + } for 
lead == 0 || !possibleLead[lead] { lead = 0 diff --git a/integration/cluster_test.go b/integration/cluster_test.go index 627f72ffd..1bf5fc56c 100644 --- a/integration/cluster_test.go +++ b/integration/cluster_test.go @@ -447,13 +447,24 @@ func TestRejectUnhealthyRemove(t *testing.T) { func clusterMustProgress(t *testing.T, membs []*member) { cc := MustNewHTTPClient(t, []string{membs[0].URL()}, nil) kapi := client.NewKeysAPI(cc) - ctx, cancel := context.WithTimeout(context.Background(), requestTimeout) key := fmt.Sprintf("foo%d", rand.Int()) - resp, err := kapi.Create(ctx, "/"+key, "bar") + var ( + err error + resp *client.Response + ) + // retry in case of leader loss induced by slow CI + for i := 0; i < 3; i++ { + ctx, cancel := context.WithTimeout(context.Background(), requestTimeout) + resp, err = kapi.Create(ctx, "/"+key, "bar") + cancel() + if err == nil { + break + } + t.Logf("failed to create key on %q (%v)", membs[0].URL(), err) + } if err != nil { t.Fatalf("create on %s error: %v", membs[0].URL(), err) } - cancel() for i, m := range membs { u := m.URL()