From 1e41d95ab21d91750e535b1b00248ceb7ccbc2fd Mon Sep 17 00:00:00 2001 From: Marek Siarkowicz Date: Wed, 5 Apr 2023 22:40:39 +0200 Subject: [PATCH] tests/robustness: Document analysing watch issue Signed-off-by: Marek Siarkowicz --- tests/robustness/README.md | 34 +++++++++++++++++-- tests/robustness/linearizability_test.go | 14 +++++++- tests/robustness/makefile.mk | 5 +++ tests/robustness/watch.go | 42 +++++++++++++----------- 4 files changed, 72 insertions(+), 23 deletions(-) diff --git a/tests/robustness/README.md b/tests/robustness/README.md index e92b0cdfb..1aa5c745b 100644 --- a/tests/robustness/README.md +++ b/tests/robustness/README.md @@ -53,9 +53,10 @@ Report includes multiple types of files: ### Example analysis of linearization issue -Let's analyse issue [#14370]. +Let's reproduce and analyse robustness test report for issue [#14370]. To reproduce the issue by yourself run `make test-robustness-issue14370`. -After a couple of tries robustness tests should report `Model is not linearizable` and save report locally. +After a couple of tries robustness tests should fail with a log `Model is not linearizable` and save report locally. + Lineralization issues are easiest to analyse via history visualization. Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser. Jump to the error in linearization by clicking `[ jump to first error ]` on the top of the page. @@ -68,4 +69,31 @@ All following requests are invalid (connected with red line) as they have revisi Etcd guarantee that revision is non-decreasing, so this shows a bug in etcd as there is no way revision should decrease. This is consistent with the root cause of [#14370] as it was issue with process crash causing last write to be lost. -[#14370]: https://github.com/etcd-io/etcd/issues/14370 \ No newline at end of file +[#14370]: https://github.com/etcd-io/etcd/issues/14370 + +### Example analysis of watch issue + +Let's reproduce and analyse robustness test report for issue [#15271]. 
+To reproduce the issue by yourself run `make test-robustness-issue15271`.
+After a couple of tries robustness tests should fail with a log `Broke watch guarantee` and save report locally.
+
+Watch issues are easiest to analyse by reading the recorded watch history.
+Watch history is recorded for each member separately in a different subdirectory under `/tmp/TestRobustness_Issue15271/`.
+Open `responses.json` for the member mentioned in the log `Broke watch guarantee`.
+For example for member `TestRobustnessIssue15271-test-1` open `/tmp/TestRobustness_Issue15271/TestRobustnessIssue15271-test-1/responses.json`.
+
+Each line consists of a json blob corresponding to a single watch response observed by the client.
+Look for lines with `mod_revision` equal to the revision mentioned in the first log with `Broke watch guarantee`.
+You should see two lines where the `mod_revision` decreases, like the ones below:
+```
+{"Header":{"cluster_id":12951239930360520062,"member_id":16914881897345358027,"revision":2574,"raft_term":2},"Events":[{"kv":{"key":"Ng==","create_revision":2303,"mod_revision":2574,"version":46,"value":"Mjg5OA=="}}],"CompactRevision":0,"Canceled":false,"Created":false}
+{"Header":{"cluster_id":12951239930360520062,"member_id":16914881897345358027,"revision":7708,"raft_term":2},"Events":[{"kv":{"key":"NQ==","create_revision":5,"mod_revision":91,"version":10,"value":"MTAy"}}, ... }
+```
+
+Up to the first line the `mod_revision` of events within responses only increased up to a value of `2574`.
+However, the following line includes an event with `mod_revision` equal `91`.
+If you follow the `mod_revision` throughout the file you should notice that the watch replayed revisions a second time.
+This is incorrect and breaks the `Ordered` and `Unique` [watch guarantees].
+This is consistent with the root cause of [#15271] where a member reconnecting to the cluster will incorrectly resend revisions. 
+ +[#15271]: https://github.com/etcd-io/etcd/issues/15271 diff --git a/tests/robustness/linearizability_test.go b/tests/robustness/linearizability_test.go index 070556c75..0a5e682a9 100644 --- a/tests/robustness/linearizability_test.go +++ b/tests/robustness/linearizability_test.go @@ -167,6 +167,18 @@ func TestRobustness(t *testing.T) { e2e.WithClusterSize(1), ), }) + if v.Compare(version.V3_5) >= 0 { + scenarios = append(scenarios, scenario{ + name: "Issue15271", + failpoint: BlackholeUntilSnapshot, + traffic: &HighTraffic, + config: *e2e.NewConfig( + e2e.WithSnapshotCount(100), + e2e.WithPeerProxy(true), + e2e.WithIsPeerTLS(true), + ), + }) + } snapshotOptions := []e2e.EPClusterOption{ e2e.WithGoFailEnabled(true), e2e.WithSnapshotCount(100), @@ -217,7 +229,7 @@ func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, config e2 forcestopCluster(r.clus) watchProgressNotifyEnabled := r.clus.Cfg.WatchProcessNotifyInterval != 0 - validateWatchResponses(t, r.responses, traffic.requestProgress || watchProgressNotifyEnabled) + validateWatchResponses(t, r.clus, r.responses, traffic.requestProgress || watchProgressNotifyEnabled) r.events = watchEvents(r.responses) validateEventsMatch(t, r.events) diff --git a/tests/robustness/makefile.mk b/tests/robustness/makefile.mk index acf28e436..04e911f94 100644 --- a/tests/robustness/makefile.mk +++ b/tests/robustness/makefile.mk @@ -15,6 +15,11 @@ test-robustness-issue14685: /tmp/etcd-v3.5.5-failpoints/bin GO_TEST_FLAGS='-v --run=TestRobustness/Issue14685 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.5-failpoints/bin' make test-robustness && \ echo "Failed to reproduce" || echo "Successful reproduction" +.PHONY: test-robustness-issue15271 +test-robustness-issue15271: /tmp/etcd-v3.5.7-failpoints/bin + GO_TEST_FLAGS='-v --run=TestRobustness/Issue15271 --count 100 --failfast --bin-dir=/tmp/etcd-v3.5.7-failpoints/bin' make test-robustness && \ + echo "Failed to reproduce" || echo "Successful reproduction" + # 
Failpoints GOFAIL_VERSION = $(shell cd tools/mod && go list -m -f {{.Version}} go.etcd.io/gofail) diff --git a/tests/robustness/watch.go b/tests/robustness/watch.go index 79ba408a6..c44d639f8 100644 --- a/tests/robustness/watch.go +++ b/tests/robustness/watch.go @@ -132,25 +132,25 @@ func watchResponsesMaxRevision(responses []watchResponse) int64 { return maxRevision } -func validateWatchResponses(t *testing.T, responses [][]watchResponse, expectProgressNotify bool) { - for _, memberResponses := range responses { - validateMemberWatchResponses(t, memberResponses, expectProgressNotify) +func validateWatchResponses(t *testing.T, clus *e2e.EtcdProcessCluster, responses [][]watchResponse, expectProgressNotify bool) { + for i, member := range clus.Procs { + validateMemberWatchResponses(t, member.Config().Name, responses[i], expectProgressNotify) } } -func validateMemberWatchResponses(t *testing.T, responses []watchResponse, expectProgressNotify bool) { +func validateMemberWatchResponses(t *testing.T, memberId string, responses []watchResponse, expectProgressNotify bool) { // Validate watch is correctly configured to ensure proper testing - validateGotAtLeastOneProgressNotify(t, responses, expectProgressNotify) + validateGotAtLeastOneProgressNotify(t, memberId, responses, expectProgressNotify) // Validate etcd watch properties defined in https://etcd.io/docs/v3.6/learning/api/#watch-streams - validateOrderedAndReliable(t, responses) - validateUnique(t, responses) - validateAtomic(t, responses) + validateOrderedAndReliable(t, memberId, responses) + validateUnique(t, memberId, responses) + validateAtomic(t, memberId, responses) // Validate kubernetes usage of watch - validateRenewable(t, responses) + validateRenewable(t, memberId, responses) } -func validateGotAtLeastOneProgressNotify(t *testing.T, responses []watchResponse, expectProgressNotify bool) { +func validateGotAtLeastOneProgressNotify(t *testing.T, memberId string, responses []watchResponse, expectProgressNotify 
bool) { var gotProgressNotify = false var lastHeadRevision int64 = 1 for _, resp := range responses { @@ -161,16 +161,16 @@ func validateGotAtLeastOneProgressNotify(t *testing.T, responses []watchResponse lastHeadRevision = resp.Header.Revision } if gotProgressNotify != expectProgressNotify { - t.Errorf("Expected at least one progress notify: %v, got: %v", expectProgressNotify, gotProgressNotify) + t.Errorf("Progress notify does not match, expect: %v, got: %v, member: %q", expectProgressNotify, gotProgressNotify, memberId) } } -func validateRenewable(t *testing.T, responses []watchResponse) { +func validateRenewable(t *testing.T, memberId string, responses []watchResponse) { var lastProgressNotifyRevision int64 = 0 for _, resp := range responses { for _, event := range resp.Events { if event.Kv.ModRevision <= lastProgressNotifyRevision { - t.Errorf("BROKE: Renewable - watch can renewed using revision in last progress notification; Progress notification guarantees that previous events have been already delivered, eventRevision: %d, progressNotifyRevision: %d", event.Kv.ModRevision, lastProgressNotifyRevision) + t.Errorf("Broke watch guarantee: Renewable - watch can renewed using revision in last progress notification; Progress notification guarantees that previous events have been already delivered, eventRevision: %d, progressNotifyRevision: %d, member: %q", event.Kv.ModRevision, lastProgressNotifyRevision, memberId) } } if resp.IsProgressNotify() { @@ -179,19 +179,23 @@ func validateRenewable(t *testing.T, responses []watchResponse) { } } -func validateOrderedAndReliable(t *testing.T, responses []watchResponse) { +func validateOrderedAndReliable(t *testing.T, memberId string, responses []watchResponse) { var lastEventRevision int64 = 1 for _, resp := range responses { for _, event := range resp.Events { if event.Kv.ModRevision != lastEventRevision && event.Kv.ModRevision != lastEventRevision+1 { - t.Errorf("BROKE: Reliable - a sequence of events will never drop any 
subsequence of events; if there are events ordered in time as a < b < c, then if the watch receives events a and c, it is guaranteed to receive b, lastRevision: %d, currentRevision: %d", lastEventRevision, event.Kv.ModRevision) + if event.Kv.ModRevision < lastEventRevision { + t.Errorf("Broke watch guarantee: Ordered - events are ordered by revision; an event will never appear on a watch if it precedes an event in time that has already been posted, lastRevision: %d, currentRevision: %d, member: %q", lastEventRevision, event.Kv.ModRevision, memberId) + } else { + t.Errorf("Broke watch guarantee: Reliable - a sequence of events will never drop any subsequence of events; if there are events ordered in time as a < b < c, then if the watch receives events a and c, it is guaranteed to receive b, lastRevision: %d, currentRevision: %d, member: %q", lastEventRevision, event.Kv.ModRevision, memberId) + } } lastEventRevision = event.Kv.ModRevision } } } -func validateUnique(t *testing.T, responses []watchResponse) { +func validateUnique(t *testing.T, memberId string, responses []watchResponse) { type revisionKey struct { revision int64 key string @@ -201,19 +205,19 @@ func validateUnique(t *testing.T, responses []watchResponse) { for _, event := range resp.Events { rk := revisionKey{key: string(event.Kv.Key), revision: event.Kv.ModRevision} if _, found := uniqueOperations[rk]; found { - t.Errorf("BROKE: Unique - an event will never appear on a watch twice, key: %q, revision: %d", rk.key, rk.revision) + t.Errorf("Broke watch guarantee: Unique - an event will never appear on a watch twice, key: %q, revision: %d, member: %q", rk.key, rk.revision, memberId) } uniqueOperations[rk] = struct{}{} } } } -func validateAtomic(t *testing.T, responses []watchResponse) { +func validateAtomic(t *testing.T, memberId string, responses []watchResponse) { var lastEventRevision int64 = 1 for _, resp := range responses { if len(resp.Events) > 0 { if resp.Events[0].Kv.ModRevision == 
lastEventRevision { - t.Errorf("BROKE: Atomic - a list of events is guaranteed to encompass complete revisions; updates in the same revision over multiple keys will not be split over several lists of events, previousListEventRevision: %d, currentListEventRevision: %d", lastEventRevision, resp.Events[0].Kv.ModRevision) + t.Errorf("Broke watch guarantee: Atomic - a list of events is guaranteed to encompass complete revisions; updates in the same revision over multiple keys will not be split over several lists of events, previousListEventRevision: %d, currentListEventRevision: %d, member: %q", lastEventRevision, resp.Events[0].Kv.ModRevision, memberId) } lastEventRevision = resp.Events[len(resp.Events)-1].Kv.ModRevision }