Merge pull request #17967 from serathius/robustness-update-readme

Update the robustness README and fix the #14370 reproduction case

Commit: b8ffc5e8c0
@@ -71,31 +71,62 @@ Errors in etcd model could be causing false positives, which makes the ability t
 
 ## Analysing failure
 
-If robustness tests fails we want to analyse the report to confirm if the issue is on etcd side. Location of this report
-is included in test logs. One of log lines should look like:
+If robustness tests fail, we want to analyse the report to confirm whether the issue is on the etcd side. The location of the directory with the report
+is mentioned in the `Saving robustness test report` log line. Logs from report generation should look like:
 ```
-history.go:34: Model is not linearizable
-logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving member data dir {"member": "TestRobustnessIssue14370-test-0", "path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0"}
-logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving watch responses {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/responses.json"}
-logger.go:130: 2023-03-18T12:18:03.247+0100 INFO Saving watch events {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/events.json"}
-logger.go:130: 2023-05-15T17:42:37.792+0200 INFO Saving operation history {"path": "/tmp/TestRobustness_ClusterOfSize3_Kubernetes/client-1/operations.json"}
-logger.go:130: 2023-05-15T17:42:37.793+0200 INFO Saving watch responses {"path": "/tmp/TestRobustness_ClusterOfSize3_Kubernetes/client-2/watch.json"}
-logger.go:130: 2023-03-18T12:18:03.256+0100 INFO Saving visualization {"path": "/tmp/TestRobustness_Issue14370/history.html"}
+logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"}
+logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving member data dir {"member": "TestRobustnessRegressionIssue14370-test-0", "path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/server-TestRobustnessRegressionIssue14370-test-0"}
+logger.go:146: 2024-05-08T10:42:54.430+0200 INFO no watch operations for client, skip persisting {"client-id": 1}
+logger.go:146: 2024-05-08T10:42:54.430+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-1/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.430+0200 INFO Saving watch operations {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-2/watch.json"}
+logger.go:146: 2024-05-08T10:42:54.431+0200 INFO no KV operations for client, skip persisting {"client-id": 2}
+logger.go:146: 2024-05-08T10:42:54.431+0200 INFO no watch operations for client, skip persisting {"client-id": 3}
+logger.go:146: 2024-05-08T10:42:54.431+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-3/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.433+0200 INFO no watch operations for client, skip persisting {"client-id": 4}
+logger.go:146: 2024-05-08T10:42:54.433+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-4/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.434+0200 INFO no watch operations for client, skip persisting {"client-id": 5}
+logger.go:146: 2024-05-08T10:42:54.434+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-5/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.435+0200 INFO no watch operations for client, skip persisting {"client-id": 6}
+logger.go:146: 2024-05-08T10:42:54.435+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-6/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.437+0200 INFO no watch operations for client, skip persisting {"client-id": 7}
+logger.go:146: 2024-05-08T10:42:54.437+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-7/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.438+0200 INFO no watch operations for client, skip persisting {"client-id": 8}
+logger.go:146: 2024-05-08T10:42:54.438+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-8/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.439+0200 INFO no watch operations for client, skip persisting {"client-id": 9}
+logger.go:146: 2024-05-08T10:42:54.439+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-9/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.440+0200 INFO no watch operations for client, skip persisting {"client-id": 10}
+logger.go:146: 2024-05-08T10:42:54.440+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-10/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.441+0200 INFO Saving visualization {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html"}
 ```
 
-Report includes multiple types of files:
-* Member db files, can be used to verify disk/memory corruption.
-* Watch responses saved as json, can be used to validate [watch API guarantees].
-* Operation history saved as both html visualization and a json, can be used to validate [KV API guarantees].
+The report follows this hierarchy:
+* `server-*` - etcd server data directories, can be used to verify disk/memory corruption.
+  * `member`
+    * `wal` - Write Ahead Log (WAL) directory, can be analysed using the `etcd-dump-logs` command line tool available in the `tools` directory.
+    * `snap` - Snapshot directory, includes the bbolt database file `db`, which can be analysed using the `etcd-dump-db` command line tool available in the `tools` directory.
+* `client-*` - Client request and response dumps in json format.
+  * `watch.json` - Watch requests and responses, can be used to validate [watch API guarantees].
+  * `operations.json` - KV operation history.
+* `history.html` - Visualization of KV operation history, can be used to validate [KV API guarantees].
 
 ### Example analysis of linearization issue
 
 Let's reproduce and analyse the robustness test report for issue [#14370].
 To reproduce the issue yourself, run `make test-robustness-issue14370`.
-After a couple of tries robustness tests should fail with a log `Model is not linearizable` and save report locally.
+After a couple of tries the robustness tests should fail with the log `Linearization failed` and save a report locally.
 
-Lineralization issues are easiest to analyse via history visualization.
-Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser.
+Example:
+```
+logger.go:146: 2024-05-08T10:42:53.379+0200 INFO Validating linearizable operations {"timeout": "5m0s"}
+logger.go:146: 2024-05-08T10:42:54.429+0200 ERROR Linearization failed {"duration": "1.050105973s"}
+validate.go:39: Failed linearization, skipping further validation
+logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"}
+...
+logger.go:146: 2024-05-08T10:42:54.441+0200 INFO Saving visualization {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html"}
+```
+
+Linearization issues are easiest to analyse via the history visualization.
+Open the `/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html` file in your browser.
+Jump to the error in linearization by clicking `[ jump to first error ]` at the top of the page.
 
 You should see a graph similar to the one in the image below.
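Note on the README changes in this hunk: the report layout documented above can also be explored mechanically before reaching for `etcd-dump-logs` or `etcd-dump-db`. Below is a minimal, hypothetical Go sketch (not part of this change); the directory and file names are taken from the README excerpt, and the `reportDir` value from the example logs, so both would differ for your own run.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	// Report path taken from the example log output above; adjust for your run.
	reportDir := "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"

	entries, err := os.ReadDir(reportDir)
	if err != nil {
		fmt.Fprintln(os.Stderr, "cannot read report directory:", err)
		os.Exit(1)
	}
	for _, e := range entries {
		path := filepath.Join(reportDir, e.Name())
		switch {
		case e.IsDir() && strings.HasPrefix(e.Name(), "server-"):
			// etcd data dir: member/wal for etcd-dump-logs, member/snap/db for etcd-dump-db.
			fmt.Println("member data dir: ", filepath.Join(path, "member"))
		case e.IsDir() && strings.HasPrefix(e.Name(), "client-"):
			// Client dumps: operations.json (KV history) and watch.json (watch requests/responses).
			fmt.Println("client dump dir: ", path)
		case e.Name() == "history.html":
			fmt.Println("open in a browser:", path)
		}
	}
}
```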
@@ -114,23 +145,34 @@ Let's reproduce and analyse robustness test report for issue [#15271].
 To reproduce the issue yourself, run `make test-robustness-issue15271`.
 After a couple of tries the robustness tests should fail with the log `Broke watch guarantee` and save a report locally.
 
+Example:
+```
+logger.go:146: 2024-05-08T10:50:11.301+0200 INFO Validating linearizable operations {"timeout": "5m0s"}
+logger.go:146: 2024-05-08T10:50:15.754+0200 INFO Linearization success {"duration": "4.453346487s"}
+logger.go:146: 2024-05-08T10:50:15.754+0200 INFO Validating watch
+logger.go:146: 2024-05-08T10:50:15.849+0200 ERROR Broke watch guarantee {"guarantee": "ordered", "client": 4, "revision": 3}
+validate.go:45: Failed validating watch history, err: broke Ordered - events are ordered by revision; an event will never appear on a watch if it precedes an event in time that has already been posted
+logger.go:146: 2024-05-08T10:50:15.849+0200 INFO Validating serializable operations
+logger.go:146: 2024-05-08T10:50:15.866+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue15271/1715158215866033806"}
+```
 
 Watch issues are easiest to analyse by reading the recorded watch history.
-Watch history is recorded for each client separated in different subdirectory under `/tmp/TestRobustness_Issue15271/`
+Watch history is recorded for each client in a separate subdirectory under `/tmp/TestRobustnessRegression_Issue15271/1715158215866033806`.
 Open `watch.json` for the client mentioned in the `Broke watch guarantee` log.
-For example for client `14` open `/tmp/TestRobustness_Issue15271/client-14/watch.json`.
+For client `4`, which broke the watch guarantee, open `/tmp/TestRobustnessRegression_Issue15271/1715158215866033806/client-4/watch.json`.
 
-Each line consists of json blob corresponding to single watch response observed by client.
-Look for lines with `mod_revision` equal to revision mentioned in the first log with `Broke watch guarantee`
-You should see two lines where the `mod_revision` decreases like ones below:
+Each line consists of a json blob corresponding to a single watch request sent by the client.
+Look for events with `Revision` equal to the revision mentioned in the first `Broke watch guarantee` log, in this case `"Revision":3,`.
+You should see watch responses where the `Revision` decreases, like the ones below:
 ```
-{"Events":[{"Op":{"Type":"put","Key":"5","WithPrefix":false,"Limit":0,"Value":{"Value":"357","Hash":0},"LeaseID":0},"Revision":335}],"IsProgressNotify":false,"Revision":335,"Time":1050415777}
-{"Events":[{"Op":{"Type":"put","Key":"1","WithPrefix":false,"Limit":0,"Value":{"Value":"24","Hash":0},"LeaseID":0},"Revision":24}, ...
+{"Events":[{"Type":"put-operation","Key":"key5","Value":{"Value":"793","Hash":0},"Revision":799,"IsCreate":false,"PrevValue":null}],"IsProgressNotify":false,"Revision":799,"Time":3202907249,"Error":""}
+{"Events":[{"Type":"put-operation","Key":"key4","Value":{"Value":"1","Hash":0},"Revision":3,"IsCreate":true,"PrevValue":null}, ...
 ```
 
-Up to the first line the `revision` of events within responses only increased up to a value of `335`.
-However, the following line includes an event with `revision` equal `24`.
+Up to the first response the `Revision` of events only increased, reaching a value of `799`.
+However, the following line includes an event with `Revision` equal to `3`.
 If you follow the `Revision` throughout the file you should notice that the watch replayed revisions a second time.
-This is incorrect and breaks `Ordered` and `Unique` [watch API guarantees].
-This is consistent with the root cause of [#14370] where member reconnecting to cluster will incorrectly resend revisions.
+This is incorrect and breaks the `Ordered` [watch API guarantees].
+This is consistent with the root cause of [#14370], where a member reconnecting to the cluster resends revisions.
 
 [#15271]: https://github.com/etcd-io/etcd/issues/15271
 
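Note: the manual check described above — scanning `watch.json` for a `Revision` that goes backwards — can also be scripted. Here is a rough Go sketch, under the assumption that each line of `watch.json` is a JSON object whose `Events` entries carry a `Revision` field, as in the sample lines above; the real report schema has more fields, and the hard-coded path is the one from this example.

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"log"
	"os"
)

// watchResponse models only the fields used here; the actual dump contains more.
type watchResponse struct {
	Events []struct {
		Revision int64 `json:"Revision"`
	} `json:"Events"`
	Revision int64 `json:"Revision"`
}

func main() {
	// Path taken from the example analysis above; adjust for your own report.
	f, err := os.Open("/tmp/TestRobustnessRegression_Issue15271/1715158215866033806/client-4/watch.json")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	var maxRev int64
	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, 1024*1024), 16*1024*1024) // watch dumps can have very long lines
	for line := 1; scanner.Scan(); line++ {
		var resp watchResponse
		if err := json.Unmarshal(scanner.Bytes(), &resp); err != nil {
			log.Fatalf("line %d: %v", line, err)
		}
		for _, ev := range resp.Events {
			if ev.Revision < maxRev {
				// Ordered guarantee broken: an event revision went backwards.
				fmt.Printf("line %d: event revision %d after already seeing %d\n", line, ev.Revision, maxRev)
			}
			if ev.Revision > maxRev {
				maxRev = ev.Revision
			}
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
```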
@@ -57,9 +57,6 @@ func (tb triggerBlackhole) Trigger(ctx context.Context, t *testing.T, member e2e
 }
 
 func (tb triggerBlackhole) Available(config e2e.EtcdProcessClusterConfig, process e2e.EtcdProcess) bool {
-    if tb.waitTillSnapshot && config.ServerConfig.SnapshotCatchUpEntries > 100 {
-        return false
-    }
     return config.ClusterSize > 1 && process.PeerProxy() != nil
 }
 
@@ -19,6 +19,8 @@ import (
     "testing"
     "time"
 
+    "github.com/coreos/go-semver/semver"
+
     "go.etcd.io/etcd/api/v3/version"
     "go.etcd.io/etcd/tests/v3/framework/e2e"
     "go.etcd.io/etcd/tests/v3/robustness/failpoint"
@@ -162,19 +164,22 @@ func regressionScenarios(t *testing.T) []testScenario {
             e2e.WithClusterSize(1),
         ),
     })
-    // TODO: Deflake waiting for waiting until snapshot for etcd versions that don't support setting snapshot catchup entries.
-    if v.Compare(version.V3_6) >= 0 {
+    if v.Compare(version.V3_5) >= 0 {
+        opts := []e2e.EPClusterOption{
+            e2e.WithSnapshotCount(100),
+            e2e.WithPeerProxy(true),
+            e2e.WithIsPeerTLS(true),
+        }
+        v3_5_13 := semver.Version{Major: 3, Minor: 5, Patch: 13}
+        if v.Compare(v3_5_13) >= 0 {
+            opts = append(opts, e2e.WithSnapshotCatchUpEntries(100))
+        }
         scenarios = append(scenarios, testScenario{
             name:      "Issue15271",
             failpoint: failpoint.BlackholeUntilSnapshot,
             profile:   traffic.HighTrafficProfile,
             traffic:   traffic.EtcdPut,
-            cluster: *e2e.NewConfig(
-                e2e.WithSnapshotCatchUpEntries(100),
-                e2e.WithSnapshotCount(100),
-                e2e.WithPeerProxy(true),
-                e2e.WithIsPeerTLS(true),
-            ),
+            cluster:   *e2e.NewConfig(opts...),
         })
     }
     return scenarios
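Note on the hunk above: the scenario now enables `WithSnapshotCatchUpEntries` only when the etcd binary is at least v3.5.13, presumably because earlier releases cannot lower the snapshot catch-up entries, which would make waiting for a snapshot impractically slow. Below is a standalone sketch of the same go-semver gating pattern; the option names are placeholders for illustration, not the real e2e options.

```go
package main

import (
	"fmt"

	"github.com/coreos/go-semver/semver"
)

// clusterOptions mirrors the gating logic above with illustrative option names:
// snapshot catch-up entries are assumed configurable only from v3.5.13 onwards.
func clusterOptions(v semver.Version) []string {
	opts := []string{"SnapshotCount=100", "PeerProxy", "PeerTLS"}
	v3_5_13 := semver.Version{Major: 3, Minor: 5, Patch: 13}
	if v.Compare(v3_5_13) >= 0 {
		opts = append(opts, "SnapshotCatchUpEntries=100")
	}
	return opts
}

func main() {
	for _, s := range []string{"3.5.12", "3.5.13", "3.6.0"} {
		v := *semver.New(s)
		fmt.Println(s, "->", clusterOptions(v))
	}
}
```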