Merge pull request #17967 from serathius/robustness-update-readme

Update the robustness README and fix the #14370 reproduction case

Commit: b8ffc5e8c0
@@ -71,31 +71,62 @@ Errors in etcd model could be causing false positives, which makes the ability t
 
 ## Analysing failure
 
-If robustness tests fails we want to analyse the report to confirm if the issue is on etcd side. Location of this report
-is included in test logs. One of log lines should look like:
+If robustness tests fail, we want to analyse the report to confirm whether the issue is on the etcd side. The location of the directory with the report
+is mentioned in the `Saving robustness test report` log line. Logs from report generation should look like:
 ```
-history.go:34: Model is not linearizable
-logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving member data dir {"member": "TestRobustnessIssue14370-test-0", "path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0"}
-logger.go:130: 2023-03-18T12:18:03.244+0100 INFO Saving watch responses {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/responses.json"}
-logger.go:130: 2023-03-18T12:18:03.247+0100 INFO Saving watch events {"path": "/tmp/TestRobustness_Issue14370/TestRobustnessIssue14370-test-0/events.json"}
-logger.go:130: 2023-05-15T17:42:37.792+0200 INFO Saving operation history {"path": "/tmp/TestRobustness_ClusterOfSize3_Kubernetes/client-1/operations.json"}
-logger.go:130: 2023-05-15T17:42:37.793+0200 INFO Saving watch responses {"path": "/tmp/TestRobustness_ClusterOfSize3_Kubernetes/client-2/watch.json"}
-logger.go:130: 2023-03-18T12:18:03.256+0100 INFO Saving visualization {"path": "/tmp/TestRobustness_Issue14370/history.html"}
+logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"}
+logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving member data dir {"member": "TestRobustnessRegressionIssue14370-test-0", "path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/server-TestRobustnessRegressionIssue14370-test-0"}
+logger.go:146: 2024-05-08T10:42:54.430+0200 INFO no watch operations for client, skip persisting {"client-id": 1}
+logger.go:146: 2024-05-08T10:42:54.430+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-1/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.430+0200 INFO Saving watch operations {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-2/watch.json"}
+logger.go:146: 2024-05-08T10:42:54.431+0200 INFO no KV operations for client, skip persisting {"client-id": 2}
+logger.go:146: 2024-05-08T10:42:54.431+0200 INFO no watch operations for client, skip persisting {"client-id": 3}
+logger.go:146: 2024-05-08T10:42:54.431+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-3/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.433+0200 INFO no watch operations for client, skip persisting {"client-id": 4}
+logger.go:146: 2024-05-08T10:42:54.433+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-4/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.434+0200 INFO no watch operations for client, skip persisting {"client-id": 5}
+logger.go:146: 2024-05-08T10:42:54.434+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-5/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.435+0200 INFO no watch operations for client, skip persisting {"client-id": 6}
+logger.go:146: 2024-05-08T10:42:54.435+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-6/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.437+0200 INFO no watch operations for client, skip persisting {"client-id": 7}
+logger.go:146: 2024-05-08T10:42:54.437+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-7/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.438+0200 INFO no watch operations for client, skip persisting {"client-id": 8}
+logger.go:146: 2024-05-08T10:42:54.438+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-8/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.439+0200 INFO no watch operations for client, skip persisting {"client-id": 9}
+logger.go:146: 2024-05-08T10:42:54.439+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-9/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.440+0200 INFO no watch operations for client, skip persisting {"client-id": 10}
+logger.go:146: 2024-05-08T10:42:54.440+0200 INFO Saving operation history {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/client-10/operations.json"}
+logger.go:146: 2024-05-08T10:42:54.441+0200 INFO Saving visualization {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html"}
 ```
 
-Report includes multiple types of files:
-* Member db files, can be used to verify disk/memory corruption.
-* Watch responses saved as json, can be used to validate [watch API guarantees].
-* Operation history saved as both html visualization and a json, can be used to validate [KV API guarantees].
+The report follows this hierarchy:
+* `server-*` - etcd server data directories, can be used to verify disk/memory corruption.
+  * `member`
+    * `wal` - Write Ahead Log (WAL) directory, can be analysed using the `etcd-dump-logs` command line tool available in the `tools` directory.
+    * `snap` - Snapshot directory, includes the bbolt database file `db`, which can be analysed using the `etcd-dump-db` command line tool available in the `tools` directory.
+* `client-*` - Client request and response dumps in json format.
+  * `watch.json` - Watch requests and responses, can be used to validate [watch API guarantees].
+  * `operations.json` - KV operation history.
+* `history.html` - Visualization of KV operation history, can be used to validate [KV API guarantees].
 
 ### Example analysis of linearization issue
 
 Let's reproduce and analyse the robustness test report for issue [#14370].
 To reproduce the issue yourself, run `make test-robustness-issue14370`.
-After a couple of tries robustness tests should fail with a log `Model is not linearizable` and save report locally.
+After a couple of tries the robustness tests should fail with the log `Linearization failed` and save a report locally.
 
-Lineralization issues are easiest to analyse via history visualization.
-Open `/tmp/TestRobustness_Issue14370/history.html` file in your browser.
+Example:
+```
+logger.go:146: 2024-05-08T10:42:53.379+0200 INFO Validating linearizable operations {"timeout": "5m0s"}
+logger.go:146: 2024-05-08T10:42:54.429+0200 ERROR Linearization failed {"duration": "1.050105973s"}
+validate.go:39: Failed linearization, skipping further validation
+logger.go:146: 2024-05-08T10:42:54.429+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"}
+...
+logger.go:146: 2024-05-08T10:42:54.441+0200 INFO Saving visualization {"path": "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html"}
+```
+
+Linearization issues are easiest to analyse via the history visualization.
+Open the `/tmp/TestRobustnessRegression_Issue14370/1715157774429416550/history.html` file in your browser.
+Jump to the error in linearization by clicking `[ jump to first error ]` at the top of the page.
 
 You should see a graph similar to the one in the image below.
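Note on the README changes in this hunk: the report layout documented above can also be explored mechanically before reaching for `etcd-dump-logs` or `etcd-dump-db`. Below is a minimal, hypothetical Go sketch (not part of this change); the directory and file names are taken from the README excerpt, and the `reportDir` value from the example logs, so both would differ for your own run.

```go
package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
)

func main() {
	// Report path taken from the example log output above; adjust for your run.
	reportDir := "/tmp/TestRobustnessRegression_Issue14370/1715157774429416550"

	entries, err := os.ReadDir(reportDir)
	if err != nil {
		fmt.Fprintln(os.Stderr, "cannot read report directory:", err)
		os.Exit(1)
	}
	for _, e := range entries {
		path := filepath.Join(reportDir, e.Name())
		switch {
		case e.IsDir() && strings.HasPrefix(e.Name(), "server-"):
			// etcd data dir: member/wal for etcd-dump-logs, member/snap/db for etcd-dump-db.
			fmt.Println("member data dir: ", filepath.Join(path, "member"))
		case e.IsDir() && strings.HasPrefix(e.Name(), "client-"):
			// Client dumps: operations.json (KV history) and watch.json (watch requests/responses).
			fmt.Println("client dump dir: ", path)
		case e.Name() == "history.html":
			fmt.Println("open in a browser:", path)
		}
	}
}
```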
@@ -114,23 +145,34 @@ Let's reproduce and analyse robustness test report for issue [#15271].
 To reproduce the issue yourself, run `make test-robustness-issue15271`.
 After a couple of tries the robustness tests should fail with the log `Broke watch guarantee` and save a report locally.
 
+Example:
+```
+logger.go:146: 2024-05-08T10:50:11.301+0200 INFO Validating linearizable operations {"timeout": "5m0s"}
+logger.go:146: 2024-05-08T10:50:15.754+0200 INFO Linearization success {"duration": "4.453346487s"}
+logger.go:146: 2024-05-08T10:50:15.754+0200 INFO Validating watch
+logger.go:146: 2024-05-08T10:50:15.849+0200 ERROR Broke watch guarantee {"guarantee": "ordered", "client": 4, "revision": 3}
+validate.go:45: Failed validating watch history, err: broke Ordered - events are ordered by revision; an event will never appear on a watch if it precedes an event in time that has already been posted
+logger.go:146: 2024-05-08T10:50:15.849+0200 INFO Validating serializable operations
+logger.go:146: 2024-05-08T10:50:15.866+0200 INFO Saving robustness test report {"path": "/tmp/TestRobustnessRegression_Issue15271/1715158215866033806"}
+```
 
 Watch issues are easiest to analyse by reading the recorded watch history.
-Watch history is recorded for each client separated in different subdirectory under `/tmp/TestRobustness_Issue15271/`
+Watch history is recorded for each client in a separate subdirectory under `/tmp/TestRobustnessRegression_Issue15271/1715158215866033806`.
 Open `watch.json` for the client mentioned in the `Broke watch guarantee` log.
-For example for client `14` open `/tmp/TestRobustness_Issue15271/client-14/watch.json`.
+For client `4`, which broke the watch guarantee, open `/tmp/TestRobustnessRegression_Issue15271/1715158215866033806/client-4/watch.json`.
 
-Each line consists of json blob corresponding to single watch response observed by client.
-Look for lines with `mod_revision` equal to revision mentioned in the first log with `Broke watch guarantee`
-You should see two lines where the `mod_revision` decreases like ones below:
+Each line consists of a json blob corresponding to a single watch request sent by the client.
+Look for events with `Revision` equal to the revision mentioned in the first `Broke watch guarantee` log, in this case `"Revision":3,`.
+You should see watch responses where the `Revision` decreases, like the ones below:
 ```
-{"Events":[{"Op":{"Type":"put","Key":"5","WithPrefix":false,"Limit":0,"Value":{"Value":"357","Hash":0},"LeaseID":0},"Revision":335}],"IsProgressNotify":false,"Revision":335,"Time":1050415777}
-{"Events":[{"Op":{"Type":"put","Key":"1","WithPrefix":false,"Limit":0,"Value":{"Value":"24","Hash":0},"LeaseID":0},"Revision":24}, ...
+{"Events":[{"Type":"put-operation","Key":"key5","Value":{"Value":"793","Hash":0},"Revision":799,"IsCreate":false,"PrevValue":null}],"IsProgressNotify":false,"Revision":799,"Time":3202907249,"Error":""}
+{"Events":[{"Type":"put-operation","Key":"key4","Value":{"Value":"1","Hash":0},"Revision":3,"IsCreate":true,"PrevValue":null}, ...
 ```
 
-Up to the first line the `revision` of events within responses only increased up to a value of `335`.
-However, the following line includes an event with `revision` equal `24`.
+Up to the first response the `Revision` of events only increased, reaching a value of `799`.
+However, the following line includes an event with `Revision` equal to `3`.
 If you follow the `Revision` throughout the file you should notice that the watch replayed revisions a second time.
-This is incorrect and breaks `Ordered` and `Unique` [watch API guarantees].
-This is consistent with the root cause of [#14370] where member reconnecting to cluster will incorrectly resend revisions.
+This is incorrect and breaks the `Ordered` [watch API guarantees].
+This is consistent with the root cause of [#14370], where a member reconnecting to the cluster resends revisions.
 
 [#15271]: https://github.com/etcd-io/etcd/issues/15271
 
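Note: the manual check described above — scanning `watch.json` for a `Revision` that goes backwards — can also be scripted. Here is a rough Go sketch, under the assumption that each line of `watch.json` is a JSON object whose `Events` entries carry a `Revision` field, as in the sample lines above; the real report schema has more fields, and the hard-coded path is the one from this example.

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"log"
	"os"
)

// watchResponse models only the fields used here; the actual dump contains more.
type watchResponse struct {
	Events []struct {
		Revision int64 `json:"Revision"`
	} `json:"Events"`
	Revision int64 `json:"Revision"`
}

func main() {
	// Path taken from the example analysis above; adjust for your own report.
	f, err := os.Open("/tmp/TestRobustnessRegression_Issue15271/1715158215866033806/client-4/watch.json")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	var maxRev int64
	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, 1024*1024), 16*1024*1024) // watch dumps can have very long lines
	for line := 1; scanner.Scan(); line++ {
		var resp watchResponse
		if err := json.Unmarshal(scanner.Bytes(), &resp); err != nil {
			log.Fatalf("line %d: %v", line, err)
		}
		for _, ev := range resp.Events {
			if ev.Revision < maxRev {
				// Ordered guarantee broken: an event revision went backwards.
				fmt.Printf("line %d: event revision %d after already seeing %d\n", line, ev.Revision, maxRev)
			}
			if ev.Revision > maxRev {
				maxRev = ev.Revision
			}
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
```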
@@ -57,9 +57,6 @@ func (tb triggerBlackhole) Trigger(ctx context.Context, t *testing.T, member e2e
 }
 
 func (tb triggerBlackhole) Available(config e2e.EtcdProcessClusterConfig, process e2e.EtcdProcess) bool {
-    if tb.waitTillSnapshot && config.ServerConfig.SnapshotCatchUpEntries > 100 {
-        return false
-    }
     return config.ClusterSize > 1 && process.PeerProxy() != nil
 }
 
@@ -19,6 +19,8 @@ import (
     "testing"
     "time"
 
+    "github.com/coreos/go-semver/semver"
+
     "go.etcd.io/etcd/api/v3/version"
     "go.etcd.io/etcd/tests/v3/framework/e2e"
     "go.etcd.io/etcd/tests/v3/robustness/failpoint"
@@ -162,19 +164,22 @@ func regressionScenarios(t *testing.T) []testScenario {
             e2e.WithClusterSize(1),
         ),
     })
-    // TODO: Deflake waiting for waiting until snapshot for etcd versions that don't support setting snapshot catchup entries.
-    if v.Compare(version.V3_6) >= 0 {
+    if v.Compare(version.V3_5) >= 0 {
+        opts := []e2e.EPClusterOption{
+            e2e.WithSnapshotCount(100),
+            e2e.WithPeerProxy(true),
+            e2e.WithIsPeerTLS(true),
+        }
+        v3_5_13 := semver.Version{Major: 3, Minor: 5, Patch: 13}
+        if v.Compare(v3_5_13) >= 0 {
+            opts = append(opts, e2e.WithSnapshotCatchUpEntries(100))
+        }
         scenarios = append(scenarios, testScenario{
             name:      "Issue15271",
             failpoint: failpoint.BlackholeUntilSnapshot,
             profile:   traffic.HighTrafficProfile,
             traffic:   traffic.EtcdPut,
-            cluster: *e2e.NewConfig(
-                e2e.WithSnapshotCatchUpEntries(100),
-                e2e.WithSnapshotCount(100),
-                e2e.WithPeerProxy(true),
-                e2e.WithIsPeerTLS(true),
-            ),
+            cluster:   *e2e.NewConfig(opts...),
         })
     }
     return scenarios
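Note on the hunk above: the scenario now enables `WithSnapshotCatchUpEntries` only when the etcd binary is at least v3.5.13, presumably because earlier releases cannot lower the snapshot catch-up entries, which would make waiting for a snapshot impractically slow. Below is a standalone sketch of the same go-semver gating pattern; the option names are placeholders for illustration, not the real e2e options.

```go
package main

import (
	"fmt"

	"github.com/coreos/go-semver/semver"
)

// clusterOptions mirrors the gating logic above with illustrative option names:
// snapshot catch-up entries are assumed configurable only from v3.5.13 onwards.
func clusterOptions(v semver.Version) []string {
	opts := []string{"SnapshotCount=100", "PeerProxy", "PeerTLS"}
	v3_5_13 := semver.Version{Major: 3, Minor: 5, Patch: 13}
	if v.Compare(v3_5_13) >= 0 {
		opts = append(opts, "SnapshotCatchUpEntries=100")
	}
	return opts
}

func main() {
	for _, s := range []string{"3.5.12", "3.5.13", "3.6.0"} {
		v := *semver.New(s)
		fmt.Println(s, "->", clusterOptions(v))
	}
}
```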