Merge pull request #16637 from serathius/check-cluster-id-release-3.5

[release-3.5] etcdserver: add cluster id check for hashKVHandler
Marek Siarkowicz
2023-10-17 20:58:21 +02:00
committed by GitHub
9 changed files with 269 additions and 13 deletions
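The server-side change that gives this PR its title is not visible in the hunks shown here; only the test-side changes are. Conceptually, the peer hashKV handler now compares the cluster ID presented by the requester with the local cluster ID and rejects mismatches before serving a hash. A minimal sketch of that idea follows; the header name, status code, and function name are assumptions for illustration, not the actual handler code.

package sketch

import (
	"fmt"
	"net/http"
)

// validateClusterID is a hedged sketch: reject a hashKV request whose
// cluster ID header does not match the local cluster ID.
func validateClusterID(w http.ResponseWriter, r *http.Request, localClusterID string) bool {
	remote := r.Header.Get("X-Etcd-Cluster-ID") // assumed header name
	if remote != "" && remote != localClusterID {
		http.Error(w, fmt.Sprintf("cluster ID mismatch: got %s, want %s", remote, localClusterID), http.StatusPreconditionFailed)
		return false
	}
	return true
}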


@@ -181,6 +181,17 @@ type etcdProcessClusterConfig struct {
// newEtcdProcessCluster launches a new cluster from etcd processes, returning
// a new etcdProcessCluster once all nodes are ready to accept client requests.
func newEtcdProcessCluster(t testing.TB, cfg *etcdProcessClusterConfig) (*etcdProcessCluster, error) {
epc, err := initEtcdProcessCluster(t, cfg)
if err != nil {
return nil, err
}
return startEtcdProcessCluster(epc, cfg)
}

// initEtcdProcessCluster initializes a new cluster based on the given config.
// It doesn't start the cluster.
func initEtcdProcessCluster(t testing.TB, cfg *etcdProcessClusterConfig) (*etcdProcessCluster, error) {
skipInShortMode(t)
etcdCfgs := cfg.etcdServerProcessConfigs(t)
@@ -199,7 +210,11 @@ func newEtcdProcessCluster(t testing.TB, cfg *etcdProcessClusterConfig) (*etcdPr
}
epc.procs[i] = proc
}
return epc, nil
}

// startEtcdProcessCluster launches a new cluster from etcd processes.
func startEtcdProcessCluster(epc *etcdProcessCluster, cfg *etcdProcessClusterConfig) (*etcdProcessCluster, error) {
if cfg.rollingStart {
if err := epc.RollingStart(); err != nil {
return nil, fmt.Errorf("Cannot rolling-start: %v", err)
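The wrapper above keeps the old one-shot behaviour by calling both new helpers, while the init/start split lets a test stage the member configs first and launch the processes only when it is ready (this is what TestInPlaceRecovery below relies on). A small usage sketch inside the same e2e test package; the helper name is hypothetical:

// newClusterStartedLater is a hedged usage sketch, not part of the patch:
// build the cluster's process configs now, start the members later, e.g.
// after an older cluster holding the same ports has been shut down.
func newClusterStartedLater(t *testing.T, cfg *etcdProcessClusterConfig) *etcdProcessCluster {
	epc, err := initEtcdProcessCluster(t, cfg) // configs only, nothing running yet
	if err != nil {
		t.Fatal(err)
	}
	if _, err := startEtcdProcessCluster(epc, cfg); err != nil { // launch all members
		t.Fatal(err)
	}
	return epc
}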


@@ -12,11 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !cluster_proxy
// +build !cluster_proxy

package e2e

import (
"context"
"fmt"
"sync"
"testing"
"time"
@@ -96,6 +100,99 @@ func corruptTest(cx ctlCtx) {
waitReadyExpectProc(proc, []string{fmt.Sprintf("etcdmain: %016x found data inconsistency with peers", id0)})
}

func TestInPlaceRecovery(t *testing.T) {
basePort := 20000
BeforeTest(t)
// Initialize the cluster.
epcOld, err := newEtcdProcessCluster(t,
&etcdProcessClusterConfig{
clusterSize: 3,
initialToken: "old",
keepDataDir: false,
CorruptCheckTime: time.Second,
basePort: basePort,
},
)
if err != nil {
t.Fatalf("could not start etcd process cluster (%v)", err)
}
t.Cleanup(func() {
if errC := epcOld.Close(); errC != nil {
t.Fatalf("error closing etcd processes (%v)", errC)
}
})
t.Log("old cluster started.")
// Put some data into the old cluster, so that after recovering from a blank db, the hash diverges.
t.Log("putting 10 keys...")
oldCc := NewEtcdctl(epcOld.EndpointsV3(), clientNonTLS, false, false)
for i := 0; i < 10; i++ {
err := oldCc.Put(testutil.PickKey(int64(i)), fmt.Sprint(i))
assert.NoError(t, err, "error on put")
}
// Create a new cluster config, but with the same port numbers. In this way the new servers can stay in
// contact with the old ones.
epcNewConfig := &etcdProcessClusterConfig{
clusterSize: 3,
initialToken: "new",
keepDataDir: false,
CorruptCheckTime: time.Second,
basePort: basePort,
initialCorruptCheck: true,
}
epcNew, err := initEtcdProcessCluster(t, epcNewConfig)
if err != nil {
t.Fatalf("could not init etcd process cluster (%v)", err)
}
t.Cleanup(func() {
if errC := epcNew.Close(); errC != nil {
t.Fatalf("error closing etcd processes (%v)", errC)
}
})
newCc := NewEtcdctl(epcNew.EndpointsV3(), clientNonTLS, false, false)
assert.NoError(t, err)
wg := sync.WaitGroup{}
// Rolling recovery of the servers.
t.Log("rolling updating servers in place...")
for i := range epcNew.procs {
oldProc := epcOld.procs[i]
err = oldProc.Close()
if err != nil {
t.Fatalf("could not stop etcd process (%v)", err)
}
t.Logf("old cluster server %d: %s stopped.", i, oldProc.Config().name)
wg.Add(1)
// Start servers in background to avoid blocking on server start.
// EtcdProcess.Start waits until etcd becomes healthy, which will not happen here until we restart at least 2 members.
go func(proc etcdProcess) {
defer wg.Done()
err := proc.Start() // goroutine-local err; the outer err is written concurrently by the loop
if err != nil {
t.Errorf("could not start etcd process (%v)", err)
}
t.Logf("new cluster server: %s started in-place with blank db.", proc.Config().name)
}(epcNew.procs[i])
t.Log("sleeping 5 sec to let nodes do periodical check...")
time.Sleep(5 * time.Second)
}
wg.Wait()
t.Log("new cluster started.")
alarmResponse, err := newCc.AlarmList()
assert.NoError(t, err, "error on alarm list")
for _, alarm := range alarmResponse.Alarms {
if alarm.Alarm == etcdserverpb.AlarmType_CORRUPT {
t.Fatalf("there is no corruption after in-place recovery, but corruption reported.")
}
}
t.Log("no corruption detected.")
}

func TestPeriodicCheckDetectsCorruption(t *testing.T) {
checkTime := time.Second
BeforeTest(t)

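TestInPlaceRecovery above restarts every member in place with a blank database while reusing the old cluster's ports, so old and new members briefly coexist and can reach each other. The point of the new cluster ID check, as the PR title suggests, is that hashKV requests crossing that old/new boundary are rejected instead of producing spurious CORRUPT alarms, which is what the final alarm-list assertion verifies. The same assertion can be written against a plain clientv3 client; a self-contained sketch, with a placeholder endpoint:

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"go.etcd.io/etcd/api/v3/etcdserverpb"
	clientv3 "go.etcd.io/etcd/client/v3"
)

// checkNoCorruptAlarm lists cluster alarms and returns an error if any
// member has raised a CORRUPT alarm, mirroring the test's final check.
func checkNoCorruptAlarm(endpoints []string) error {
	cli, err := clientv3.New(clientv3.Config{Endpoints: endpoints, DialTimeout: 5 * time.Second})
	if err != nil {
		return err
	}
	defer cli.Close()

	resp, err := cli.AlarmList(context.Background())
	if err != nil {
		return err
	}
	for _, a := range resp.Alarms {
		if a.Alarm == etcdserverpb.AlarmType_CORRUPT {
			return fmt.Errorf("member %x reported a CORRUPT alarm", a.MemberID)
		}
	}
	return nil
}

func main() {
	if err := checkNoCorruptAlarm([]string{"localhost:2379"}); err != nil { // placeholder endpoint
		log.Fatal(err)
	}
}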

@@ -52,13 +52,14 @@ func TestCompactionHash(t *testing.T) {
},
}
-	testutil.TestCompactionHash(context.Background(), t, hashTestCase{cc, clus.Members[0].GRPCURL(), client}, 1000)
+	testutil.TestCompactionHash(context.Background(), t, hashTestCase{cc, clus.Members[0].GRPCURL(), client, clus.Members[0].s}, 1000)
}
type hashTestCase struct {
*clientv3.Client
-	url  string
-	http *http.Client
+	url    string
+	http   *http.Client
+	server *etcdserver.EtcdServer
}
func (tc hashTestCase) Put(ctx context.Context, key, value string) error {
@@ -72,7 +73,7 @@ func (tc hashTestCase) Delete(ctx context.Context, key string) error {
}
func (tc hashTestCase) HashByRev(ctx context.Context, rev int64) (testutil.KeyValueHash, error) {
-	resp, err := etcdserver.HashByRev(ctx, tc.http, "http://unix", rev)
+	resp, err := etcdserver.HashByRev(ctx, tc.server.Cluster().ID(), tc.http, "http://unix", rev)
return testutil.KeyValueHash{Hash: resp.Hash, CompactRevision: resp.CompactRevision, Revision: resp.Header.Revision}, err
}