// Copyright 2022 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package robustness

import (
	"context"
	"path/filepath"
	"testing"
	"time"

	"go.uber.org/zap"
	"go.uber.org/zap/zaptest"
	"golang.org/x/sync/errgroup"

	"go.etcd.io/etcd/api/v3/version"
	"go.etcd.io/etcd/tests/v3/framework/e2e"
	"go.etcd.io/etcd/tests/v3/robustness/identity"
	"go.etcd.io/etcd/tests/v3/robustness/model"
	"go.etcd.io/etcd/tests/v3/robustness/report"
	"go.etcd.io/etcd/tests/v3/robustness/traffic"
	"go.etcd.io/etcd/tests/v3/robustness/validate"
)

// TrafficProfile pairs a traffic pattern with the QPS profile to run it at.
type TrafficProfile struct {
	Traffic traffic.Traffic
	Profile traffic.Profile
}

var trafficProfiles = []TrafficProfile{
	{
		Traffic: traffic.EtcdPut,
		Profile: traffic.HighTrafficProfile,
	},
	{
		Traffic: traffic.EtcdPutDeleteLease,
		Profile: traffic.LowTraffic,
	},
	{
		Traffic: traffic.Kubernetes,
		Profile: traffic.HighTrafficProfile,
	},
	{
		Traffic: traffic.Kubernetes,
		Profile: traffic.LowTraffic,
	},
}

func TestRobustness(t *testing.T) {
	testRunner.BeforeTest(t)
	v, err := e2e.GetVersionFromBinary(e2e.BinPath.Etcd)
	if err != nil {
		t.Fatalf("Failed checking etcd version binary, binary: %q, err: %v", e2e.BinPath.Etcd, err)
	}
	enableLazyFS := e2e.BinPath.LazyFSAvailable()
	baseOptions := []e2e.EPClusterOption{
		e2e.WithSnapshotCount(100),
		e2e.WithGoFailEnabled(true),
		e2e.WithCompactionBatchLimit(100),
		e2e.WithWatchProcessNotifyInterval(100 * time.Millisecond),
	}
	scenarios := []testScenario{}
	for _, tp := range trafficProfiles {
		name := filepath.Join(tp.Traffic.Name(), tp.Profile.Name, "ClusterOfSize1")
		clusterOfSize1Options := baseOptions
		clusterOfSize1Options = append(clusterOfSize1Options, e2e.WithClusterSize(1))
		// Add LazyFS only for traffic with lower QPS, as LazyFS uses a lot of CPU and lowers the achievable QPS.
		if enableLazyFS && tp.Profile.MinimalQPS <= 100 {
			clusterOfSize1Options = append(clusterOfSize1Options, e2e.WithLazyFSEnabled(true))
			name = filepath.Join(name, "LazyFS")
		}
		scenarios = append(scenarios, testScenario{
			name:    name,
			traffic: tp.Traffic,
			profile: tp.Profile,
			cluster: *e2e.NewConfig(clusterOfSize1Options...),
		})
	}

	for _, tp := range trafficProfiles {
		name := filepath.Join(tp.Traffic.Name(), tp.Profile.Name, "ClusterOfSize3")
		clusterOfSize3Options := baseOptions
		clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithIsPeerTLS(true))
		clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithPeerProxy(true))
		if !v.LessThan(version.V3_6) {
			clusterOfSize3Options = append(clusterOfSize3Options, e2e.WithSnapshotCatchUpEntries(100))
		}
		scenarios = append(scenarios, testScenario{
			name:    name,
			traffic: tp.Traffic,
			profile: tp.Profile,
			cluster: *e2e.NewConfig(clusterOfSize3Options...),
		})
	}

	scenarios = append(scenarios, testScenario{
		name:      "Issue14370",
		failpoint: RaftBeforeSavePanic,
		profile:   traffic.LowTraffic,
		traffic:   traffic.EtcdPutDeleteLease,
		cluster: *e2e.NewConfig(
			e2e.WithClusterSize(1),
			e2e.WithGoFailEnabled(true),
		),
	})
	scenarios = append(scenarios, testScenario{
		name:      "Issue14685",
		failpoint: DefragBeforeCopyPanic,
		profile:   traffic.LowTraffic,
		traffic:   traffic.EtcdPutDeleteLease,
		cluster: *e2e.NewConfig(
			e2e.WithClusterSize(1),
			e2e.WithGoFailEnabled(true),
		),
	})
	scenarios = append(scenarios, testScenario{
		name:      "Issue13766",
		failpoint: KillFailpoint,
		profile:   traffic.HighTrafficProfile,
		traffic:   traffic.EtcdPut,
		cluster: *e2e.NewConfig(
			e2e.WithSnapshotCount(100),
		),
	})
	scenarios = append(scenarios, testScenario{
		name: "Issue15220",
		watch: watchConfig{
			requestProgress: true,
		},
		profile: traffic.LowTraffic,
		traffic: traffic.EtcdPutDeleteLease,
		cluster: *e2e.NewConfig(
			e2e.WithClusterSize(1),
		),
	})
	// TODO: Deflake waiting until snapshot for etcd versions that don't support setting snapshot catchup entries.
	if v.Compare(version.V3_6) >= 0 {
		scenarios = append(scenarios, testScenario{
			name:      "Issue15271",
			failpoint: BlackholeUntilSnapshot,
			profile:   traffic.HighTrafficProfile,
			traffic:   traffic.EtcdPut,
			cluster: *e2e.NewConfig(
				e2e.WithSnapshotCatchUpEntries(100),
				e2e.WithSnapshotCount(100),
				e2e.WithPeerProxy(true),
				e2e.WithIsPeerTLS(true),
			),
		})
	}
	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			lg := zaptest.NewLogger(t)
			scenario.cluster.Logger = lg
			ctx := context.Background()
			testRobustness(ctx, t, lg, scenario)
		})
	}
}

// testScenario describes a single robustness test case: the traffic pattern and QPS profile,
// the failpoint to inject, the watch configuration, and the cluster configuration to run against.
type testScenario struct {
	name      string
	failpoint Failpoint
	cluster   e2e.EtcdProcessClusterConfig
	traffic   traffic.Traffic
	profile   traffic.Profile
	watch     watchConfig
}

func testRobustness(ctx context.Context, t *testing.T, lg *zap.Logger, s testScenario) {
	report := report.TestReport{Logger: lg}
	var err error
	report.Cluster, err = e2e.NewEtcdProcessCluster(ctx, t, e2e.WithConfig(&s.cluster))
	if err != nil {
		t.Fatal(err)
	}
	defer report.Cluster.Close()

	if s.failpoint == nil {
		s.failpoint = pickRandomFailpoint(t, report.Cluster)
	} else {
		err = validateFailpoint(report.Cluster, s.failpoint)
		if err != nil {
			t.Fatal(err)
		}
	}

	// t.Failed() returns false during panicking. We need to forcibly
	// save data on panicking.
	// Refer to: https://github.com/golang/go/issues/49929
	panicked := true
	defer func() {
		report.Report(t, panicked)
	}()
	report.Client = s.run(ctx, t, lg, report.Cluster)
	forcestopCluster(report.Cluster)

	watchProgressNotifyEnabled := report.Cluster.Cfg.ServerConfig.ExperimentalWatchProgressNotifyInterval != 0
	validateGotAtLeastOneProgressNotify(t, report.Client, s.watch.requestProgress || watchProgressNotifyEnabled)
	validateConfig := validate.Config{ExpectRevisionUnique: s.traffic.ExpectUniqueRevision()}
	report.Visualize = validate.ValidateAndReturnVisualize(t, lg, validateConfig, report.Client)

	panicked = false
}

// run injects failpoints, simulates traffic, and collects watch events concurrently,
// returning the merged client reports.
func (s testScenario) run(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster) (reports []report.ClientReport) {
	g := errgroup.Group{}
	var operationReport, watchReport []report.ClientReport
	finishTraffic := make(chan struct{})

	// baseTime is captured with time.Now() so that later measurements use its monotonic clock reading,
	// see https://github.com/golang/go/blob/master/src/time/time.go#L17
	baseTime := time.Now()
	ids := identity.NewIdProvider()
	g.Go(func() error {
		defer close(finishTraffic)
		// Inject failpoints while traffic is running; after injection, let traffic
		// continue for one more second before signaling it to finish.
		injectFailpoints(ctx, t, lg, clus, s.failpoint)
		time.Sleep(time.Second)
		return nil
	})
	maxRevisionChan := make(chan int64, 1)
	g.Go(func() error {
		defer close(maxRevisionChan)
		operationReport = traffic.SimulateTraffic(ctx, t, lg, clus, s.profile, s.traffic, finishTraffic, baseTime, ids)
		maxRevisionChan <- operationsMaxRevision(operationReport)
		return nil
	})
	g.Go(func() error {
		watchReport = collectClusterWatchEvents(ctx, t, clus, maxRevisionChan, s.watch, baseTime, ids)
		return nil
	})
	g.Wait()
	return append(operationReport, watchReport...)
}

// operationsMaxRevision returns the highest revision observed across all client key-value responses.
func operationsMaxRevision(reports []report.ClientReport) int64 {
	var maxRevision int64
	for _, r := range reports {
		for _, op := range r.KeyValue {
			resp := op.Output.(model.MaybeEtcdResponse)
			if resp.Revision > maxRevision {
				maxRevision = resp.Revision
			}
		}
	}
	return maxRevision
}

// forcestopCluster stops all etcd members with a kill signal.
func forcestopCluster(clus *e2e.EtcdProcessCluster) error {
	for _, member := range clus.Procs {
		member.Kill()
	}
	return clus.ConcurrentStop()
}
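
// The sketch below illustrates how a new reproduction scenario could be registered in
// TestRobustness. It reuses only identifiers already defined in this file; the name
// "IssueNNNNN" is a placeholder for an issue number, not a real entry:
//
//	scenarios = append(scenarios, testScenario{
//		name:      "IssueNNNNN",
//		failpoint: KillFailpoint,
//		profile:   traffic.LowTraffic,
//		traffic:   traffic.EtcdPutDeleteLease,
//		cluster: *e2e.NewConfig(
//			e2e.WithClusterSize(1),
//			e2e.WithGoFailEnabled(true),
//		),
//	})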