From 487fab0e2b8b874a80f0e4ab7d12d25a064888b5 Mon Sep 17 00:00:00 2001
From: stasatdaglabs <39559713+stasatdaglabs@users.noreply.github.com>
Date: Wed, 7 Jul 2021 16:14:22 +0300
Subject: [PATCH] Implement a stability test to stress test the DAA algorithm
 (#1767)

* Copy over boilerplate and begin implementing TestDAA.

* Implement a fairly reliable method of hashing at a certain hashrate.

* Convert the DAA test to an application.

* Start kaspad and make sure that hashrate throttling works with that as well.

* Finish implementing testConstantHashRate.

* Tidied up a bit.

* Convert TestDAA back into a go test.

* Reorganize TestDAA to be more like a traditional test.

* Add sudden hashrate drop/jump tests.

* Simplify targetHashNanosecondsFunction.

* Improve progress logs.

* Add more tests.

* Remove the no-longer relevant `hashes` part of targetHashNanosecondsFunction.

* Implement a constant hashrate increase test.

* Implement a constant hashrate decrease test.

* Give the correct run duration to the constant hashrate decrease test.

* Add cooldowns to exponential functions.

* Add run.sh to the DAA test.

* Add a README.

* Add `daa` to run-slow.sh.

* Make go lint happy.

* Fix the README's title.

* Keep running tests even if one of them failed on high block rate deviation.

* Fix hashrate peak/valley tests.

* Preallocate arrays for hash and mining durations.

* Add more statistics to the "mined block" log.

* Make sure runDAATest stops when it's suppposed to.

* Add a newline after "5 minute cooldown."

* Fix variable names.

* Rename totalElapsedTime to tatalElapsedDuration.

* In measureMachineHashNanoseconds, generate a random nonce only once.

* In runDAATest, add "DAA" to the start/finish log.

* Remove --logdir from kaspadRunCommand.

* In runDAATest, enlarge the nonce range to the entirety of uint64.

* Explain what targetHashNanosecondsFunction is.

* Move RunKaspadForTesting into common.

* Rename runForDuration to loopForDuration.

* Make go lint happy.

* Extract fetchBlockForMining to a separate function.

* Extract waitUntilTargetHashDurationHadElapsed to a separate function.

* Extract pushHashDuration and pushMiningDuration to separate functions.

* Extract logMinedBlockStatsAndUpdateStatFields to a separate function.

* Extract submitMinedBlock to a separate function.

* Extract tryNonceForMiningAndIncrementNonce to a separate function.

* Add comments.

* Use a rolling average instead of appending to an array for performance/accuracy.

* Change a word in a comment.

* Explain why we wait for five minutes at the end of the exponential increase/decrease tests.

Co-authored-by: Svarog <feanorr@gmail.com>
---
 stability-tests/common/run-kaspad.go    |  53 ++++
 stability-tests/daa/README.md           |  11 +
 stability-tests/daa/average_duration.go |  37 +++
 stability-tests/daa/daa_test.go         | 334 ++++++++++++++++++++++++
 stability-tests/daa/run/run.sh          |  13 +
 stability-tests/run/run-slow.sh         |   4 +
 6 files changed, 452 insertions(+)
 create mode 100644 stability-tests/common/run-kaspad.go
 create mode 100644 stability-tests/daa/README.md
 create mode 100644 stability-tests/daa/average_duration.go
 create mode 100644 stability-tests/daa/daa_test.go
 create mode 100755 stability-tests/daa/run/run.sh

diff --git a/stability-tests/common/run-kaspad.go b/stability-tests/common/run-kaspad.go
new file mode 100644
index 000000000..953775ae8
--- /dev/null
+++ b/stability-tests/common/run-kaspad.go
@@ -0,0 +1,53 @@
+package common
+
+import (
+	"fmt"
+	"github.com/kaspanet/kaspad/domain/dagconfig"
+	"os"
+	"sync/atomic"
+	"syscall"
+	"testing"
+)
+
+// RunKaspadForTesting runs kaspad for testing purposes
+func RunKaspadForTesting(t *testing.T, testName string, rpcAddress string) func() {
+	appDir, err := TempDir(testName)
+	if err != nil {
+		t.Fatalf("TempDir: %s", err)
+	}
+
+	kaspadRunCommand, err := StartCmd("KASPAD",
+		"kaspad",
+		NetworkCliArgumentFromNetParams(&dagconfig.DevnetParams),
+		"--appdir", appDir,
+		"--rpclisten", rpcAddress,
+		"--loglevel", "debug",
+	)
+	if err != nil {
+		t.Fatalf("StartCmd: %s", err)
+	}
+	t.Logf("Kaspad started with --appdir=%s", appDir)
+
+	isShutdown := uint64(0)
+	go func() {
+		err := kaspadRunCommand.Wait()
+		if err != nil {
+			if atomic.LoadUint64(&isShutdown) == 0 {
+				panic(fmt.Sprintf("Kaspad closed unexpectedly: %s. See logs at: %s", err, appDir))
+			}
+		}
+	}()
+
+	return func() {
+		err := kaspadRunCommand.Process.Signal(syscall.SIGTERM)
+		if err != nil {
+			t.Fatalf("Signal: %s", err)
+		}
+		err = os.RemoveAll(appDir)
+		if err != nil {
+			t.Fatalf("RemoveAll: %s", err)
+		}
+		atomic.StoreUint64(&isShutdown, 1)
+		t.Logf("Kaspad stopped")
+	}
+}
diff --git a/stability-tests/daa/README.md b/stability-tests/daa/README.md
new file mode 100644
index 000000000..8e3315e7f
--- /dev/null
+++ b/stability-tests/daa/README.md
@@ -0,0 +1,11 @@
+# DAA tool
+
+This tool simulates various hashrate patterns to stress-test the DAA algorithm 
+
+## Running
+
+1. `go install` kaspad and daa.
+2. `cd run`
+3. `./run.sh`
+
+
diff --git a/stability-tests/daa/average_duration.go b/stability-tests/daa/average_duration.go
new file mode 100644
index 000000000..b10c76744
--- /dev/null
+++ b/stability-tests/daa/average_duration.go
@@ -0,0 +1,37 @@
+package daa
+
+import "time"
+
+type averageDuration struct {
+	average    float64
+	count      uint64
+	sampleSize uint64
+}
+
+func newAverageDuration(sampleSize uint64) *averageDuration {
+	return &averageDuration{
+		average:    0,
+		count:      0,
+		sampleSize: sampleSize,
+	}
+}
+
+func (ad *averageDuration) add(duration time.Duration) {
+	durationNanoseconds := float64(duration.Nanoseconds())
+
+	ad.count++
+	if ad.count > ad.sampleSize {
+		ad.count = ad.sampleSize
+	}
+
+	if ad.count == 1 {
+		ad.average = durationNanoseconds
+		return
+	}
+
+	ad.average = ad.average + ((durationNanoseconds - ad.average) / float64(ad.count))
+}
+
+func (ad *averageDuration) toDuration() time.Duration {
+	return time.Duration(ad.average)
+}
diff --git a/stability-tests/daa/daa_test.go b/stability-tests/daa/daa_test.go
new file mode 100644
index 000000000..e989b685f
--- /dev/null
+++ b/stability-tests/daa/daa_test.go
@@ -0,0 +1,334 @@
+package daa
+
+import (
+	"github.com/kaspanet/kaspad/app/appmessage"
+	"github.com/kaspanet/kaspad/domain/consensus/model/externalapi"
+	"github.com/kaspanet/kaspad/domain/consensus/utils/pow"
+	"github.com/kaspanet/kaspad/domain/dagconfig"
+	"github.com/kaspanet/kaspad/infrastructure/network/rpcclient"
+	"github.com/kaspanet/kaspad/stability-tests/common"
+	"github.com/kaspanet/kaspad/util/difficulty"
+	"math"
+	"math/big"
+	"math/rand"
+	"os"
+	"testing"
+	"time"
+)
+
+const rpcAddress = "localhost:9000"
+const miningAddress = "kaspadev:qrcqat6l9zcjsu7swnaztqzrv0s7hu04skpaezxk43y4etj8ncwfkuhy0zmax"
+const blockRateDeviationThreshold = 0.5
+const averageBlockRateSampleSize = 60
+const averageHashRateSampleSize = 100_000
+
+func TestDAA(t *testing.T) {
+	if os.Getenv("RUN_STABILITY_TESTS") == "" {
+		t.Skip()
+	}
+
+	machineHashNanoseconds := measureMachineHashNanoseconds(t)
+	t.Logf("Machine hashes per second: %d", hashNanosecondsToHashesPerSecond(machineHashNanoseconds))
+
+	tests := []struct {
+		name        string
+		runDuration time.Duration
+
+		// targetHashNanosecondsFunction receives the duration of time between now and the start
+		// of the run (moments before the first hash has been calculated). It returns the target
+		// duration of a single hash operation in nanoseconds (greater return value = lower hash rate)
+		targetHashNanosecondsFunction func(totalElapsedDuration time.Duration) int64
+	}{
+		{
+			name:        "constant hash rate",
+			runDuration: 10 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				return machineHashNanoseconds * 2
+			},
+		},
+		{
+			name:        "sudden hash rate drop",
+			runDuration: 15 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				if totalElapsedDuration < 5*time.Minute {
+					return machineHashNanoseconds * 2
+				}
+				return machineHashNanoseconds * 10
+			},
+		},
+		{
+			name:        "sudden hash rate jump",
+			runDuration: 15 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				if totalElapsedDuration < 5*time.Minute {
+					return machineHashNanoseconds * 10
+				}
+				return machineHashNanoseconds * 2
+			},
+		},
+		{
+			name:        "hash rate peak",
+			runDuration: 10 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				if totalElapsedDuration > 4*time.Minute && totalElapsedDuration < 5*time.Minute {
+					return machineHashNanoseconds * 2
+				}
+				return machineHashNanoseconds * 10
+			},
+		},
+		{
+			name:        "hash rate valley",
+			runDuration: 10 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				if totalElapsedDuration > 4*time.Minute && totalElapsedDuration < 5*time.Minute {
+					return machineHashNanoseconds * 10
+				}
+				return machineHashNanoseconds * 2
+			},
+		},
+		{
+			name:        "periodic hash rate peaks",
+			runDuration: 10 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				if int(totalElapsedDuration.Seconds())%30 == 0 {
+					return machineHashNanoseconds * 2
+				}
+				return machineHashNanoseconds * 10
+			},
+		},
+		{
+			name:        "periodic hash rate valleys",
+			runDuration: 10 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				if int(totalElapsedDuration.Seconds())%30 == 0 {
+					return machineHashNanoseconds * 10
+				}
+				return machineHashNanoseconds * 2
+			},
+		},
+		{
+			name:        "constant exponential hash rate increase",
+			runDuration: 15 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				fromHashNanoseconds := machineHashNanoseconds * 10
+				toHashNanoseconds := machineHashNanoseconds * 2
+
+				if totalElapsedDuration < 10*time.Minute {
+					exponentialIncreaseDuration := 10 * time.Minute
+					timeElapsedFraction := float64(totalElapsedDuration.Nanoseconds()) / float64(exponentialIncreaseDuration.Nanoseconds())
+
+					return fromHashNanoseconds -
+						int64(math.Pow(float64(fromHashNanoseconds-toHashNanoseconds), timeElapsedFraction))
+				}
+
+				// 5 minute cooldown. We expect the DAA to still be "catching up" at the end
+				// of the exponential increase so, for the sake of testing, we wait a while for
+				// the hash rate to stabilize
+
+				return toHashNanoseconds
+			},
+		},
+		{
+			name:        "constant exponential hash rate decrease",
+			runDuration: 15 * time.Minute,
+			targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 {
+				fromHashNanoseconds := machineHashNanoseconds * 2
+				toHashNanoseconds := machineHashNanoseconds * 10
+
+				if totalElapsedDuration < 10*time.Minute {
+					exponentialDecreaseDuration := 10 * time.Minute
+					timeElapsedFraction := float64(totalElapsedDuration.Nanoseconds()) / float64(exponentialDecreaseDuration.Nanoseconds())
+
+					return fromHashNanoseconds +
+						int64(math.Pow(float64(toHashNanoseconds-fromHashNanoseconds), timeElapsedFraction))
+				}
+
+				// 5 minute cooldown. We expect the DAA to still be "catching up" at the end
+				// of the exponential decrease so, for the sake of testing, we wait a while for
+				// the hash rate to stabilize
+
+				return toHashNanoseconds
+			},
+		},
+	}
+
+	for _, test := range tests {
+		runDAATest(t, test.name, test.runDuration, test.targetHashNanosecondsFunction)
+	}
+}
+
+func measureMachineHashNanoseconds(t *testing.T) int64 {
+	t.Logf("Measuring machine hash rate")
+	defer t.Logf("Finished measuring machine hash rate")
+
+	genesisBlock := dagconfig.DevnetParams.GenesisBlock
+	targetDifficulty := difficulty.CompactToBig(genesisBlock.Header.Bits())
+	headerForMining := genesisBlock.Header.ToMutable()
+
+	machineHashesPerSecondMeasurementDuration := 10 * time.Second
+	hashes := int64(0)
+	nonce := rand.Uint64()
+	loopForDuration(machineHashesPerSecondMeasurementDuration, func(isFinished *bool) {
+		headerForMining.SetNonce(nonce)
+		pow.CheckProofOfWorkWithTarget(headerForMining, targetDifficulty)
+		hashes++
+		nonce++
+	})
+
+	return machineHashesPerSecondMeasurementDuration.Nanoseconds() / hashes
+}
+
+func runDAATest(t *testing.T, testName string, runDuration time.Duration,
+	targetHashNanosecondsFunction func(totalElapsedDuration time.Duration) int64) {
+
+	t.Logf("DAA TEST STARTED: %s", testName)
+	defer t.Logf("DAA TEST FINISHED: %s", testName)
+
+	tearDownKaspad := common.RunKaspadForTesting(t, "kaspad-daa-test", rpcAddress)
+	defer tearDownKaspad()
+
+	rpcClient, err := rpcclient.NewRPCClient(rpcAddress)
+	if err != nil {
+		t.Fatalf("NewRPCClient: %s", err)
+	}
+
+	// These variables are for gathering stats. Useful mostly for debugging
+	averageHashDuration := newAverageDuration(averageHashRateSampleSize)
+	averageMiningDuration := newAverageDuration(averageBlockRateSampleSize)
+	previousDifficulty := float64(0)
+	blocksMined := 0
+
+	// Mine blocks the same way a CPU miner mines blocks until `runDuration` elapses
+	startTime := time.Now()
+	loopForDuration(runDuration, func(isFinished *bool) {
+		templateBlock := fetchBlockForMining(t, rpcClient)
+		targetDifficulty := difficulty.CompactToBig(templateBlock.Header.Bits())
+		headerForMining := templateBlock.Header.ToMutable()
+
+		// Try hashes until we find a valid block
+		miningStartTime := time.Now()
+		nonce := rand.Uint64()
+		for {
+			hashStartTime := time.Now()
+
+			blockFound := tryNonceForMiningAndIncrementNonce(headerForMining, &nonce, targetDifficulty, templateBlock)
+			if blockFound {
+				break
+			}
+
+			// Throttle the hash rate by waiting until the target hash duration elapses
+			waitUntilTargetHashDurationHadElapsed(startTime, hashStartTime, targetHashNanosecondsFunction)
+
+			// Collect stats about hash rate
+			hashDuration := time.Since(hashStartTime)
+			averageHashDuration.add(hashDuration)
+
+			// Exit early if the test is finished
+			if *isFinished {
+				return
+			}
+		}
+
+		// Collect stats about block rate
+		miningDuration := time.Since(miningStartTime)
+		averageMiningDuration.add(miningDuration)
+
+		logMinedBlockStatsAndUpdateStatFields(t, rpcClient, averageMiningDuration, averageHashDuration, startTime,
+			miningDuration, &previousDifficulty, &blocksMined)
+
+		// Exit early if the test is finished
+		if *isFinished {
+			return
+		}
+
+		submitMinedBlock(t, rpcClient, templateBlock)
+	})
+
+	averageMiningDurationInSeconds := averageMiningDuration.toDuration().Seconds()
+	expectedAverageMiningDurationInSeconds := float64(1)
+	deviation := math.Abs(expectedAverageMiningDurationInSeconds - averageMiningDurationInSeconds)
+	if deviation > blockRateDeviationThreshold {
+		t.Errorf("Block rate deviation %f is higher than threshold %f. Want: %f, got: %f",
+			deviation, blockRateDeviationThreshold, expectedAverageMiningDurationInSeconds, averageMiningDurationInSeconds)
+	}
+}
+
+func fetchBlockForMining(t *testing.T, rpcClient *rpcclient.RPCClient) *externalapi.DomainBlock {
+	getBlockTemplateResponse, err := rpcClient.GetBlockTemplate(miningAddress)
+	if err != nil {
+		t.Fatalf("GetBlockTemplate: %s", err)
+	}
+	templateBlock, err := appmessage.RPCBlockToDomainBlock(getBlockTemplateResponse.Block)
+	if err != nil {
+		t.Fatalf("RPCBlockToDomainBlock: %s", err)
+	}
+	return templateBlock
+}
+
+func tryNonceForMiningAndIncrementNonce(headerForMining externalapi.MutableBlockHeader, nonce *uint64,
+	targetDifficulty *big.Int, templateBlock *externalapi.DomainBlock) bool {
+
+	headerForMining.SetNonce(*nonce)
+	if pow.CheckProofOfWorkWithTarget(headerForMining, targetDifficulty) {
+		templateBlock.Header = headerForMining.ToImmutable()
+		return true
+	}
+
+	*nonce++
+	return false
+}
+
+func waitUntilTargetHashDurationHadElapsed(startTime time.Time, hashStartTime time.Time,
+	targetHashNanosecondsFunction func(totalElapsedDuration time.Duration) int64) {
+
+	// Yielding a thread in Go takes up to a few milliseconds whereas hashing once
+	// takes a few hundred nanoseconds, so we spin in place instead of e.g. calling time.Sleep()
+	for {
+		targetHashNanoseconds := targetHashNanosecondsFunction(time.Since(startTime))
+		hashElapsedDurationNanoseconds := time.Since(hashStartTime).Nanoseconds()
+		if hashElapsedDurationNanoseconds >= targetHashNanoseconds {
+			break
+		}
+	}
+}
+
+func logMinedBlockStatsAndUpdateStatFields(t *testing.T, rpcClient *rpcclient.RPCClient,
+	averageMiningDuration *averageDuration, averageHashDurations *averageDuration,
+	startTime time.Time, miningDuration time.Duration, previousDifficulty *float64, blocksMined *int) {
+
+	averageMiningDurationAsDuration := averageMiningDuration.toDuration()
+	averageHashNanoseconds := averageHashDurations.toDuration().Nanoseconds()
+	averageHashesPerSecond := hashNanosecondsToHashesPerSecond(averageHashNanoseconds)
+	blockDAGInfoResponse, err := rpcClient.GetBlockDAGInfo()
+	if err != nil {
+		t.Fatalf("GetBlockDAGInfo: %s", err)
+	}
+	difficultyDelta := blockDAGInfoResponse.Difficulty - *previousDifficulty
+	*previousDifficulty = blockDAGInfoResponse.Difficulty
+	*blocksMined++
+	t.Logf("Mined block. Took: %s, average block mining duration: %s, "+
+		"average hashes per second: %d, difficulty delta: %f, time elapsed: %s, blocks mined: %d",
+		miningDuration, averageMiningDurationAsDuration, averageHashesPerSecond, difficultyDelta, time.Since(startTime), *blocksMined)
+}
+
+func submitMinedBlock(t *testing.T, rpcClient *rpcclient.RPCClient, block *externalapi.DomainBlock) {
+	_, err := rpcClient.SubmitBlock(block)
+	if err != nil {
+		t.Fatalf("SubmitBlock: %s", err)
+	}
+}
+
+func hashNanosecondsToHashesPerSecond(hashNanoseconds int64) int64 {
+	return time.Second.Nanoseconds() / hashNanoseconds
+}
+
+func loopForDuration(duration time.Duration, runFunction func(isFinished *bool)) {
+	isFinished := false
+	go func() {
+		for !isFinished {
+			runFunction(&isFinished)
+		}
+	}()
+	time.Sleep(duration)
+	isFinished = true
+}
diff --git a/stability-tests/daa/run/run.sh b/stability-tests/daa/run/run.sh
new file mode 100755
index 000000000..54160240d
--- /dev/null
+++ b/stability-tests/daa/run/run.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+RUN_STABILITY_TESTS=true go test ../ -v -timeout 86400s
+TEST_EXIT_CODE=$?
+
+echo "Exit code: $TEST_EXIT_CODE"
+
+if [ $TEST_EXIT_CODE -eq 0 ]; then
+  echo "daa test: PASSED"
+  exit 0
+fi
+echo "daa test: FAILED"
+exit 1
diff --git a/stability-tests/run/run-slow.sh b/stability-tests/run/run-slow.sh
index 94d4cc19d..b2f87c433 100755
--- a/stability-tests/run/run-slow.sh
+++ b/stability-tests/run/run-slow.sh
@@ -38,6 +38,10 @@ echo "Running many-tips"
 cd "${PROJECT_ROOT}/many-tips/run" && ./run.sh || failedTests+=("many-tips")
 echo "Done running many-tips"
 
+echo "Running daa"
+cd "${PROJECT_ROOT}/daa/run" && ./run.sh || failedTests+=("daa")
+echo "Done running daa"
+
 echo "Running reorg"
 cd "${PROJECT_ROOT}/reorg/run" && ./run-full-finality-window-reorg.sh || failedTests+=("reorg")
 echo "Done running reorg"