From 487fab0e2b8b874a80f0e4ab7d12d25a064888b5 Mon Sep 17 00:00:00 2001 From: stasatdaglabs <39559713+stasatdaglabs@users.noreply.github.com> Date: Wed, 7 Jul 2021 16:14:22 +0300 Subject: [PATCH] Implement a stability test to stress test the DAA algorithm (#1767) * Copy over boilerplate and begin implementing TestDAA. * Implement a fairly reliable method of hashing at a certain hashrate. * Convert the DAA test to an application. * Start kaspad and make sure that hashrate throttling works with that as well. * Finish implementing testConstantHashRate. * Tidied up a bit. * Convert TestDAA back into a go test. * Reorganize TestDAA to be more like a traditional test. * Add sudden hashrate drop/jump tests. * Simplify targetHashNanosecondsFunction. * Improve progress logs. * Add more tests. * Remove the no-longer relevant `hashes` part of targetHashNanosecondsFunction. * Implement a constant hashrate increase test. * Implement a constant hashrate decrease test. * Give the correct run duration to the constant hashrate decrease test. * Add cooldowns to exponential functions. * Add run.sh to the DAA test. * Add a README. * Add `daa` to run-slow.sh. * Make go lint happy. * Fix the README's title. * Keep running tests even if one of them failed on high block rate deviation. * Fix hashrate peak/valley tests. * Preallocate arrays for hash and mining durations. * Add more statistics to the "mined block" log. * Make sure runDAATest stops when it's suppposed to. * Add a newline after "5 minute cooldown." * Fix variable names. * Rename totalElapsedTime to tatalElapsedDuration. * In measureMachineHashNanoseconds, generate a random nonce only once. * In runDAATest, add "DAA" to the start/finish log. * Remove --logdir from kaspadRunCommand. * In runDAATest, enlarge the nonce range to the entirety of uint64. * Explain what targetHashNanosecondsFunction is. * Move RunKaspadForTesting into common. * Rename runForDuration to loopForDuration. * Make go lint happy. * Extract fetchBlockForMining to a separate function. * Extract waitUntilTargetHashDurationHadElapsed to a separate function. * Extract pushHashDuration and pushMiningDuration to separate functions. * Extract logMinedBlockStatsAndUpdateStatFields to a separate function. * Extract submitMinedBlock to a separate function. * Extract tryNonceForMiningAndIncrementNonce to a separate function. * Add comments. * Use a rolling average instead of appending to an array for performance/accuracy. * Change a word in a comment. * Explain why we wait for five minutes at the end of the exponential increase/decrease tests. Co-authored-by: Svarog --- stability-tests/common/run-kaspad.go | 53 ++++ stability-tests/daa/README.md | 11 + stability-tests/daa/average_duration.go | 37 +++ stability-tests/daa/daa_test.go | 334 ++++++++++++++++++++++++ stability-tests/daa/run/run.sh | 13 + stability-tests/run/run-slow.sh | 4 + 6 files changed, 452 insertions(+) create mode 100644 stability-tests/common/run-kaspad.go create mode 100644 stability-tests/daa/README.md create mode 100644 stability-tests/daa/average_duration.go create mode 100644 stability-tests/daa/daa_test.go create mode 100755 stability-tests/daa/run/run.sh diff --git a/stability-tests/common/run-kaspad.go b/stability-tests/common/run-kaspad.go new file mode 100644 index 000000000..953775ae8 --- /dev/null +++ b/stability-tests/common/run-kaspad.go @@ -0,0 +1,53 @@ +package common + +import ( + "fmt" + "github.com/kaspanet/kaspad/domain/dagconfig" + "os" + "sync/atomic" + "syscall" + "testing" +) + +// RunKaspadForTesting runs kaspad for testing purposes +func RunKaspadForTesting(t *testing.T, testName string, rpcAddress string) func() { + appDir, err := TempDir(testName) + if err != nil { + t.Fatalf("TempDir: %s", err) + } + + kaspadRunCommand, err := StartCmd("KASPAD", + "kaspad", + NetworkCliArgumentFromNetParams(&dagconfig.DevnetParams), + "--appdir", appDir, + "--rpclisten", rpcAddress, + "--loglevel", "debug", + ) + if err != nil { + t.Fatalf("StartCmd: %s", err) + } + t.Logf("Kaspad started with --appdir=%s", appDir) + + isShutdown := uint64(0) + go func() { + err := kaspadRunCommand.Wait() + if err != nil { + if atomic.LoadUint64(&isShutdown) == 0 { + panic(fmt.Sprintf("Kaspad closed unexpectedly: %s. See logs at: %s", err, appDir)) + } + } + }() + + return func() { + err := kaspadRunCommand.Process.Signal(syscall.SIGTERM) + if err != nil { + t.Fatalf("Signal: %s", err) + } + err = os.RemoveAll(appDir) + if err != nil { + t.Fatalf("RemoveAll: %s", err) + } + atomic.StoreUint64(&isShutdown, 1) + t.Logf("Kaspad stopped") + } +} diff --git a/stability-tests/daa/README.md b/stability-tests/daa/README.md new file mode 100644 index 000000000..8e3315e7f --- /dev/null +++ b/stability-tests/daa/README.md @@ -0,0 +1,11 @@ +# DAA tool + +This tool simulates various hashrate patterns to stress-test the DAA algorithm + +## Running + +1. `go install` kaspad and daa. +2. `cd run` +3. `./run.sh` + + diff --git a/stability-tests/daa/average_duration.go b/stability-tests/daa/average_duration.go new file mode 100644 index 000000000..b10c76744 --- /dev/null +++ b/stability-tests/daa/average_duration.go @@ -0,0 +1,37 @@ +package daa + +import "time" + +type averageDuration struct { + average float64 + count uint64 + sampleSize uint64 +} + +func newAverageDuration(sampleSize uint64) *averageDuration { + return &averageDuration{ + average: 0, + count: 0, + sampleSize: sampleSize, + } +} + +func (ad *averageDuration) add(duration time.Duration) { + durationNanoseconds := float64(duration.Nanoseconds()) + + ad.count++ + if ad.count > ad.sampleSize { + ad.count = ad.sampleSize + } + + if ad.count == 1 { + ad.average = durationNanoseconds + return + } + + ad.average = ad.average + ((durationNanoseconds - ad.average) / float64(ad.count)) +} + +func (ad *averageDuration) toDuration() time.Duration { + return time.Duration(ad.average) +} diff --git a/stability-tests/daa/daa_test.go b/stability-tests/daa/daa_test.go new file mode 100644 index 000000000..e989b685f --- /dev/null +++ b/stability-tests/daa/daa_test.go @@ -0,0 +1,334 @@ +package daa + +import ( + "github.com/kaspanet/kaspad/app/appmessage" + "github.com/kaspanet/kaspad/domain/consensus/model/externalapi" + "github.com/kaspanet/kaspad/domain/consensus/utils/pow" + "github.com/kaspanet/kaspad/domain/dagconfig" + "github.com/kaspanet/kaspad/infrastructure/network/rpcclient" + "github.com/kaspanet/kaspad/stability-tests/common" + "github.com/kaspanet/kaspad/util/difficulty" + "math" + "math/big" + "math/rand" + "os" + "testing" + "time" +) + +const rpcAddress = "localhost:9000" +const miningAddress = "kaspadev:qrcqat6l9zcjsu7swnaztqzrv0s7hu04skpaezxk43y4etj8ncwfkuhy0zmax" +const blockRateDeviationThreshold = 0.5 +const averageBlockRateSampleSize = 60 +const averageHashRateSampleSize = 100_000 + +func TestDAA(t *testing.T) { + if os.Getenv("RUN_STABILITY_TESTS") == "" { + t.Skip() + } + + machineHashNanoseconds := measureMachineHashNanoseconds(t) + t.Logf("Machine hashes per second: %d", hashNanosecondsToHashesPerSecond(machineHashNanoseconds)) + + tests := []struct { + name string + runDuration time.Duration + + // targetHashNanosecondsFunction receives the duration of time between now and the start + // of the run (moments before the first hash has been calculated). It returns the target + // duration of a single hash operation in nanoseconds (greater return value = lower hash rate) + targetHashNanosecondsFunction func(totalElapsedDuration time.Duration) int64 + }{ + { + name: "constant hash rate", + runDuration: 10 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + return machineHashNanoseconds * 2 + }, + }, + { + name: "sudden hash rate drop", + runDuration: 15 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + if totalElapsedDuration < 5*time.Minute { + return machineHashNanoseconds * 2 + } + return machineHashNanoseconds * 10 + }, + }, + { + name: "sudden hash rate jump", + runDuration: 15 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + if totalElapsedDuration < 5*time.Minute { + return machineHashNanoseconds * 10 + } + return machineHashNanoseconds * 2 + }, + }, + { + name: "hash rate peak", + runDuration: 10 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + if totalElapsedDuration > 4*time.Minute && totalElapsedDuration < 5*time.Minute { + return machineHashNanoseconds * 2 + } + return machineHashNanoseconds * 10 + }, + }, + { + name: "hash rate valley", + runDuration: 10 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + if totalElapsedDuration > 4*time.Minute && totalElapsedDuration < 5*time.Minute { + return machineHashNanoseconds * 10 + } + return machineHashNanoseconds * 2 + }, + }, + { + name: "periodic hash rate peaks", + runDuration: 10 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + if int(totalElapsedDuration.Seconds())%30 == 0 { + return machineHashNanoseconds * 2 + } + return machineHashNanoseconds * 10 + }, + }, + { + name: "periodic hash rate valleys", + runDuration: 10 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + if int(totalElapsedDuration.Seconds())%30 == 0 { + return machineHashNanoseconds * 10 + } + return machineHashNanoseconds * 2 + }, + }, + { + name: "constant exponential hash rate increase", + runDuration: 15 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + fromHashNanoseconds := machineHashNanoseconds * 10 + toHashNanoseconds := machineHashNanoseconds * 2 + + if totalElapsedDuration < 10*time.Minute { + exponentialIncreaseDuration := 10 * time.Minute + timeElapsedFraction := float64(totalElapsedDuration.Nanoseconds()) / float64(exponentialIncreaseDuration.Nanoseconds()) + + return fromHashNanoseconds - + int64(math.Pow(float64(fromHashNanoseconds-toHashNanoseconds), timeElapsedFraction)) + } + + // 5 minute cooldown. We expect the DAA to still be "catching up" at the end + // of the exponential increase so, for the sake of testing, we wait a while for + // the hash rate to stabilize + + return toHashNanoseconds + }, + }, + { + name: "constant exponential hash rate decrease", + runDuration: 15 * time.Minute, + targetHashNanosecondsFunction: func(totalElapsedDuration time.Duration) int64 { + fromHashNanoseconds := machineHashNanoseconds * 2 + toHashNanoseconds := machineHashNanoseconds * 10 + + if totalElapsedDuration < 10*time.Minute { + exponentialDecreaseDuration := 10 * time.Minute + timeElapsedFraction := float64(totalElapsedDuration.Nanoseconds()) / float64(exponentialDecreaseDuration.Nanoseconds()) + + return fromHashNanoseconds + + int64(math.Pow(float64(toHashNanoseconds-fromHashNanoseconds), timeElapsedFraction)) + } + + // 5 minute cooldown. We expect the DAA to still be "catching up" at the end + // of the exponential decrease so, for the sake of testing, we wait a while for + // the hash rate to stabilize + + return toHashNanoseconds + }, + }, + } + + for _, test := range tests { + runDAATest(t, test.name, test.runDuration, test.targetHashNanosecondsFunction) + } +} + +func measureMachineHashNanoseconds(t *testing.T) int64 { + t.Logf("Measuring machine hash rate") + defer t.Logf("Finished measuring machine hash rate") + + genesisBlock := dagconfig.DevnetParams.GenesisBlock + targetDifficulty := difficulty.CompactToBig(genesisBlock.Header.Bits()) + headerForMining := genesisBlock.Header.ToMutable() + + machineHashesPerSecondMeasurementDuration := 10 * time.Second + hashes := int64(0) + nonce := rand.Uint64() + loopForDuration(machineHashesPerSecondMeasurementDuration, func(isFinished *bool) { + headerForMining.SetNonce(nonce) + pow.CheckProofOfWorkWithTarget(headerForMining, targetDifficulty) + hashes++ + nonce++ + }) + + return machineHashesPerSecondMeasurementDuration.Nanoseconds() / hashes +} + +func runDAATest(t *testing.T, testName string, runDuration time.Duration, + targetHashNanosecondsFunction func(totalElapsedDuration time.Duration) int64) { + + t.Logf("DAA TEST STARTED: %s", testName) + defer t.Logf("DAA TEST FINISHED: %s", testName) + + tearDownKaspad := common.RunKaspadForTesting(t, "kaspad-daa-test", rpcAddress) + defer tearDownKaspad() + + rpcClient, err := rpcclient.NewRPCClient(rpcAddress) + if err != nil { + t.Fatalf("NewRPCClient: %s", err) + } + + // These variables are for gathering stats. Useful mostly for debugging + averageHashDuration := newAverageDuration(averageHashRateSampleSize) + averageMiningDuration := newAverageDuration(averageBlockRateSampleSize) + previousDifficulty := float64(0) + blocksMined := 0 + + // Mine blocks the same way a CPU miner mines blocks until `runDuration` elapses + startTime := time.Now() + loopForDuration(runDuration, func(isFinished *bool) { + templateBlock := fetchBlockForMining(t, rpcClient) + targetDifficulty := difficulty.CompactToBig(templateBlock.Header.Bits()) + headerForMining := templateBlock.Header.ToMutable() + + // Try hashes until we find a valid block + miningStartTime := time.Now() + nonce := rand.Uint64() + for { + hashStartTime := time.Now() + + blockFound := tryNonceForMiningAndIncrementNonce(headerForMining, &nonce, targetDifficulty, templateBlock) + if blockFound { + break + } + + // Throttle the hash rate by waiting until the target hash duration elapses + waitUntilTargetHashDurationHadElapsed(startTime, hashStartTime, targetHashNanosecondsFunction) + + // Collect stats about hash rate + hashDuration := time.Since(hashStartTime) + averageHashDuration.add(hashDuration) + + // Exit early if the test is finished + if *isFinished { + return + } + } + + // Collect stats about block rate + miningDuration := time.Since(miningStartTime) + averageMiningDuration.add(miningDuration) + + logMinedBlockStatsAndUpdateStatFields(t, rpcClient, averageMiningDuration, averageHashDuration, startTime, + miningDuration, &previousDifficulty, &blocksMined) + + // Exit early if the test is finished + if *isFinished { + return + } + + submitMinedBlock(t, rpcClient, templateBlock) + }) + + averageMiningDurationInSeconds := averageMiningDuration.toDuration().Seconds() + expectedAverageMiningDurationInSeconds := float64(1) + deviation := math.Abs(expectedAverageMiningDurationInSeconds - averageMiningDurationInSeconds) + if deviation > blockRateDeviationThreshold { + t.Errorf("Block rate deviation %f is higher than threshold %f. Want: %f, got: %f", + deviation, blockRateDeviationThreshold, expectedAverageMiningDurationInSeconds, averageMiningDurationInSeconds) + } +} + +func fetchBlockForMining(t *testing.T, rpcClient *rpcclient.RPCClient) *externalapi.DomainBlock { + getBlockTemplateResponse, err := rpcClient.GetBlockTemplate(miningAddress) + if err != nil { + t.Fatalf("GetBlockTemplate: %s", err) + } + templateBlock, err := appmessage.RPCBlockToDomainBlock(getBlockTemplateResponse.Block) + if err != nil { + t.Fatalf("RPCBlockToDomainBlock: %s", err) + } + return templateBlock +} + +func tryNonceForMiningAndIncrementNonce(headerForMining externalapi.MutableBlockHeader, nonce *uint64, + targetDifficulty *big.Int, templateBlock *externalapi.DomainBlock) bool { + + headerForMining.SetNonce(*nonce) + if pow.CheckProofOfWorkWithTarget(headerForMining, targetDifficulty) { + templateBlock.Header = headerForMining.ToImmutable() + return true + } + + *nonce++ + return false +} + +func waitUntilTargetHashDurationHadElapsed(startTime time.Time, hashStartTime time.Time, + targetHashNanosecondsFunction func(totalElapsedDuration time.Duration) int64) { + + // Yielding a thread in Go takes up to a few milliseconds whereas hashing once + // takes a few hundred nanoseconds, so we spin in place instead of e.g. calling time.Sleep() + for { + targetHashNanoseconds := targetHashNanosecondsFunction(time.Since(startTime)) + hashElapsedDurationNanoseconds := time.Since(hashStartTime).Nanoseconds() + if hashElapsedDurationNanoseconds >= targetHashNanoseconds { + break + } + } +} + +func logMinedBlockStatsAndUpdateStatFields(t *testing.T, rpcClient *rpcclient.RPCClient, + averageMiningDuration *averageDuration, averageHashDurations *averageDuration, + startTime time.Time, miningDuration time.Duration, previousDifficulty *float64, blocksMined *int) { + + averageMiningDurationAsDuration := averageMiningDuration.toDuration() + averageHashNanoseconds := averageHashDurations.toDuration().Nanoseconds() + averageHashesPerSecond := hashNanosecondsToHashesPerSecond(averageHashNanoseconds) + blockDAGInfoResponse, err := rpcClient.GetBlockDAGInfo() + if err != nil { + t.Fatalf("GetBlockDAGInfo: %s", err) + } + difficultyDelta := blockDAGInfoResponse.Difficulty - *previousDifficulty + *previousDifficulty = blockDAGInfoResponse.Difficulty + *blocksMined++ + t.Logf("Mined block. Took: %s, average block mining duration: %s, "+ + "average hashes per second: %d, difficulty delta: %f, time elapsed: %s, blocks mined: %d", + miningDuration, averageMiningDurationAsDuration, averageHashesPerSecond, difficultyDelta, time.Since(startTime), *blocksMined) +} + +func submitMinedBlock(t *testing.T, rpcClient *rpcclient.RPCClient, block *externalapi.DomainBlock) { + _, err := rpcClient.SubmitBlock(block) + if err != nil { + t.Fatalf("SubmitBlock: %s", err) + } +} + +func hashNanosecondsToHashesPerSecond(hashNanoseconds int64) int64 { + return time.Second.Nanoseconds() / hashNanoseconds +} + +func loopForDuration(duration time.Duration, runFunction func(isFinished *bool)) { + isFinished := false + go func() { + for !isFinished { + runFunction(&isFinished) + } + }() + time.Sleep(duration) + isFinished = true +} diff --git a/stability-tests/daa/run/run.sh b/stability-tests/daa/run/run.sh new file mode 100755 index 000000000..54160240d --- /dev/null +++ b/stability-tests/daa/run/run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +RUN_STABILITY_TESTS=true go test ../ -v -timeout 86400s +TEST_EXIT_CODE=$? + +echo "Exit code: $TEST_EXIT_CODE" + +if [ $TEST_EXIT_CODE -eq 0 ]; then + echo "daa test: PASSED" + exit 0 +fi +echo "daa test: FAILED" +exit 1 diff --git a/stability-tests/run/run-slow.sh b/stability-tests/run/run-slow.sh index 94d4cc19d..b2f87c433 100755 --- a/stability-tests/run/run-slow.sh +++ b/stability-tests/run/run-slow.sh @@ -38,6 +38,10 @@ echo "Running many-tips" cd "${PROJECT_ROOT}/many-tips/run" && ./run.sh || failedTests+=("many-tips") echo "Done running many-tips" +echo "Running daa" +cd "${PROJECT_ROOT}/daa/run" && ./run.sh || failedTests+=("daa") +echo "Done running daa" + echo "Running reorg" cd "${PROJECT_ROOT}/reorg/run" && ./run-full-finality-window-reorg.sh || failedTests+=("reorg") echo "Done running reorg"