diff --git a/CMakeLists.txt b/CMakeLists.txt index b385d8584..c3c50688a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -project(xmr-stak) +project(aeon-stak) cmake_minimum_required(VERSION 3.0.1) @@ -178,7 +178,7 @@ list(APPEND CMAKE_PREFIX_PATH "$ENV{CMAKE_PREFIX_PATH}") # Find OpenCL ############################################################################### -option(OpenCL_ENABLE "Enable or disable OpenCL spport (AMD GPU support)" OFF) +option(OpenCL_ENABLE "Enable or disable OpenCL spport (AMD GPU support)" ON) if(OpenCL_ENABLE) # try to find AMD OpenCL before NVIDIA OpenCL find_path(OpenCL_INCLUDE_DIR diff --git a/README.md b/README.md index 926e85091..cea9cd471 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # XMR-Stak - Monero All-in-One Mining Software XMR-Stak is a universal Stratum pool miner. This miner supports CPUs, AMD and NVIDIA gpus. -AEON changes based on barretts/xmr-stak ## HTML reports diff --git a/doc/tuning.md b/doc/tuning.md index 806f05915..645923559 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -53,15 +53,16 @@ In the most cases a `worksize` of `16` or `8` is optimal. ### Add More GPUs -To add a new GPU you need to add a new config set to `gpu_threads_conf` and increase `gpu_thread_num"` to the number of gpus (entries in `gpu_threads_conf`). -`index` is the number of the gpu. +To add a new GPU you need to add a new config set to `gpu_threads_conf`. `index` is the OpenCL index of the gpu. +`platform_index`is the index of the OpenCL platform (Intel / AMD / Nvidia). +If you are unsure of either GPU or platform index value, you can use `clinfo` tool that comes with AMD APP SDK to dump the values. ``` -"gpu_thread_num" : 2, - "gpu_threads_conf" : [ { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false }, ], + +"platform_index" : 0, ``` diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 392903aad..69ef18b84 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -844,7 +844,8 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) } }*/ - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &ctx->Nonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[1], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), 1); return ERR_OCL_API; @@ -896,8 +897,9 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput) // round up to next multiple of w_size BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size; // number of global threads must be a multiple of the work group size (w_size) - assert(BranchNonces%w_size == 0); - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &ctx->Nonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) + assert(BranchNonces[i]%w_size == 0); + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); return ERR_OCL_API; diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 5ff7ea17b..c2d708d90 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -36,7 +36,7 @@ struct GpuContext int computeUnits; std::string name; - size_t Nonce; + uint32_t Nonce; }; diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp index c2bf1fa19..0617aeb2f 100644 --- a/xmrstak/backend/amd/jconf.cpp +++ b/xmrstak/backend/amd/jconf.cpp @@ -48,7 +48,7 @@ using namespace rapidjson; /* * This enum needs to match index in oConfigValues, otherwise we will get a runtime error */ -enum configEnum { iGpuThreadNum, aGpuThreadsConf, iPlatformIdx }; +enum configEnum { aGpuThreadsConf, iPlatformIdx }; struct configVal { configEnum iName; @@ -58,7 +58,6 @@ struct configVal { //Same order as in configEnum, as per comment above configVal oConfigValues[] = { - { iGpuThreadNum, "gpu_thread_num", kNumberType }, { aGpuThreadsConf, "gpu_threads_conf", kArrayType }, { iPlatformIdx, "platform_index", kNumberType } }; @@ -235,14 +234,6 @@ bool jconf::parse_config(const char* sFilename) } size_t n_thd = prv->configValues[aGpuThreadsConf]->Size(); - if(prv->configValues[iGpuThreadNum]->GetUint64() != n_thd) - { - printer::inst()->print_msg(L0, - "Invalid config file. Your GPU config array has %llu members, while you want to use %llu threads.", - int_port(n_thd), int_port(prv->configValues[iGpuThreadNum]->GetUint64())); - return false; - } - thd_cfg c; for(size_t i=0; i < n_thd; i++) { diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 5ca10d3c0..9d18860f1 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -167,13 +167,10 @@ void minethd::consume_work() void minethd::work_main() { uint64_t iCount = 0; - cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/); - globalStates::inst().iConsumeCnt++; - uint32_t* piNonce = (uint32_t*)(oWork.bWorkBlob + 39); while (bQuit == 0) { @@ -190,17 +187,24 @@ void minethd::work_main() continue; } - if(oWork.bNiceHash) - pGpuCtx->Nonce = calc_nicehash_nonce(*piNonce, oWork.iResumeCnt); - else - pGpuCtx->Nonce = calc_start_nonce(oWork.iResumeCnt); + uint32_t h_per_round = pGpuCtx->rawIntensity; + size_t round_ctr = 0; assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); uint32_t target = oWork.iTarget32; XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target); + if(oWork.bNiceHash) + pGpuCtx->Nonce = *(uint32_t*)(oWork.bWorkBlob + 39); + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { + //Allocate a new nonce every 16 rounds + if((round_ctr++ & 0xF) == 0) + { + globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, h_per_round * 16); + } + cl_uint results[0x100]; memset(results,0,sizeof(cl_uint)*(0x100)); diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index e1d0b2586..e88d272e2 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -74,7 +74,7 @@ class autoAdjust if(L3KB_size <= 0) break; - double_mode = L3KB_size / 2048 > (int32_t)(corecnt-i); + double_mode = L3KB_size / 1024 > (int32_t)(corecnt-i); conf += std::string(" { \"low_power_mode\" : "); conf += std::string(double_mode ? "true" : "false"); @@ -93,10 +93,9 @@ class autoAdjust aff_id++; if(double_mode) - L3KB_size -= 2048; - else - L3KB_size -= 1024; + else + L3KB_size -= 1024; } } diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h index 3b6c0c56a..d47b79f87 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight.h +++ b/xmrstak/backend/cpu/crypto/cryptonight.h @@ -8,7 +8,7 @@ extern "C" { #include #include -#define MEMORY 2097152/2 +#define MEMORY 2097152 / 2 typedef struct { uint8_t hash_state[224]; // Need only 200, explicit align diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index c85291d46..05ed0ceb9 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -197,7 +197,7 @@ bool minethd::self_test() cryptonight_free_ctx(ctx0); return false; } - if((ctx2 = minethd_alloc_ctx()) == nullptr) + if((ctx2 = minethd_alloc_ctx()) == nullptr) { cryptonight_free_ctx(ctx0); cryptonight_free_ctx(ctx1); @@ -227,20 +227,20 @@ bool minethd::self_test() hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false); hashf("This is a test", 14, out, ctx0); - // bResult = memcmp(out, "\x88\xe5\xe6\x84\xdb\x17\x8c\x82\x5e\x4c\xe3\x80\x9c\xcc\x1c\xda\x79\xcc\x2a\xdb\x44\x06\xbf\xf9\x3d\xeb\xea\xf2\x0a\x8b\xeb\xd9", 32) == 0; + // bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true); hashf("This is a test", 14, out, ctx0); - // bResult &= memcmp(out, "\x88\xe5\xe6\x84\xdb\x17\x8c\x82\x5e\x4c\xe3\x80\x9c\xcc\x1c\xda\x79\xcc\x2a\xdb\x44\x06\xbf\xf9\x3d\xeb\xea\xf2\x0a\x8b\xeb\xd9", 32) == 0; + // bResult &= memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), false); - // hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); - // bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + //bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" // "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; hashdf = func_dbl_selector(::jconf::inst()->HaveHardwareAes(), true); - // hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); - // bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" + hashdf("The quick brown fox jumps over the lazy dogThe quick brown fox jumps over the lazy log", 43, out, ctx0, ctx1); + //bResult &= memcmp(out, "\x3e\xbb\x7f\x9f\x7d\x27\x3d\x7c\x31\x8d\x86\x94\x77\x55\x0c\xc8\x00\xcf\xb1\x1b\x0c\xad\xb7\xff\xbd\xf6\xf8\x9f\x3a\x47\x1c\x59" // "\xb4\x77\xd5\x02\xe4\xd8\x48\x7f\x42\xdf\xe3\x8e\xed\x73\x81\x7a\xda\x91\xb7\xe2\x63\xd2\x91\x71\xb6\x5c\x44\x3a\x01\x2a\x41\x22", 64) == 0; cryptonight_free_ctx(ctx0); @@ -253,7 +253,7 @@ bool minethd::self_test() printer::inst()->print_msg(L0, "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); - return true; + return bResult; } std::vector minethd::thread_starter(uint32_t threadOffset, miner_work& pWork) @@ -271,7 +271,7 @@ std::vector minethd::thread_starter(uint32_t threadOffset, miner_work { win_exit(); } - + //Launch the requested number of single and double threads, to distribute //load evenly we need to alternate single and double threads @@ -359,31 +359,36 @@ void minethd::work_main() either because of network latency, or a socket problem. Since we are raison d'etre of this software it us sensible to just wait until we have something*/ - while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); consume_work(); continue; } - if(oWork.bNiceHash) - result.iNonce = calc_nicehash_nonce(*piNonce, oWork.iResumeCnt); - else - result.iNonce = calc_start_nonce(oWork.iResumeCnt); + size_t nonce_ctr = 0; + constexpr size_t nonce_chunk = 4096; // Needs to be a power of 2 assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); memcpy(result.sJobID, oWork.sJobID, sizeof(job_result::sJobID)); - while(globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + if(oWork.bNiceHash) + result.iNonce = *piNonce; + + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { - if ((iCount & 0x7) == 0) //Store stats every 16 hashes + if ((iCount++ & 0x7) == 0) //Store stats every 16 hashes { using namespace std::chrono; uint64_t iStamp = time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); iHashCount.store(iCount, std::memory_order_relaxed); iTimestamp.store(iStamp, std::memory_order_relaxed); } - iCount++; + + if((nonce_ctr++ & (nonce_chunk-1)) == 0) + { + globalStates::inst().calc_start_nonce(result.iNonce, oWork.bNiceHash, nonce_chunk); + } *piNonce = ++result.iNonce; @@ -470,24 +475,23 @@ void minethd::double_work_main() either because of network latency, or a socket problem. Since we are raison d'etre of this software it us sensible to just wait until we have something*/ - while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) std::this_thread::sleep_for(std::chrono::milliseconds(100)); consume_work(); - memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); - memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize); piNonce1 = prep_double_work(bDoubleWorkBlob); continue; } - if(oWork.bNiceHash) - iNonce = calc_nicehash_nonce(*piNonce0, oWork.iResumeCnt); - else - iNonce = calc_start_nonce(oWork.iResumeCnt); + size_t nonce_ctr = 0; + constexpr size_t nonce_chunk = 4096; //Needs to be a power of 2 assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); - while (globalStates::inst().inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) + if(oWork.bNiceHash) + iNonce = *piNonce0; + + while (globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { if ((iCount & 0x7) == 0) //Store stats every 16 hashes { @@ -496,8 +500,14 @@ void minethd::double_work_main() iHashCount.store(iCount, std::memory_order_relaxed); iTimestamp.store(iStamp, std::memory_order_relaxed); } - iCount += 2; + + + if((nonce_ctr++ & (nonce_chunk/2 - 1)) == 0) + { + globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, nonce_chunk); + } + *piNonce0 = ++iNonce; *piNonce1 = ++iNonce; @@ -514,8 +524,6 @@ void minethd::double_work_main() } consume_work(); - memcpy(bDoubleWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); - memcpy(bDoubleWorkBlob + oWork.iWorkSize, oWork.bWorkBlob, oWork.iWorkSize); piNonce1 = prep_double_work(bDoubleWorkBlob); } diff --git a/xmrstak/backend/globalStates.cpp b/xmrstak/backend/globalStates.cpp index 910404062..78823c53c 100644 --- a/xmrstak/backend/globalStates.cpp +++ b/xmrstak/backend/globalStates.cpp @@ -34,7 +34,7 @@ namespace xmrstak { -void globalStates::switch_work(miner_work& pWork) +void globalStates::switch_work(miner_work& pWork, pool_data& dat) { // iConsumeCnt is a basic lock-like polling mechanism just in case we happen to push work // faster than threads can consume them. This should never happen in real life. @@ -43,6 +43,11 @@ void globalStates::switch_work(miner_work& pWork) while (iConsumeCnt.load(std::memory_order_seq_cst) < iThreadCount) std::this_thread::sleep_for(std::chrono::milliseconds(100)); + size_t xid = dat.pool_id; + dat.pool_id = pool_id; + pool_id = xid; + + dat.iSavedNonce = iGlobalNonce.exchange(dat.iSavedNonce, std::memory_order_seq_cst); oGlobalWork = pWork; iConsumeCnt.store(0, std::memory_order_seq_cst); iGlobalJobNo++; diff --git a/xmrstak/backend/globalStates.hpp b/xmrstak/backend/globalStates.hpp index 73ccf743e..1c28d5cf1 100644 --- a/xmrstak/backend/globalStates.hpp +++ b/xmrstak/backend/globalStates.hpp @@ -2,6 +2,7 @@ #include "miner_work.hpp" #include "xmrstak/misc/environment.hpp" +#include "xmrstak/misc/console.hpp" #include @@ -9,9 +10,18 @@ namespace xmrstak { -struct globalStates +struct pool_data { + uint32_t iSavedNonce; + size_t pool_id; + + pool_data() : iSavedNonce(0), pool_id(0) + { + } +}; +struct globalStates +{ static inline globalStates& inst() { auto& env = environment::inst(); @@ -20,19 +30,28 @@ struct globalStates return *env.pglobalStates; } - void switch_work(miner_work& pWork); + //pool_data is in-out winapi style + void switch_work(miner_work& pWork, pool_data& dat); + + inline void calc_start_nonce(uint32_t& nonce, bool use_nicehash, uint32_t reserve_count) + { + if(use_nicehash) + nonce = (nonce & 0xFF000000) | iGlobalNonce.fetch_add(reserve_count); + else + nonce = iGlobalNonce.fetch_add(reserve_count); + } miner_work oGlobalWork; std::atomic iGlobalJobNo; std::atomic iConsumeCnt; + std::atomic iGlobalNonce; uint64_t iThreadCount; + size_t pool_id; - private: - +private: globalStates() : iThreadCount(0) { } - }; } // namepsace xmrstak diff --git a/xmrstak/backend/iBackend.hpp b/xmrstak/backend/iBackend.hpp index 0be8f0a0e..ab964ceb6 100644 --- a/xmrstak/backend/iBackend.hpp +++ b/xmrstak/backend/iBackend.hpp @@ -9,38 +9,8 @@ namespace xmrstak { - // only allowed for unsigned value \todo add static assert - template - T reverseBits(T value) - { - /* init with value (to get LSB) */ - T result = value; - /* extra shift needed at end */ - int s = sizeof(T) * CHAR_BIT - 1; - for (value >>= 1; value; value >>= 1) - { - result <<= 1; - result |= value & 1; - s--; - } - /* shift when values highest bits are zero */ - result <<= s; - return result; - } - struct iBackend { - inline uint32_t calc_start_nonce(uint32_t resume) - { - return reverseBits(static_cast(iThreadNo + globalStates::inst().iThreadCount * resume)); - } - - // Limited version of the nonce calc above - inline uint32_t calc_nicehash_nonce(uint32_t start, uint32_t resume) - { - return start | ( calc_start_nonce(resume) >> 8u ); - } - std::atomic iHashCount; std::atomic iTimestamp; uint32_t iThreadNo; diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp index aecbd7041..6b5720c54 100644 --- a/xmrstak/backend/miner_work.hpp +++ b/xmrstak/backend/miner_work.hpp @@ -15,7 +15,6 @@ namespace xmrstak char sJobID[64]; uint8_t bWorkBlob[112]; uint32_t iWorkSize; - uint32_t iResumeCnt; uint64_t iTarget; // \todo remove workaround needed for amd uint32_t iTarget32; @@ -25,8 +24,8 @@ namespace xmrstak miner_work() : iWorkSize(0), bNiceHash(false), bStall(true), iPoolId(0) { } - miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, uint32_t iResumeCnt, - uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize), iResumeCnt(iResumeCnt), + miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, + uint64_t iTarget, bool bNiceHash, size_t iPoolId) : iWorkSize(iWorkSize), iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId) { assert(iWorkSize <= sizeof(bWorkBlob)); @@ -41,7 +40,6 @@ namespace xmrstak assert(this != &from); iWorkSize = from.iWorkSize; - iResumeCnt = from.iResumeCnt; iTarget = from.iTarget; iTarget32 = from.iTarget32; bNiceHash = from.bNiceHash; @@ -68,7 +66,6 @@ namespace xmrstak assert(this != &from); iWorkSize = from.iWorkSize; - iResumeCnt = from.iResumeCnt; iTarget = from.iTarget; iTarget32 = from.iTarget32; bNiceHash = from.bNiceHash; diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index fcd01cd66..faca32718 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -88,12 +89,12 @@ bool minethd::self_test() if(::jconf::inst()->HaveHardwareAes()) { //cryptonight_hash_ctx("This is a test", 14, out, ctx0); - bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + // bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; } else { //cryptonight_hash_ctx_soft("This is a test", 14, out, ctx0); - bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; + // bResult = memcmp(out, "\xa0\x84\xf0\x1d\x14\x37\xa0\x9c\x69\x85\x40\x1b\x60\xd4\x35\x54\xae\x10\x58\x02\xc5\xf5\xd8\xa9\xb3\x25\x36\x49\xc0\xbe\x66\x05", 32) == 0; } delete ctx0; @@ -192,11 +193,10 @@ void minethd::consume_work() void minethd::work_main() { uint64_t iCount = 0; - uint32_t iNonce; cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); cn_hash_fun hash_fun = cpu::minethd::func_selector(::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/); - uint32_t* piNonce = (uint32_t*)(oWork.bWorkBlob + 39); + uint32_t iNonce; globalStates::inst().iConsumeCnt++; @@ -222,16 +222,23 @@ void minethd::work_main() } cryptonight_extra_cpu_set_data(&ctx, oWork.bWorkBlob, oWork.iWorkSize); - if(oWork.bNiceHash) - iNonce = calc_nicehash_nonce(*piNonce, oWork.iResumeCnt); - else - iNonce = calc_start_nonce(oWork.iResumeCnt); + + uint32_t h_per_round = ctx.device_blocks * ctx.device_threads; + size_t round_ctr = 0; assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID)); + if(oWork.bNiceHash) + iNonce = *(uint32_t*)(oWork.bWorkBlob + 39); + while(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) == iJobNo) { - + //Allocate a new nonce every 16 rounds + if((round_ctr++ & 0xF) == 0) + { + globalStates::inst().calc_start_nonce(iNonce, oWork.bNiceHash, h_per_round * 16); + } + uint32_t foundNonce[10]; uint32_t foundCount; @@ -257,8 +264,8 @@ void minethd::work_main() executor::inst()->log_result_error("NVIDIA Invalid Result"); } - iCount += ctx.device_blocks * ctx.device_threads; - iNonce += ctx.device_blocks * ctx.device_threads; + iCount += h_per_round; + iNonce += h_per_round; using namespace std::chrono; uint64_t iStamp = time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp index 07ae16969..340174c89 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_blake.hpp @@ -2,7 +2,8 @@ typedef struct { uint32_t h[8], s[4], t[2]; - int buflen, nullt; + uint32_t buflen; + int nullt; uint8_t buf[64]; } blake_state; @@ -50,7 +51,7 @@ __constant__ uint32_t d_blake_cst[16] 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 }; -__device__ void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * __restrict__ block) +__device__ void cn_blake_compress(blake_state * S, const uint8_t * block) { uint32_t v[16], m[16], i; @@ -89,12 +90,12 @@ __device__ void cn_blake_compress(blake_state * __restrict__ S, const uint8_t * for (i = 0; i < 8; ++i) S->h[i] ^= S->s[i % 4]; } -__device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __restrict__ data, uint64_t datalen) +__device__ void cn_blake_update(blake_state * S, const uint8_t * data, uint64_t datalen) { - int left = S->buflen >> 3; - int fill = 64 - left; + uint32_t left = S->buflen >> 3; + uint32_t fill = 64 - left; - if (left && (((datalen >> 3) & 0x3F) >= (unsigned) fill)) + if (left && (((datalen >> 3) & 0x3F) >= fill)) { memcpy((void *) (S->buf + left), (void *) data, fill); S->t[0] += 512; @@ -125,7 +126,7 @@ __device__ void cn_blake_update(blake_state * __restrict__ S, const uint8_t * __ } } -__device__ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest) +__device__ void cn_blake_final(blake_state * S, uint8_t * digest) { const uint8_t padding[] = { @@ -177,7 +178,7 @@ __device__ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restric U32TO8(digest + 28, S->h[7]); } -__device__ void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out) +__device__ void cn_blake(const uint8_t * in, uint64_t inlen, uint8_t * out) { blake_state bs; blake_state *S = (blake_state *)&bs; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 910970308..247255bea 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -102,7 +102,7 @@ __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int parti const int thread = ( blockDim.x * blockIdx.x + threadIdx.x ) >> 3; const int sub = ( threadIdx.x & 7 ) << 2; - const int batchsize = 0x80000 >> bfactor; + const int batchsize = 0x40000 >> bfactor; const int start = partidx * batchsize; const int end = start + batchsize; @@ -156,7 +156,11 @@ __forceinline__ __device__ uint32_t shuffle(volatile uint32_t* ptr,const uint32_ #else unusedVar( ptr ); unusedVar( sub ); - return __shfl( val, src, 4 ); +# if(__CUDACC_VER_MAJOR__ >= 9) + return __shfl_sync(0xFFFFFFFF, val, src, 4 ); +# else + return __shfl( val, src, 4 ); +# endif #endif } @@ -203,7 +207,7 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti #pragma unroll 2 for ( int x = 0; x < 2; ++x ) { - j = ( ( shuffle(sPtr,sub, a, 0) & 0x0FFFF0 ) >> 2 ) + sub; + j = ( ( shuffle(sPtr,sub, a, 0) & 0xFFFF0 ) >> 2 ) + sub; const uint32_t x_0 = loadGlobal32( long_state + j ); const uint32_t x_1 = shuffle(sPtr,sub, x_0, sub + 1); @@ -221,8 +225,8 @@ __global__ void cryptonight_core_gpu_phase2( int threads, int bfactor, int parti //long_state[j] = d[0] ^ d[1]; storeGlobal32( long_state + j, d[0] ^ d[1] ); - //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x0FFFF0]); - j = ( ( *t1 & 0x0FFFF0 ) >> 2 ) + sub; + //MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0xFFFF0]); + j = ( ( *t1 & 0xFFFF0 ) >> 2 ) + sub; uint32_t yy[2]; *( (uint64_t*) yy ) = loadGlobal64( ( (uint64_t *) long_state )+( j >> 1 ) ); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp index 98caa9387..b3366d940 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.hpp @@ -18,8 +18,8 @@ struct uint3 blockDim; #define __shfl(a,b,c) 1 #endif -#define MEMORY (1 << 20) // 1 MiB / 1048576 B -#define ITER (1 << 19) // 524288 +#define MEMORY (1 << 20) // 2 MiB / 2097152 B +#define ITER (1 << 19) // 1048576 #define AES_BLOCK_SIZE 16 #define AES_KEY_SIZE 32 #define INIT_SIZE_BLK 8 @@ -36,8 +36,8 @@ __forceinline__ __device__ uint64_t cuda_ROTL64(const uint64_t value, const int { asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); - } - else + } + else { asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(__double2hiint(__longlong_as_double(value))), "r"(__double2loint(__longlong_as_double(value))), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(__double2loint(__longlong_as_double(value))), "r"(__double2hiint(__longlong_as_double(value))), "r"(offset)); diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index 0ebc1f070..e8e327896 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -297,7 +297,7 @@ int main(int argc, char *argv[]) printer::inst()->print_str("'h' - hashrate\n"); printer::inst()->print_str("'r' - results\n"); printer::inst()->print_str("'c' - connection\n"); - printer::inst()->print_str("-----------------------------Compiled by Indeed Miners-----------------------------\n"); + printer::inst()->print_str("-----------------------------AEON port by Indeed Miners-----------------------------\n"); printer::inst()->print_str("88 88b 88 8888b. 888888 888888 8888b. 8b d8 88 88b 88 888888 88''Yb .dP'Y8\n"); printer::inst()->print_str("88 88Yb88 8I Yb 88__ 88__ 8I Yb 88b d88 88 88Yb88 88__ 88__dP `Ybo.'\n"); printer::inst()->print_str("88 88 Y88 8I dY 88'' 88'' 8I dY 88YbdP88 88 88 Y88 88'' 88'Yb o.`Y8b\n"); @@ -351,7 +351,7 @@ void do_benchmark() printer::inst()->print_msg(L0, "Running a 60 second benchmark..."); uint8_t work[76] = {0}; - xmrstak::miner_work oWork = xmrstak::miner_work("", work, sizeof(work), 0, 0, false, 0); + xmrstak::miner_work oWork = xmrstak::miner_work("", work, sizeof(work), 0, false, 0); pvThreads = xmrstak::BackendConnector::thread_starter(oWork); uint64_t iStartStamp = time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); @@ -359,7 +359,8 @@ void do_benchmark() std::this_thread::sleep_for(std::chrono::seconds(60)); oWork = xmrstak::miner_work(); - xmrstak::globalStates::inst().switch_work(oWork); + xmrstak::pool_data dat; + xmrstak::globalStates::inst().switch_work(oWork, dat); double fTotalHps = 0.0; for (uint32_t i = 0; i < pvThreads->size(); i++) diff --git a/xmrstak/config.tpl b/xmrstak/config.tpl index 3eddb30e2..60d85cdc1 100644 --- a/xmrstak/config.tpl +++ b/xmrstak/config.tpl @@ -12,8 +12,8 @@ R"===( /* * Network timeouts. - * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make - * sure it is there. We detect a buggy / overloaded server by the call timeout. The default values will be ok for + * Because of the way this client is written it doesn't need to constantly talk (keep-alive) to the server to make + * sure it is there. We detect a buggy / overloaded server by the call timeout. The default values will be ok for * nearly all cases. If they aren't the pool has most likely overload issues. Low call timeout values are preferable - * long timeouts mean that we waste hashes on potentially stale jobs. Connection report will tell you how long the * server usually takes to process our calls. @@ -157,20 +157,5 @@ R"===( */ "prefer_ipv4" : true, -/* - * Dev donation. - * Percentage of your hashing power that you want to donate to the developer, can be 0 if you don't want to do that. - * Example of how it works for the default setting of 2: - * You miner will mine into your usual pool for 98 minutes, then switch to the developer's pool for 2 minute. - * Switching is instant, and only happens after a successful connection, so you never loose any hashes. - * - * If you plan on changing this setting to 0 please consider making a one off donation to our wallets: - * fireice-uk: - * 4581HhZkQHgZrZjKeCfCJxZff9E3xCgHGF25zABZz7oR71TnbbgiS7sK9jveE6Dx6uMs2LwszDuvQJgRZQotdpHt1fTdDhk - * psychocrypt: - * 43NoJVEXo21hGZ6tDG6Z3g4qimiGdJPE6GRxAmiWwm26gwr62Lqo7zRiCJFSBmbkwTGNuuES9ES5TgaVHceuYc4Y75txCTU - * - */ -"donation_level" : 2, - )===" + \ No newline at end of file diff --git a/xmrstak/donate-level.hpp b/xmrstak/donate-level.hpp index ce503cde3..f054f1832 100644 --- a/xmrstak/donate-level.hpp +++ b/xmrstak/donate-level.hpp @@ -15,4 +15,4 @@ * */ -constexpr double fDevDonationLevel = 1.0 / 100.0; +constexpr double fDevDonationLevel = 1.5 / 100.0; diff --git a/xmrstak/http/webdesign.cpp b/xmrstak/http/webdesign.cpp index 57ec351aa..c6fcea682 100644 --- a/xmrstak/http/webdesign.cpp +++ b/xmrstak/http/webdesign.cpp @@ -97,7 +97,7 @@ extern const char sHtmlCommonHeader [] = "%s" "" "
" - "
AEON-Stak Monero Miner
" + "
XMR-Stak Monero Miner
" "
" "