diff --git a/.gitignore b/.gitignore index 29b52cbf6..178859304 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ out.txt # For clion IDE .idea +# For vscode +.vscode + # For cmake CMakeCache.txt CMakeFiles/ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 375dc3808..eb458c260 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -23,25 +23,9 @@ if(NOT WIN32) set(ColorBoldRed "${ColorRed}${ColorBold}") endif() -#--------------------------- CMAKE VARIABLES (partly for Cmake GUI) ---------------------------------------------------- - -set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training") -set(USE_BACKEND CACHE STRING "Neural net backend") -string(TOUPPER "${USE_BACKEND}" USE_BACKEND) -set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA OPENCL EIGEN) - -set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc") -set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe") -set(USE_AVX2 0 CACHE BOOL "Compile with AVX2") -set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 29. Compiling with this Will use more memory and slow down KataGo, even when playing on boards of size 19.") - -#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------ - -message(STATUS "Building 'katago' executable for GTP engine and other tools.") -if(USE_BACKEND STREQUAL "CUDA") - message(STATUS "-DUSE_BACKEND=CUDA, using CUDA backend.") - +#--------------------------- CUDA MACRO ------------------------------------------------------------------------------- +macro(CONFIGURE_CUDA) # Ensure dynamic cuda linking (Versions prior to 3.17) if (${CMAKE_VERSION} VERSION_LESS "3.17") set(CMAKE_CUDA_FLAGS "" CACHE STRING "") @@ -146,6 +130,26 @@ if(USE_BACKEND STREQUAL "CUDA") " ) endif() +endmacro() + +#--------------------------- CMAKE VARIABLES (partly for Cmake GUI) ---------------------------------------------------- + +set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training") +set(USE_BACKEND CACHE STRING "Neural net backend") +string(TOUPPER "${USE_BACKEND}" USE_BACKEND) +set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA OPENCL EIGEN ONNXRUNTIME) + +set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc") +set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe") +set(USE_AVX2 0 CACHE BOOL "Compile with AVX2") +set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 29. 
Compiling with this Will use more memory and slow down KataGo, even when playing on boards of size 19.") + +#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------ + +message(STATUS "Building 'katago' executable for GTP engine and other tools.") +if(USE_BACKEND STREQUAL "CUDA") + message(STATUS "-DUSE_BACKEND=CUDA, using CUDA backend.") + configure_cuda() elseif(USE_BACKEND STREQUAL "OPENCL") message(STATUS "-DUSE_BACKEND=OPENCL, using OpenCL backend.") set(NEURALNET_BACKEND_SOURCES @@ -162,8 +166,28 @@ elseif(USE_BACKEND STREQUAL "EIGEN") set(NEURALNET_BACKEND_SOURCES neuralnet/eigenbackend.cpp ) +elseif(USE_BACKEND STREQUAL "ONNXRUNTIME") + message(STATUS "-DUSE_BACKEND=ONNXRUNTIME, using ONNXRuntime backend.") + set(ORT_CUDA 0 CACHE BOOL "Use CUDA execution provider for ONNXRuntime.") + set(ORT_TENSORRT 0 CACHE BOOL "Use TensorRT execution provider for ONNXRuntime.") + set(ORT_DIRECTML 0 CACHE BOOL "Use DirectML execution provider for ONNXRuntime.") + set(ORT_MIGRAPHX 0 CACHE BOOL "Use MIGraphX execution provider for ONNXRuntime.") + if(ORT_CUDA OR ORT_TENSORRT) + configure_cuda() + endif() + if(ORT_MIGRAPHX) + set(NEURALNET_BACKEND_SOURCES + neuralnet/ortbackend.cpp + neuralnet/openclhelpers.cpp + ) + else() + set(NEURALNET_BACKEND_SOURCES + neuralnet/ortbackend.cpp + ) + endif() + elseif(USE_BACKEND STREQUAL "") - message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}") + message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. 
To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=ONNXRUNTIME or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}") set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp) else() message(FATAL_ERROR "Unrecognized backend: " ${USE_BACKEND}) @@ -331,6 +355,66 @@ elseif(USE_BACKEND STREQUAL "EIGEN") endif() endif() endif() +elseif(USE_BACKEND STREQUAL "ONNXRUNTIME") + target_compile_definitions(katago PRIVATE USE_ONNXRUNTIME_BACKEND) + set(ORT_LIB_DIR CACHE STRING "ONNXRuntime library location") + set(ORT_INCLUDE_DIR CACHE STRING "ONNXRuntime header files location") + message(STATUS "ORT_LIB_DIR: " ${ORT_LIB_DIR}) + message(STATUS "ORT_INCLUDE_DIR: " ${ORT_INCLUDE_DIR}) + include_directories(${ORT_INCLUDE_DIR}) + if(EXISTS ${ORT_INCLUDE_DIR}/core/session) + include_directories(${ORT_INCLUDE_DIR}/core/session) + endif() + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/cpu) + include_directories(${ORT_INCLUDE_DIR}/core/providers/cpu) + endif() + find_library(ORT_LIBRARY NAMES onnxruntime PATHS ${ORT_LIB_DIR}) + if(NOT ORT_LIBRARY) + message(FATAL_ERROR "Could not find onnxruntime") + endif() + target_link_libraries(katago ${ORT_LIBRARY}) + if(ORT_CUDA) + target_compile_definitions(katago PRIVATE USE_ORT_CUDA) + endif() + if(ORT_TENSORRT) + target_compile_definitions(katago PRIVATE USE_ORT_TENSORRT) + set(TENSORRT_LIB_DIR CACHE STRING "TensorRT library location") + set(TENSORRT_INCLUDE_DIR CACHE STRING "TensorRT header file location") + include_directories(${TENSORRT_INCLUDE_DIR}) + find_library(TENSORRT_LIBRARY NAMES nvinfer PATHS ${TENSORRT_LIB_DIR}) + if(NOT TENSORRT_LIBRARY) + message(FATAL_ERROR "Could not find nvinfer") + endif() + target_link_libraries(katago ${TENSORRT_LIBRARY}) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/tensorrt) + include_directories(${ORT_INCLUDE_DIR}/core/providers/tensorrt) + endif() + endif() + if(ORT_CUDA OR ORT_TENSORRT) + find_package(CUDA REQUIRED) + find_path(CUDNN_INCLUDE_DIR cudnn.h HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES cuda/include include) + if((NOT CUDNN_INCLUDE_DIR)) + message(ERROR "${ColorBoldRed} cudnn.h was NOT found, specify CUDNN_INCLUDE_DIR to indicate where it is. ${ColorReset}") + endif() + find_library(CUDNN_LIBRARY libcudnn.so PATHS /usr/local/cuda/lib64 /opt/cuda/lib64) + include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR}) #SYSTEM is for suppressing some compiler warnings in thrust libraries + target_link_libraries(katago ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_LIBRARIES}) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/cuda) + include_directories(${ORT_INCLUDE_DIR}/core/providers/cuda) + endif() + endif() + if(ORT_DIRECTML) + target_compile_definitions(katago PRIVATE USE_ORT_DIRECTML) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/directml) + include_directories(${ORT_INCLUDE_DIR}/core/providers/directml) + endif() + endif() + if(ORT_MIGRAPHX) + target_compile_definitions(katago PRIVATE USE_ORT_MIGRAPHX) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/migraphx) + include_directories(${ORT_INCLUDE_DIR}/core/providers/migraphx) + endif() + endif() endif() if(USE_BIGGER_BOARDS_EXPENSIVE) diff --git a/cpp/command/benchmark.cpp b/cpp/command/benchmark.cpp index 65504d6a9..f7dee0b6a 100644 --- a/cpp/command/benchmark.cpp +++ b/cpp/command/benchmark.cpp @@ -209,6 +209,10 @@ int MainCmds::benchmark(int argc, const char* const* argv) { cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. 
RTX2080), " << "using the Cuda version of KataGo instead may give a mild performance boost." << endl; #endif +#ifdef USE_ONNXRUNTIME_BACKEND + cout << "You are currently using the ONNXRuntime version of KataGo with " + << nnEval->getOnnxRuntimeExecutionProvider() << " execution provider." << endl; +#endif #ifdef USE_EIGEN_BACKEND cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl; #endif @@ -564,6 +568,7 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm int64_t configMaxPlayouts = ((int64_t)1) << 50; double configMaxTime = 1e20; double configMaxPonderTime = -1.0; + string configOnnxRuntimeExecutionProvider; vector configDeviceIdxs; int configNNCacheSizePowerOfTwo = 20; int configNNMutexPoolSizePowerOfTwo = 16; @@ -693,6 +698,41 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm }); } +#ifdef USE_ONNXRUNTIME_BACKEND + cout << endl; + cout << "=========================================================================" << endl; + cout << "ONNXRUNTIME EXECUTION PROVIDER" << endl; + + { + vector executionProviders; +#ifdef USE_ORT_CUDA + executionProviders.push_back("CUDA"); +#endif +#ifdef USE_ORT_TENSORRT + executionProviders.push_back("TensorRT"); +#endif +#ifdef USE_ORT_DIRECTML + executionProviders.push_back("DirectML"); +#endif +#ifdef USE_ORT_MIGRAPHX + executionProviders.push_back("MIGraphX"); +#endif + + cout << endl; + cout << "Available ONNXRuntime execution providers:" << endl; + for(const auto provider: executionProviders) { + cout << provider << " "; + } + cout << endl << endl; + + string prompt = "Specify an execution provider for ONNXRuntime. Leave blank to use the first available provider.\n"; + promptAndParseInput(prompt, [&](const string& line) { + if(line == "") configOnnxRuntimeExecutionProvider = executionProviders[0]; + else configOnnxRuntimeExecutionProvider = line; + }); + } +#endif + cout << endl; cout << "=========================================================================" << endl; cout << "GPUS AND RAM" << endl; @@ -701,7 +741,11 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm { cout << endl; cout << "Finding available GPU-like devices..." 
<< endl; + #ifndef USE_ONNXRUNTIME_BACKEND NeuralNet::printDevices(); + #else + NeuralNet::printDevices(configOnnxRuntimeExecutionProvider); + #endif cout << endl; string prompt = @@ -789,6 +833,7 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm configMaxPlayouts, configMaxTime, configMaxPonderTime, + configOnnxRuntimeExecutionProvider, configDeviceIdxs, configNNCacheSizePowerOfTwo, configNNMutexPoolSizePowerOfTwo, diff --git a/cpp/neuralnet/cudabackend.cpp b/cpp/neuralnet/cudabackend.cpp index ce01d610d..d65fb4d90 100644 --- a/cpp/neuralnet/cudabackend.cpp +++ b/cpp/neuralnet/cudabackend.cpp @@ -2584,6 +2584,8 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -2593,6 +2595,8 @@ ComputeContext* NeuralNet::createComputeContext( (void)gpuIdxs; (void)logger; (void)openCLTunerFile; + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; (void)homeDataDirOverride; (void)openCLReTunePerBoardSize; (void)loadedModel; diff --git a/cpp/neuralnet/dummybackend.cpp b/cpp/neuralnet/dummybackend.cpp index 72817fdf1..fc3fd1ec4 100644 --- a/cpp/neuralnet/dummybackend.cpp +++ b/cpp/neuralnet/dummybackend.cpp @@ -19,6 +19,8 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -30,6 +32,8 @@ ComputeContext* NeuralNet::createComputeContext( (void)nnXLen; (void)nnYLen; (void)openCLTunerFile; + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; (void)homeDataDirOverride; (void)openCLReTunePerBoardSize; (void)useFP16Mode; diff --git a/cpp/neuralnet/eigenbackend.cpp b/cpp/neuralnet/eigenbackend.cpp index bf1b9c0ff..97292e7e1 100644 --- a/cpp/neuralnet/eigenbackend.cpp +++ b/cpp/neuralnet/eigenbackend.cpp @@ -1429,6 +1429,8 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -1438,6 +1440,8 @@ ComputeContext* NeuralNet::createComputeContext( (void)gpuIdxs; (void)logger; (void)openCLTunerFile; + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; (void)homeDataDirOverride; (void)openCLReTunePerBoardSize; diff --git a/cpp/neuralnet/nneval.cpp b/cpp/neuralnet/nneval.cpp index 0f957073e..0df52a318 100644 --- a/cpp/neuralnet/nneval.cpp +++ b/cpp/neuralnet/nneval.cpp @@ -66,6 +66,8 @@ NNEvaluator::NNEvaluator( int nnMutexPoolSizePowerofTwo, bool skipNeuralNet, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -83,6 +85,7 @@ NNEvaluator::NNEvaluator( requireExactNNLen(rExactNNLen), policySize(NNPos::getPolicySize(xLen,yLen)), inputsUseNHWC(iUseNHWC), + ortExecutionProvider(onnxRuntimeExecutionProvider), usingFP16Mode(useFP16Mode), usingNHWCMode(useNHWCMode), numThreads(numThr), @@ -145,8 +148,8 @@ NNEvaluator::NNEvaluator( inputsVersion = NNModelVersion::getInputsVersion(modelVersion); computeContext = NeuralNet::createComputeContext( 
gpuIdxs,logger,nnXLen,nnYLen, - openCLTunerFile,homeDataDirOverride,openCLReTunePerBoardSize, - usingFP16Mode,usingNHWCMode,loadedModel + openCLTunerFile,onnxOptModelFile,onnxRuntimeExecutionProvider, + homeDataDirOverride,openCLReTunePerBoardSize,usingFP16Mode,usingNHWCMode,loadedModel ); } else { @@ -224,6 +227,9 @@ int NNEvaluator::getNNXLen() const { int NNEvaluator::getNNYLen() const { return nnYLen; } +string NNEvaluator::getOnnxRuntimeExecutionProvider() const{ + return ortExecutionProvider; +} enabled_t NNEvaluator::getUsingFP16Mode() const { return usingFP16Mode; } diff --git a/cpp/neuralnet/nneval.h b/cpp/neuralnet/nneval.h index 35a6b4d31..b5b60c027 100644 --- a/cpp/neuralnet/nneval.h +++ b/cpp/neuralnet/nneval.h @@ -89,6 +89,8 @@ class NNEvaluator { int nnMutexPoolSizePowerofTwo, bool debugSkipNeuralNet, const std::string& openCLTunerFile, + const std::string& onnxOptModelFile, + const std::string& onnxRuntimeExecutionProvider, const std::string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -113,6 +115,7 @@ class NNEvaluator { int getNumServerThreads() const; int getNNXLen() const; int getNNYLen() const; + std::string getOnnxRuntimeExecutionProvider() const; enabled_t getUsingFP16Mode() const; enabled_t getUsingNHWCMode() const; @@ -172,6 +175,7 @@ class NNEvaluator { const bool requireExactNNLen; const int policySize; const bool inputsUseNHWC; + const std::string ortExecutionProvider; const enabled_t usingFP16Mode; const enabled_t usingNHWCMode; int numThreads; diff --git a/cpp/neuralnet/nninterface.h b/cpp/neuralnet/nninterface.h index a9c53e509..249230abe 100644 --- a/cpp/neuralnet/nninterface.h +++ b/cpp/neuralnet/nninterface.h @@ -36,6 +36,16 @@ namespace NeuralNet { //Print available backend devices void printDevices(); + void printDevices(const std::string& ortExecutionProvider); + #if defined(USE_ORT_CUDA) || defined(USE_ORT_TENSORRT) + void printCUDADevices(); + #endif + #ifdef USE_ORT_DIRECTML + void printDirectMLDevices(); + #endif + #ifdef USE_ORT_MIGRAPHX + void printOpenCLDevices(); + #endif // Model I/O ----------------------------------------------------------------- @@ -59,6 +69,8 @@ namespace NeuralNet { int nnXLen, int nnYLen, const std::string& openCLTunerFile, + const std::string& onnxOptModelFile, + const std::string& onnxRuntimeExecutionProvider, const std::string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, diff --git a/cpp/neuralnet/openclbackend.cpp b/cpp/neuralnet/openclbackend.cpp index 8372142c5..9d60857e3 100644 --- a/cpp/neuralnet/openclbackend.cpp +++ b/cpp/neuralnet/openclbackend.cpp @@ -394,12 +394,17 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, enabled_t useNHWCMode, const LoadedModel* loadedModel ) { + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; + if(gpuIdxs.size() <= 0) throw StringError("NeuralNet::createComputeContext - specified no gpus to use"); diff --git a/cpp/neuralnet/ortbackend.cpp b/cpp/neuralnet/ortbackend.cpp new file mode 100644 index 000000000..7d656db7e --- /dev/null +++ b/cpp/neuralnet/ortbackend.cpp @@ -0,0 +1,851 @@ +#ifdef USE_ONNXRUNTIME_BACKEND + +#include "../neuralnet/nninterface.h" +#include "../neuralnet/nninputs.h" +#include "../neuralnet/nneval.h" +#include "../neuralnet/modelversion.h" +#include 
"../core/makedir.h" +#include "../dataio/homedata.h" + +#include "../external/half-2.1.0/include/half.hpp" +#include +#include + +#if defined(USE_ORT_CUDA) || defined(USE_ORT_TENSORRT) + #include + #include "../neuralnet/cudaincludes.h" +#endif +#ifdef USE_ORT_TENSORRT + #include +#endif +#ifdef USE_ORT_DIRECTML + #include + #include +#endif +#ifdef USE_ORT_MIGRAPHX + #include + #include "../neuralnet/openclincludes.h" + #include "../neuralnet/openclhelpers.h" +#endif + +using namespace std; + +//------------------------------------------------------------------------------ + +void NeuralNet::globalInitialize() { +} + +void NeuralNet::globalCleanup() { +} + +//------------------------------------------------------------------------------ + +// Model itself is loaded in ComputeHandle instead +struct LoadedModel { + ModelDesc modelDesc; + + // This is not optimal of course, we can probably tar .json and .onnx together? + // Or the ONNX file itself can be parsed. + LoadedModel(const string& fileName) { + modelDesc.name = fileName; + modelDesc.version = 8; + modelDesc.numInputChannels = 22; + modelDesc.numInputGlobalChannels = 19; + modelDesc.numValueChannels = 3; + modelDesc.numOwnershipChannels = 1; + modelDesc.numScoreValueChannels = 4; + } + + LoadedModel() = delete; + LoadedModel(const LoadedModel&) = delete; + LoadedModel& operator=(const LoadedModel&) = delete; +}; + +LoadedModel* NeuralNet::loadModelFile(const string& file) { + LoadedModel* loadedModel = new LoadedModel(file); + return loadedModel; +} + +void NeuralNet::freeLoadedModel(LoadedModel* loadedModel) { + delete loadedModel; +} + +string NeuralNet::getModelName(const LoadedModel* loadedModel) { + return loadedModel->modelDesc.name; +} + +int NeuralNet::getModelVersion(const LoadedModel* loadedModel) { + return loadedModel->modelDesc.version; +} + +Rules NeuralNet::getSupportedRules(const LoadedModel* loadedModel, const Rules& desiredRules, bool& supported) { + return loadedModel->modelDesc.getSupportedRules(desiredRules, supported); +} + +//------------------------------------------------------------------------------ + +std::unique_ptr < Ort::Env> env = nullptr; + +struct Model { + string name; + int version; + int numInputChannels; + int numInputGlobalChannels; + int numValueChannels; + int numScoreValueChannels; + int numOwnershipChannels; + + Ort::Session* session; + + Model( + const ModelDesc* desc, + int gpuIdx, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, + const string& homeDataDirOverride + ) { + name = desc->name; + version = desc->version; + numInputChannels = desc->numInputChannels; + numInputGlobalChannels = desc->numInputGlobalChannels; + numValueChannels = desc->numValueChannels; + numScoreValueChannels = desc->numScoreValueChannels; + numOwnershipChannels = desc->numOwnershipChannels; + + auto envLocal = std::make_unique(ORT_LOGGING_LEVEL_ERROR, "Default"); + env = std::move(envLocal); + Ort::SessionOptions sf; + sf.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + sf.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + string dir = HomeData::getHomeDataDir(true, homeDataDirOverride); + MakeDir::make(dir); + string optModelPath = dir + "/" + onnxOptModelFile; +#ifdef _WIN32 + std::wstring optModelFile = std::wstring(optModelPath.begin(), optModelPath.end()); + sf.SetOptimizedModelFilePath(optModelFile.data()); +#else + sf.SetOptimizedModelFilePath(optModelPath.data()); +#endif + + if(onnxRuntimeExecutionProvider == "CUDA") { + #ifdef USE_ORT_CUDA + 
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with CUDA support."); + #endif + } + else if(onnxRuntimeExecutionProvider == "TensorRT") { + #ifdef USE_ORT_TENSORRT + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, gpuIdx)); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with TensorRT support."); + #endif + } + else if(onnxRuntimeExecutionProvider == "DirectML") { + #ifdef USE_ORT_DIRECTML + sf.DisableMemPattern(); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with DirectML support."); + #endif + } + else if(onnxRuntimeExecutionProvider == "MIGraphX") { + #ifdef USE_ORT_MIGRAPHX + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with MIGraphX support."); + #endif + } + else { + throw StringError("Invalid ONNXRuntime backend"); + } + +#ifdef _WIN32 + std::wstring modelName = std::wstring(name.begin(), name.end()); + session = new Ort::Session(*env, modelName.data(), sf); +#else + session = new Ort::Session(*env, name.data(), sf); +#endif + + Ort::AllocatorWithDefaultOptions allocator; + + // input nodes + numInputNodes = session->GetInputCount(); + assert(numInputNodes == 2); + + for(int inputIdx = 0; inputIdx < numInputNodes; inputIdx++) { + inputNodeNames.emplace_back(session->GetInputName(inputIdx, allocator)); + } + + // output nodes + numOutputNodes = session->GetOutputCount(); + + for(int outputIdx = 0; outputIdx < numOutputNodes; outputIdx++) { + outputNodeNames.emplace_back(session->GetOutputName(outputIdx, allocator)); + } + } + + bool getUsingFP16() { + Ort::TypeInfo typeInfo = session->GetInputTypeInfo(0); + auto tensorInfo = typeInfo.GetTensorTypeAndShapeInfo(); + auto type = tensorInfo.GetElementType(); + return type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; + } + + vector evaluate(vector& inputTensors) { + auto outputTensors = session->Run( + Ort::RunOptions{nullptr}, + inputNodeNames.data(), + inputTensors.data(), + inputTensors.size(), + outputNodeNames.data(), + outputNodeNames.size() + ); + + return outputTensors; + } + + Model() = delete; + Model(const Model&) = delete; + Model& operator=(const Model&) = delete; + + ~Model() { + session->release(); + } + +private: + size_t numInputNodes; + size_t numOutputNodes; + vector inputNodeNames; + vector outputNodeNames; +}; + +//------------------------------------------------------------------------------ + +struct ComputeContext { + int nnXLen; + int nnYLen; + enabled_t usingFP16; + string onnxOptModelFile; + string onnxRuntimeExecutionProvider; + string homeDataDirOverride; + + ComputeContext( + int nnX, + int nnY, + const string& optModelFile, + const string& runtimeExecutionProvider, + const string& homeDataDir, + enabled_t useFP16 + ) { + nnXLen = nnX; + nnYLen = nnY; + onnxOptModelFile = optModelFile; + onnxRuntimeExecutionProvider = runtimeExecutionProvider; + homeDataDirOverride = homeDataDir; + usingFP16 = useFP16; + } +}; + +ComputeContext* NeuralNet::createComputeContext( + const std::vector& gpuIdxs, + Logger* logger, + int nnXLen, + int nnYLen, + const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, + const string& homeDataDirOverride, + bool openCLReTunePerBoardSize, + enabled_t useFP16Mode, + 
enabled_t useNHWCMode, + const LoadedModel* loadedModel +) { + (void)gpuIdxs; + (void)logger; + (void)openCLTunerFile; + (void)openCLReTunePerBoardSize; + (void)useNHWCMode; + (void)loadedModel; + + return new ComputeContext( + nnXLen, nnYLen, onnxOptModelFile, onnxRuntimeExecutionProvider, homeDataDirOverride, useFP16Mode + ); +} + +void NeuralNet::freeComputeContext(ComputeContext* computeContext) { + delete computeContext; +} + +//------------------------------------------------------------------------------ + +struct ComputeHandle { + int nnXLen; + int nnYLen; + int policySize; + bool usingFP16; + Model* model; + + ComputeHandle( + ComputeContext* context, + const LoadedModel* loadedModel, + int gpuIdx + ) { + nnXLen = context->nnXLen; + nnYLen = context->nnYLen; + policySize = NNPos::getPolicySize(nnXLen, nnYLen); + model = new Model( + &(loadedModel->modelDesc), + gpuIdx, + context->onnxOptModelFile, + context->onnxRuntimeExecutionProvider, + context->homeDataDirOverride + ); + usingFP16 = model->getUsingFP16(); + } + ~ComputeHandle() { + delete model; + } + + ComputeHandle() = delete; + ComputeHandle(const ComputeHandle&) = delete; + ComputeHandle& operator=(const ComputeHandle&) = delete; +}; + +ComputeHandle* NeuralNet::createComputeHandle( + ComputeContext* context, + const LoadedModel* loadedModel, + Logger* logger, + int maxBatchSize, + bool requireExactNNLen, + bool inputsUseNHWC, + int gpuIdxForThisThread, + int serverThreadIdx +) { + (void)maxBatchSize; + (void)requireExactNNLen; + (void)inputsUseNHWC; + (void)serverThreadIdx; + + auto deviceStr = [&]() { + if(gpuIdxForThisThread < 0) + return string(""); + return " Device " + Global::intToString(gpuIdxForThisThread); + }; + + if(logger != NULL) { + logger->write("ONNXRuntime backend thread " + Global::intToString(serverThreadIdx) + ":" + deviceStr() + " Model version " + Global::intToString(loadedModel->modelDesc.version)); + logger->write("ONNXRuntime backend thread " + Global::intToString(serverThreadIdx) + ":" + deviceStr() + " Model name: " + loadedModel->modelDesc.name); + } + + ComputeHandle* handle = new ComputeHandle(context, loadedModel, gpuIdxForThisThread); + + if(logger != NULL) { + if(context->onnxRuntimeExecutionProvider == "CUDA") { + logger->write("ONNXRuntime: CUDA backend"); + } + else if(context->onnxRuntimeExecutionProvider == "TensorRT") { + logger->write("ONNXRuntime: TensorRT backend"); + } + else if(context->onnxRuntimeExecutionProvider == "DirectML") { + logger->write("ONNXRuntime: DirectML backend"); + } + else if(context->onnxRuntimeExecutionProvider == "MIGraphX") { + logger->write("ONNXRuntime: MIGraphX backend"); + } + else { + throw StringError("Invalid ONNXRuntime backend"); + } + } + + return handle; +} + +void NeuralNet::freeComputeHandle(ComputeHandle* handle) { + delete handle; +} + +//------------------------------------------------------------------------------ + +void NeuralNet::printDevices() { + +} + +#if defined(USE_ORT_CUDA) || defined(USE_ORT_TENSORRT) +void NeuralNet::printCUDADevices() { + int numDevices = 0; + cudaGetDeviceCount(&numDevices); + for(int i = 0; i vAdapters; + + if(FAILED(CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)&pFactory))) + { + throw StringError("Unable to create IDXGIFactory."); + } + + for(int i = 0; pFactory->EnumAdapters(i, &pAdapter) != DXGI_ERROR_NOT_FOUND; ++i) { + vAdapters.push_back(pAdapter); + } + + if(pFactory) { + pFactory->Release(); + } + + for(int i = 0; i < vAdapters.size(); i++) { + DXGI_ADAPTER_DESC adapterDescription; + 
vAdapters[i]->GetDesc(&adapterDescription); + wstring wsDeviceName(adapterDescription.Description); + string deviceName(wsDeviceName.begin(), wsDeviceName.end()); + if(deviceName != "Microsoft Basic Render Driver"){ + cout << "Found DirectML device " << i << ": " << deviceName.c_str() << endl; + } + } +} +#endif + +#ifdef USE_ORT_MIGRAPHX +void NeuralNet::printOpenCLDevices() { + vector devices = DeviceInfo::getAllDeviceInfosOnSystem(NULL); + for(int i = 0; imodelDesc; + + int xSize = nnXLen; + int ySize = nnYLen; + maxBatchSize = maxBatchSz; + + singleBinaryInputElts = (size_t)m.numInputChannels * xSize * ySize; + singleGlobalInputElts = (size_t)m.numInputGlobalChannels; + singlePolicyResultElts = (size_t)(1 + xSize * ySize); + singleValueResultElts = (size_t)m.numValueChannels; + singleScoreResultElts = (size_t)m.numScoreValueChannels; + singleOwnershipResultElts = (size_t)m.numOwnershipChannels * xSize * ySize; + + assert(NNModelVersion::getNumSpatialFeatures(m.version) == m.numInputChannels); + assert(NNModelVersion::getNumGlobalFeatures(m.version) == m.numInputGlobalChannels); + + userBinaryInputBuffer = new float[(size_t)maxBatchSize * singleBinaryInputElts]; + userGlobalInputBuffer = new float[(size_t)maxBatchSize * singleGlobalInputElts]; + policyResults = new float[(size_t)maxBatchSize * singlePolicyResultElts]; + valueResults = new float[(size_t)maxBatchSize * singleValueResultElts]; + scoreResults = new float[(size_t)maxBatchSize * singleScoreResultElts]; + ownershipResults = new float[(size_t)maxBatchSize * singleOwnershipResultElts]; + + userBinaryInputBufferFP16 = new uint16_t[(size_t)maxBatchSize * singleBinaryInputElts]; + userGlobalInputBufferFP16 = new uint16_t[(size_t)maxBatchSize * singleGlobalInputElts]; + policyResultsFP16 = new uint16_t[(size_t)maxBatchSize * singlePolicyResultElts]; + valueResultsFP16 = new uint16_t[(size_t)maxBatchSize * singleValueResultElts]; + scoreResultsFP16 = new uint16_t[(size_t)maxBatchSize * singleScoreResultElts]; + ownershipResultsFP16 = new uint16_t[(size_t)maxBatchSize * singleOwnershipResultElts]; + } + + void copyInputFloatToHalf(size_t batchSize) { + for(int i = 0; i < batchSize * singleBinaryInputElts; i++) { + userBinaryInputBufferFP16[i] = half_float::detail::float2half(userBinaryInputBuffer[i]); + } + for(int i = 0; i < batchSize * singleGlobalInputElts; i++) { + userGlobalInputBufferFP16[i] = half_float::detail::float2half(userGlobalInputBuffer[i]); + } + } + + void copyOutputHalfToFloat(size_t batchSize) { + for(int i = 0; i < batchSize * singlePolicyResultElts; i++) { + policyResults[i] = half_float::detail::half2float(policyResultsFP16[i]); + } + for(int i = 0; i < batchSize * singleValueResultElts; i++) { + valueResults[i] = half_float::detail::half2float(valueResultsFP16[i]); + } + for(int i = 0; i < batchSize * singleScoreResultElts; i++) { + scoreResults[i] = half_float::detail::half2float(scoreResultsFP16[i]); + } + for(int i = 0; i < batchSize * singleOwnershipResultElts; i++) { + ownershipResults[i] = half_float::detail::half2float(ownershipResultsFP16[i]); + } + } + + ~InputBuffers() { + delete[] userBinaryInputBuffer; + delete[] userGlobalInputBuffer; + delete[] policyResults; + delete[] valueResults; + delete[] scoreResults; + delete[] ownershipResults; + delete[] userBinaryInputBufferFP16; + delete[] userGlobalInputBufferFP16; + delete[] policyResultsFP16; + delete[] valueResultsFP16; + delete[] scoreResultsFP16; + delete[] ownershipResultsFP16; + } + + InputBuffers() = delete; + InputBuffers(const InputBuffers&) 
= delete; + InputBuffers& operator=(const InputBuffers&) = delete; +}; + +InputBuffers* NeuralNet::createInputBuffers(const LoadedModel* loadedModel, int maxBatchSize, int nnXLen, int nnYLen) { + return new InputBuffers(loadedModel,maxBatchSize,nnXLen,nnYLen); +} + +void NeuralNet::freeInputBuffers(InputBuffers* inputBuffers) { + delete inputBuffers; +} + +//------------------------------------------------------------------------------ + +void NeuralNet::getOutput( + ComputeHandle* gpuHandle, + InputBuffers* inputBuffers, + int numBatchEltsFilled, + NNResultBuf** inputBufs, + int symmetry, + vector& outputs +) { + assert(numBatchEltsFilled <= inputBuffers->maxBatchSize); + assert(numBatchEltsFilled > 0); + int batchSize = numBatchEltsFilled; + int nnXLen = gpuHandle->nnXLen; + int nnYLen = gpuHandle->nnYLen; + int version = gpuHandle->model->version; + bool usingFP16 = gpuHandle->usingFP16; + + int numSpatialFeatures = NNModelVersion::getNumSpatialFeatures(version); + int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(version); + assert(numSpatialFeatures == gpuHandle->model->numInputChannels); + assert(numSpatialFeatures * nnXLen * nnYLen == inputBuffers->singleBinaryInputElts); + assert(numGlobalFeatures == inputBuffers->singleGlobalInputElts); + + for(int nIdx = 0; nIdx < batchSize; nIdx++) { + float* rowSpatialInput = inputBuffers->userBinaryInputBuffer + (inputBuffers->singleBinaryInputElts * nIdx); + float* rowGlobalInput = inputBuffers->userGlobalInputBuffer + (inputBuffers->singleGlobalInputElts * nIdx); + + const float* rowGlobal = inputBufs[nIdx]->rowGlobal; + const float* rowSpatial = inputBufs[nIdx]->rowSpatial; + std::copy(rowGlobal, rowGlobal + numGlobalFeatures, rowGlobalInput); + SymmetryHelpers::copyInputsWithSymmetry(rowSpatial, rowSpatialInput, 1, nnYLen, nnXLen, numSpatialFeatures, false, symmetry); + } + + const int policySize = nnXLen * nnYLen + 1; + const int valueSize = gpuHandle->model->numValueChannels; + const int scoreSize = gpuHandle->model->numScoreValueChannels; + const int ownershipSize = nnXLen * nnYLen; + + assert(valueSize == 3); + assert(gpuHandle->model->numOwnershipChannels == 1); + + // input + vector> inputNodeShape(2); + vector inputNodeSizes(2); + vector inputTensors; + + for(int i = 0; i < 2; i++) { + Ort::TypeInfo typeInfo = gpuHandle->model->session->GetInputTypeInfo(i); + auto tensorInfo = typeInfo.GetTensorTypeAndShapeInfo(); + + // input node dimensions + inputNodeShape[i] = tensorInfo.GetShape(); + // This is -1, so should be manually assigned + inputNodeShape[i][0] = (int64_t)batchSize; + } + assert(inputNodeShape[0].size() == 4); + assert(inputNodeShape[0][1] == numSpatialFeatures); + // Dynamic input shape for onnx models without masking + inputNodeShape[0][2] = nnYLen; + inputNodeShape[0][3] = nnXLen; + assert(inputNodeShape[1].size() == 2); + assert(inputNodeShape[1][1] == numGlobalFeatures); + + inputNodeSizes[0] = (int64_t)(batchSize * inputBuffers->singleBinaryInputElts); + inputNodeSizes[1] = (int64_t)(batchSize * inputBuffers->singleGlobalInputElts); + + auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + + if(!usingFP16) { + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userBinaryInputBuffer, + inputNodeSizes[0], + inputNodeShape[0].data(), + inputNodeShape[0].size() + )); + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userGlobalInputBuffer, + inputNodeSizes[1], + inputNodeShape[1].data(), + inputNodeShape[1].size() 
+ )); + } + else { + inputBuffers->copyInputFloatToHalf(batchSize); + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userBinaryInputBufferFP16, + inputNodeSizes[0] * sizeof(uint16_t), + inputNodeShape[0].data(), + inputNodeShape[0].size(), + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 + )); + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userGlobalInputBufferFP16, + inputNodeSizes[1] * sizeof(uint16_t), + inputNodeShape[1].data(), + inputNodeShape[1].size(), + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 + )); + } + + vector inputVector(inputNodeSizes[1]); + for(int i = 0; i < inputNodeSizes[1]; i++) { + inputVector[i] = (inputBuffers->userGlobalInputBuffer[i]); + } + + // Evaluate + auto outputTensors = gpuHandle->model->evaluate(inputTensors); + + // collect outputs to vectors + if(!usingFP16){ + auto policy = outputTensors[0].GetTensorMutableData(); + auto value = outputTensors[1].GetTensorMutableData(); + auto score= outputTensors[2].GetTensorMutableData(); + auto ownership = outputTensors[3].GetTensorMutableData(); + std::copy(policy, policy + batchSize * policySize, inputBuffers->policyResults); + std::copy(value, value + batchSize * valueSize, inputBuffers->valueResults); + std::copy(score, score + batchSize * scoreSize, inputBuffers->scoreResults); + std::copy(ownership, ownership + batchSize * ownershipSize, inputBuffers->ownershipResults); + } + else { + auto policy = outputTensors[0].GetTensorMutableData(); + auto value = outputTensors[1].GetTensorMutableData(); + auto score = outputTensors[2].GetTensorMutableData(); + auto ownership = outputTensors[3].GetTensorMutableData(); + std::copy(policy, policy + batchSize * policySize, inputBuffers->policyResultsFP16); + std::copy(value, value + batchSize * valueSize, inputBuffers->valueResultsFP16); + std::copy(score, score + batchSize * scoreSize, inputBuffers->scoreResultsFP16); + std::copy(ownership, ownership + batchSize * ownershipSize, inputBuffers->ownershipResultsFP16); + inputBuffers->copyOutputHalfToFloat(batchSize); + } + + for(int row = 0; row < batchSize; row++) { + NNOutput* output = outputs[row]; + assert(output->nnXLen == nnXLen); + assert(output->nnYLen == nnYLen); + + // Policy + const float* policySrcBuf = inputBuffers->policyResults + row * policySize; + float* policyProbs = output->policyProbs; + + //These are not actually correct, the client does the postprocessing to turn them into + //policy probabilities and white game outcome probabilities + //Also we don't fill in the nnHash here either + SymmetryHelpers::copyOutputsWithSymmetry(policySrcBuf, policyProbs, 1, nnYLen, nnXLen, symmetry); + policyProbs[policySize-1] = policySrcBuf[policySize-1]; + + // Value + output->whiteWinProb = inputBuffers->valueResults[row * valueSize]; + output->whiteLossProb = inputBuffers->valueResults[row * valueSize + 1]; + output->whiteNoResultProb = inputBuffers->valueResults[row * valueSize + 2]; + + // Score + if(version >= 8) { + assert(scoreSize == 4); + output->whiteScoreMean = inputBuffers->scoreResults[row * scoreSize]; + output->whiteScoreMeanSq = inputBuffers->scoreResults[row * scoreSize + 1]; + output->whiteLead = inputBuffers->scoreResults[row * scoreSize + 2]; + output->varTimeLeft = inputBuffers->scoreResults[row * scoreSize + 3]; + } + else if(version >= 4) { + assert(scoreSize== 2); + output->whiteScoreMean = inputBuffers->scoreResults[row * scoreSize]; + output->whiteScoreMeanSq = inputBuffers->scoreResults[row * scoreSize + 1]; + output->whiteLead = 
output->whiteScoreMean; + output->varTimeLeft = 0; + } + else if(version >= 3) { + assert(scoreSize == 1); + output->whiteScoreMean = inputBuffers->scoreResults[row * scoreSize]; + //Version 3 neural nets don't have any second moment output, implicitly already folding it in, so we just use the mean squared + output->whiteScoreMeanSq = output->whiteScoreMean * output->whiteScoreMean; + output->whiteLead = output->whiteScoreMean; + output->varTimeLeft = 0; + } + else { + ASSERT_UNREACHABLE; + } + + // Ownership + //As above, these are NOT actually from white's perspective, but rather the player to move. + //As usual the client does the postprocessing. + if(output->whiteOwnerMap != NULL) { + const float* ownershipSrcBuf = inputBuffers->ownershipResults + row * ownershipSize; + SymmetryHelpers::copyOutputsWithSymmetry(ownershipSrcBuf, output->whiteOwnerMap, 1, nnYLen, nnXLen, symmetry); + } + } +} + +//------------------------------------------------------------------------------ + +bool NeuralNet::testEvaluateConv( + const ConvLayerDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const vector& inputBuffer, + vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)outputBuffer; + return false; +} + +bool NeuralNet::testEvaluateBatchNorm( + const BatchNormLayerDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const vector& inputBuffer, + const vector& maskBuffer, + vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)maskBuffer; + (void)outputBuffer; + return false; +} + +bool NeuralNet::testEvaluateResidualBlock( + const ResidualBlockDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const std::vector& inputBuffer, + const std::vector& maskBuffer, + std::vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)maskBuffer; + (void)outputBuffer; + return false; +} + +bool NeuralNet::testEvaluateGlobalPoolingResidualBlock( + const GlobalPoolingResidualBlockDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const vector& inputBuffer, + const vector& maskBuffer, + vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)maskBuffer; + (void)outputBuffer; + return false; +} + +#endif // USE_ONNXRUNTIME_BACKEND \ No newline at end of file diff --git a/cpp/program/gtpconfig.cpp b/cpp/program/gtpconfig.cpp index 1ad392c79..dc035254a 100644 --- a/cpp/program/gtpconfig.cpp +++ b/cpp/program/gtpconfig.cpp @@ -4,7 +4,7 @@ using namespace std; static const string gtpBase = R"%%( -# Logs and files-------------------------------------------------------------------------- +# Logs and files---------------------------------------------------------------- # Where to output log? logDir = gtp_logs # Each run of KataGo will log to a separate file in this dir @@ -18,7 +18,7 @@ logToStderr = false # Optionally override where KataGo will attempt to save things like openCLTuner files and other cached data. 
# homeDataDir = DIRECTORY -# Analysis------------------------------------------------------------------------------------ +# Analysis---------------------------------------------------------------------- # Configure the maximum length of analysis printed out by lz-analyze and other places. # Controls the number of moves after the first move in a variation. @@ -35,7 +35,7 @@ logToStderr = false # analysisWideRootNoise = 0.0 -# Default rules------------------------------------------------------------------------------------ +# Default rules----------------------------------------------------------------- # See https://lightvector.github.io/KataGo/rules.html for a description of the rules. # These rules are defaults and can be changed mid-run by several custom GTP commands. # See https://github.com/lightvector/KataGo/blob/master/docs/GTP_Extensions.md for those commands. @@ -52,9 +52,9 @@ logToStderr = false $$WHITE_HANDICAP_BONUS -# Bot behavior--------------------------------------------------------------------------------------- +# Bot behavior------------------------------------------------------------------ -# Resignation ------------- +# Resignation ------------------------- # Resignation occurs if for at least resignConsecTurns in a row, # the winLossUtility (which is on a [-1,1] scale) is below resignThreshold. @@ -64,7 +64,7 @@ resignConsecTurns = 3 # Uncomment to make katago not resign close games, behind by fewer than this many points # resignMinScoreDifference = 10 -# Handicap ------------- +# Handicap ---------------------------- # Assume that if black makes many moves in a row right at the start of the game, then the game is a handicap game. # This is necessary on some servers and for some GUIs and also when initializing from many SGF files, which may @@ -93,13 +93,13 @@ resignConsecTurns = 3 # playoutDoublingAdvantagePla = BLACK # playoutDoublingAdvantagePla = WHITE -# Misc Behavior -------------------- +# Misc Behavior ----------------------- # Uncomment and set to true to make KataGo avoid a particular joseki that some KataGo nets misevaluate, # and also to improve opening diversity versus some particular other bots that like to play it all the time. # avoidMYTDaggerHack = false -# Search limits----------------------------------------------------------------------------------- +# Search limits----------------------------------------------------------------- # For all of "maxVisits", "maxPlayouts", "maxTime", search will still try to follow GTP time controls and may make a move # faster than the specified max if GTP tells it that it is playing under a clock as well in the current game. @@ -128,7 +128,7 @@ searchFactorAfterTwoPass = 0.25 searchFactorWhenWinning = 0.40 searchFactorWhenWinningThreshold = 0.95 -# GPU Settings------------------------------------------------------------------------------- +# GPU Settings------------------------------------------------------------------ # Maximum number of positions to send to a single GPU at once. # The default value here is roughly equal to numSearchThreads, but you can specify it manually @@ -145,8 +145,14 @@ nnMutexPoolSizePowerOfTwo = $$NN_MUTEX_POOL_SIZE_POWER_OF_TWO $$MULTIPLE_GPUS +# ONNXRuntime Backend Settings-------------------------------------------------- +# Execution provider for the ONNXRuntime backend. 
+# Currently available options for this binary are: +# $$ONNXRUNTIME_AVAILABLE_EXECUTION_PROVIDERS +onnxOptModelFile = opt_model.onnx +onnxRuntimeExecutionProvider = $$ONNXRUNTIME_EXECUTION_PROVIDER -# Internal params------------------------------------------------------------------------------ +# Internal params--------------------------------------------------------------- # Uncomment and edit any of the below values to change them from their default. # How big to make the mutex pool for search synchronization @@ -161,6 +167,7 @@ string GTPConfig::makeConfig( int64_t maxPlayouts, double maxTime, double maxPonderTime, + string configOnnxRuntimeExecutionProvider, std::vector deviceIdxs, int nnCacheSizePowerOfTwo, int nnMutexPoolSizePowerOfTwo, @@ -214,6 +221,31 @@ string GTPConfig::makeConfig( replace("$$NN_CACHE_SIZE_POWER_OF_TWO", Global::intToString(nnCacheSizePowerOfTwo)); replace("$$NN_MUTEX_POOL_SIZE_POWER_OF_TWO", Global::intToString(nnMutexPoolSizePowerOfTwo)); +#ifdef USE_ONNXRUNTIME_BACKEND + vector availableExecutionProviders; + #ifdef USE_ORT_CUDA + availableExecutionProviders.push_back("CUDA"); + #endif + #ifdef USE_ORT_TENSORRT + availableExecutionProviders.push_back("TensorRT"); + #endif + #ifdef USE_ORT_DIRECTML + availableExecutionProviders.push_back("DirectML"); + #endif + #ifdef USE_ORT_MIGRAPHX + availableExecutionProviders.push_back("MIGraphX"); + #endif + string providers = ""; + for(int i = 0; i < availableExecutionProviders.size(); i++) { + providers += availableExecutionProviders[i]; + if(i < availableExecutionProviders.size() - 1){ + providers += ", "; + } + } + replace("$$ONNXRUNTIME_AVAILABLE_EXECUTION_PROVIDERS", providers); + replace("$$ONNXRUNTIME_EXECUTION_PROVIDER", configOnnxRuntimeExecutionProvider); +#endif + if(deviceIdxs.size() <= 0) { replace("$$MULTIPLE_GPUS", ""); } @@ -227,6 +259,9 @@ string GTPConfig::makeConfig( #endif #ifdef USE_OPENCL_BACKEND replacement += "openclDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n"; +#endif +#ifdef USE_ONNXRUNTIME_BACKEND + replacement += "onnxruntimeDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n"; #endif } replace("$$MULTIPLE_GPUS", replacement); diff --git a/cpp/program/gtpconfig.h b/cpp/program/gtpconfig.h index f70e329b5..4a290c5c6 100644 --- a/cpp/program/gtpconfig.h +++ b/cpp/program/gtpconfig.h @@ -11,6 +11,7 @@ namespace GTPConfig { int64_t maxPlayouts, double maxTime, double maxPonderTime, + std::string configOnnxRuntimeExecutionProvider, std::vector deviceIdxs, int nnCacheSizePowerOfTwo, int nnMutexPoolSizePowerOfTwo, diff --git a/cpp/program/setup.cpp b/cpp/program/setup.cpp index 58feba29a..7bcf1da27 100644 --- a/cpp/program/setup.cpp +++ b/cpp/program/setup.cpp @@ -52,6 +52,8 @@ vector Setup::initializeNNEvaluators( string backendPrefix = "opencl"; #elif defined(USE_EIGEN_BACKEND) string backendPrefix = "eigen"; + #elif defined(USE_ONNXRUNTIME_BACKEND) + string backendPrefix = "onnxruntime"; #else string backendPrefix = "dummybackend"; #endif @@ -64,6 +66,8 @@ vector Setup::initializeNNEvaluators( cfg.markAllKeysUsedWithPrefix("opencl"); if(backendPrefix != "eigen") cfg.markAllKeysUsedWithPrefix("eigen"); + if(backendPrefix != "onnxruntime") + cfg.markAllKeysUsedWithPrefix("onnxruntime"); if(backendPrefix != "dummybackend") cfg.markAllKeysUsedWithPrefix("dummybackend"); @@ -108,7 +112,7 @@ vector Setup::initializeNNEvaluators( requireExactNNLen = cfg.getBool("requireMaxBoardSize"); } - bool inputsUseNHWC = 
backendPrefix == "opencl" ? false : true; + bool inputsUseNHWC = (backendPrefix == "opencl" || backendPrefix == "onnxruntime") ? false : true; if(cfg.contains(backendPrefix+"InputsUseNHWC"+idxStr)) inputsUseNHWC = cfg.getBool(backendPrefix+"InputsUseNHWC"+idxStr); else if(cfg.contains("inputsUseNHWC"+idxStr)) @@ -206,6 +210,14 @@ vector Setup::initializeNNEvaluators( bool openCLReTunePerBoardSize = false; if(cfg.contains("openclReTunePerBoardSize")) openCLReTunePerBoardSize = cfg.getBool("openclReTunePerBoardSize"); + + string onnxOptModelFile; + if(cfg.contains("onnxOptModelFile")) + onnxOptModelFile = cfg.getString("onnxOptModelFile"); + + string onnxRuntimeExecutionProvider; + if(cfg.contains("onnxRuntimeExecutionProvider")) + onnxRuntimeExecutionProvider = cfg.getString("onnxRuntimeExecutionProvider"); enabled_t useFP16Mode = enabled_t::Auto; if(cfg.contains(backendPrefix+"UseFP16-"+idxStr)) @@ -294,6 +306,8 @@ vector Setup::initializeNNEvaluators( nnMutexPoolSizePowerOfTwo, debugSkipNeuralNet, openCLTunerFile, + onnxOptModelFile, + onnxRuntimeExecutionProvider, homeDataDirOverride, openCLReTunePerBoardSize, useFP16Mode, diff --git a/cpp/tests/testsearch.cpp b/cpp/tests/testsearch.cpp index 34071d9e0..2e33f9bd4 100644 --- a/cpp/tests/testsearch.cpp +++ b/cpp/tests/testsearch.cpp @@ -132,6 +132,8 @@ static NNEvaluator* startNNEval( bool openCLReTunePerBoardSize = false; const string& modelName = modelFile; const string openCLTunerFile = ""; + const string onnxOptModelFile = ""; + const string onnxRuntimeExecutionProvider = "DirectML"; const string homeDataDirOverride = ""; int numNNServerThreadsPerModel = 1; bool nnRandomize = false; @@ -156,6 +158,8 @@ static NNEvaluator* startNNEval( nnMutexPoolSizePowerOfTwo, debugSkipNeuralNet, openCLTunerFile, + onnxOptModelFile, + onnxRuntimeExecutionProvider, homeDataDirOverride, openCLReTunePerBoardSize, useFP16 ? enabled_t::True : enabled_t::False, diff --git a/cpp/tests/testtrainingwrite.cpp b/cpp/tests/testtrainingwrite.cpp index 856159db4..d588ccb38 100644 --- a/cpp/tests/testtrainingwrite.cpp +++ b/cpp/tests/testtrainingwrite.cpp @@ -24,6 +24,8 @@ static NNEvaluator* startNNEval( int nnMutexPoolSizePowerOfTwo = 12; bool debugSkipNeuralNet = modelFile == "/dev/null"; const string openCLTunerFile = ""; + const string onnxOptModelFile = ""; + const string onnxRuntimeExecutionProvider = ""; const string homeDataDirOverride = ""; bool openCLReTunePerBoardSize = false; int numNNServerThreadsPerModel = 1; @@ -43,6 +45,8 @@ static NNEvaluator* startNNEval( nnMutexPoolSizePowerOfTwo, debugSkipNeuralNet, openCLTunerFile, + onnxOptModelFile, + onnxRuntimeExecutionProvider, homeDataDirOverride, openCLReTunePerBoardSize, useFP16 ? enabled_t::True : enabled_t::False,