diff --git a/.gitignore b/.gitignore index 29b52cbf6..178859304 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ out.txt # For clion IDE .idea +# For vscode +.vscode + # For cmake CMakeCache.txt CMakeFiles/ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 375dc3808..eb458c260 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -23,25 +23,9 @@ if(NOT WIN32) set(ColorBoldRed "${ColorRed}${ColorBold}") endif() -#--------------------------- CMAKE VARIABLES (partly for Cmake GUI) ---------------------------------------------------- - -set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training") -set(USE_BACKEND CACHE STRING "Neural net backend") -string(TOUPPER "${USE_BACKEND}" USE_BACKEND) -set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA OPENCL EIGEN) - -set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc") -set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe") -set(USE_AVX2 0 CACHE BOOL "Compile with AVX2") -set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 29. Compiling with this Will use more memory and slow down KataGo, even when playing on boards of size 19.") - -#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------ - -message(STATUS "Building 'katago' executable for GTP engine and other tools.") -if(USE_BACKEND STREQUAL "CUDA") - message(STATUS "-DUSE_BACKEND=CUDA, using CUDA backend.") - +#--------------------------- CUDA MACRO ------------------------------------------------------------------------------- +macro(CONFIGURE_CUDA) # Ensure dynamic cuda linking (Versions prior to 3.17) if (${CMAKE_VERSION} VERSION_LESS "3.17") set(CMAKE_CUDA_FLAGS "" CACHE STRING "") @@ -146,6 +130,26 @@ if(USE_BACKEND STREQUAL "CUDA") " ) endif() +endmacro() + +#--------------------------- CMAKE VARIABLES (partly for Cmake GUI) ---------------------------------------------------- + +set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training") +set(USE_BACKEND CACHE STRING "Neural net backend") +string(TOUPPER "${USE_BACKEND}" USE_BACKEND) +set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA OPENCL EIGEN ONNXRUNTIME) + +set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc") +set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe") +set(USE_AVX2 0 CACHE BOOL "Compile with AVX2") +set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 29. 
Compiling with this Will use more memory and slow down KataGo, even when playing on boards of size 19.") + +#--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------ + +message(STATUS "Building 'katago' executable for GTP engine and other tools.") +if(USE_BACKEND STREQUAL "CUDA") + message(STATUS "-DUSE_BACKEND=CUDA, using CUDA backend.") + configure_cuda() elseif(USE_BACKEND STREQUAL "OPENCL") message(STATUS "-DUSE_BACKEND=OPENCL, using OpenCL backend.") set(NEURALNET_BACKEND_SOURCES @@ -162,8 +166,28 @@ elseif(USE_BACKEND STREQUAL "EIGEN") set(NEURALNET_BACKEND_SOURCES neuralnet/eigenbackend.cpp ) +elseif(USE_BACKEND STREQUAL "ONNXRUNTIME") + message(STATUS "-DUSE_BACKEND=ONNXRUNTIME, using ONNXRuntime backend.") + set(ORT_CUDA 0 CACHE BOOL "Use CUDA execution provider for ONNXRuntime.") + set(ORT_TENSORRT 0 CACHE BOOL "Use TensorRT execution provider for ONNXRuntime.") + set(ORT_DIRECTML 0 CACHE BOOL "Use DirectML execution provider for ONNXRuntime.") + set(ORT_MIGRAPHX 0 CACHE BOOL "Use MIGraphX execution provider for ONNXRuntime.") + if(ORT_CUDA OR ORT_TENSORRT) + configure_cuda() + endif() + if(ORT_MIGRAPHX) + set(NEURALNET_BACKEND_SOURCES + neuralnet/ortbackend.cpp + neuralnet/openclhelpers.cpp + ) + else() + set(NEURALNET_BACKEND_SOURCES + neuralnet/ortbackend.cpp + ) + endif() + elseif(USE_BACKEND STREQUAL "") - message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}") + message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. 
To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=ONNXRUNTIME or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}") set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp) else() message(FATAL_ERROR "Unrecognized backend: " ${USE_BACKEND}) @@ -331,6 +355,66 @@ elseif(USE_BACKEND STREQUAL "EIGEN") endif() endif() endif() +elseif(USE_BACKEND STREQUAL "ONNXRUNTIME") + target_compile_definitions(katago PRIVATE USE_ONNXRUNTIME_BACKEND) + set(ORT_LIB_DIR CACHE STRING "ONNXRuntime library location") + set(ORT_INCLUDE_DIR CACHE STRING "ONNXRuntime header files location") + message(STATUS "ORT_LIB_DIR: " ${ORT_LIB_DIR}) + message(STATUS "ORT_INCLUDE_DIR: " ${ORT_INCLUDE_DIR}) + include_directories(${ORT_INCLUDE_DIR}) + if(EXISTS ${ORT_INCLUDE_DIR}/core/session) + include_directories(${ORT_INCLUDE_DIR}/core/session) + endif() + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/cpu) + include_directories(${ORT_INCLUDE_DIR}/core/providers/cpu) + endif() + find_library(ORT_LIBRARY NAMES onnxruntime PATHS ${ORT_LIB_DIR}) + if(NOT ORT_LIBRARY) + message(FATAL_ERROR "Could not find onnxruntime") + endif() + target_link_libraries(katago ${ORT_LIBRARY}) + if(ORT_CUDA) + target_compile_definitions(katago PRIVATE USE_ORT_CUDA) + endif() + if(ORT_TENSORRT) + target_compile_definitions(katago PRIVATE USE_ORT_TENSORRT) + set(TENSORRT_LIB_DIR CACHE STRING "TensorRT library location") + set(TENSORRT_INCLUDE_DIR CACHE STRING "TensorRT header file location") + include_directories(${TENSORRT_INCLUDE_DIR}) + find_library(TENSORRT_LIBRARY NAMES nvinfer PATHS ${TENSORRT_LIB_DIR}) + if(NOT TENSORRT_LIBRARY) + message(FATAL_ERROR "Could not find nvinfer") + endif() + target_link_libraries(katago ${TENSORRT_LIBRARY}) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/tensorrt) + include_directories(${ORT_INCLUDE_DIR}/core/providers/tensorrt) + endif() + endif() + if(ORT_CUDA OR ORT_TENSORRT) + find_package(CUDA REQUIRED) + find_path(CUDNN_INCLUDE_DIR cudnn.h HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES cuda/include include) + if((NOT CUDNN_INCLUDE_DIR)) + message(ERROR "${ColorBoldRed} cudnn.h was NOT found, specify CUDNN_INCLUDE_DIR to indicate where it is. ${ColorReset}") + endif() + find_library(CUDNN_LIBRARY libcudnn.so PATHS /usr/local/cuda/lib64 /opt/cuda/lib64) + include_directories(SYSTEM ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIR}) #SYSTEM is for suppressing some compiler warnings in thrust libraries + target_link_libraries(katago ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_LIBRARIES}) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/cuda) + include_directories(${ORT_INCLUDE_DIR}/core/providers/cuda) + endif() + endif() + if(ORT_DIRECTML) + target_compile_definitions(katago PRIVATE USE_ORT_DIRECTML) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/directml) + include_directories(${ORT_INCLUDE_DIR}/core/providers/directml) + endif() + endif() + if(ORT_MIGRAPHX) + target_compile_definitions(katago PRIVATE USE_ORT_MIGRAPHX) + if(EXISTS ${ORT_INCLUDE_DIR}/core/providers/migraphx) + include_directories(${ORT_INCLUDE_DIR}/core/providers/migraphx) + endif() + endif() endif() if(USE_BIGGER_BOARDS_EXPENSIVE) diff --git a/cpp/command/benchmark.cpp b/cpp/command/benchmark.cpp index 65504d6a9..f7dee0b6a 100644 --- a/cpp/command/benchmark.cpp +++ b/cpp/command/benchmark.cpp @@ -209,6 +209,10 @@ int MainCmds::benchmark(int argc, const char* const* argv) { cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. 
RTX2080), " << "using the Cuda version of KataGo instead may give a mild performance boost." << endl; #endif +#ifdef USE_ONNXRUNTIME_BACKEND + cout << "You are currently using the ONNXRuntime version of KataGo with " + << nnEval->getOnnxRuntimeExecutionProvider() << " execution provider." << endl; +#endif #ifdef USE_EIGEN_BACKEND cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl; #endif @@ -564,6 +568,7 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm int64_t configMaxPlayouts = ((int64_t)1) << 50; double configMaxTime = 1e20; double configMaxPonderTime = -1.0; + string configOnnxRuntimeExecutionProvider; vector configDeviceIdxs; int configNNCacheSizePowerOfTwo = 20; int configNNMutexPoolSizePowerOfTwo = 16; @@ -693,6 +698,41 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm }); } +#ifdef USE_ONNXRUNTIME_BACKEND + cout << endl; + cout << "=========================================================================" << endl; + cout << "ONNXRUNTIME EXECUTION PROVIDER" << endl; + + { + vector executionProviders; +#ifdef USE_ORT_CUDA + executionProviders.push_back("CUDA"); +#endif +#ifdef USE_ORT_TENSORRT + executionProviders.push_back("TensorRT"); +#endif +#ifdef USE_ORT_DIRECTML + executionProviders.push_back("DirectML"); +#endif +#ifdef USE_ORT_MIGRAPHX + executionProviders.push_back("MIGraphX"); +#endif + + cout << endl; + cout << "Available ONNXRuntime execution providers:" << endl; + for(const auto provider: executionProviders) { + cout << provider << " "; + } + cout << endl << endl; + + string prompt = "Specify an execution provider for ONNXRuntime. Leave blank to use the first available provider.\n"; + promptAndParseInput(prompt, [&](const string& line) { + if(line == "") configOnnxRuntimeExecutionProvider = executionProviders[0]; + else configOnnxRuntimeExecutionProvider = line; + }); + } +#endif + cout << endl; cout << "=========================================================================" << endl; cout << "GPUS AND RAM" << endl; @@ -701,7 +741,11 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm { cout << endl; cout << "Finding available GPU-like devices..." 
<< endl; + #ifndef USE_ONNXRUNTIME_BACKEND NeuralNet::printDevices(); + #else + NeuralNet::printDevices(configOnnxRuntimeExecutionProvider); + #endif cout << endl; string prompt = @@ -789,6 +833,7 @@ int MainCmds::genconfig(int argc, const char* const* argv, const char* firstComm configMaxPlayouts, configMaxTime, configMaxPonderTime, + configOnnxRuntimeExecutionProvider, configDeviceIdxs, configNNCacheSizePowerOfTwo, configNNMutexPoolSizePowerOfTwo, diff --git a/cpp/neuralnet/cudabackend.cpp b/cpp/neuralnet/cudabackend.cpp index ce01d610d..d65fb4d90 100644 --- a/cpp/neuralnet/cudabackend.cpp +++ b/cpp/neuralnet/cudabackend.cpp @@ -2584,6 +2584,8 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -2593,6 +2595,8 @@ ComputeContext* NeuralNet::createComputeContext( (void)gpuIdxs; (void)logger; (void)openCLTunerFile; + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; (void)homeDataDirOverride; (void)openCLReTunePerBoardSize; (void)loadedModel; diff --git a/cpp/neuralnet/dummybackend.cpp b/cpp/neuralnet/dummybackend.cpp index 72817fdf1..fc3fd1ec4 100644 --- a/cpp/neuralnet/dummybackend.cpp +++ b/cpp/neuralnet/dummybackend.cpp @@ -19,6 +19,8 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -30,6 +32,8 @@ ComputeContext* NeuralNet::createComputeContext( (void)nnXLen; (void)nnYLen; (void)openCLTunerFile; + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; (void)homeDataDirOverride; (void)openCLReTunePerBoardSize; (void)useFP16Mode; diff --git a/cpp/neuralnet/eigenbackend.cpp b/cpp/neuralnet/eigenbackend.cpp index bf1b9c0ff..97292e7e1 100644 --- a/cpp/neuralnet/eigenbackend.cpp +++ b/cpp/neuralnet/eigenbackend.cpp @@ -1429,6 +1429,8 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -1438,6 +1440,8 @@ ComputeContext* NeuralNet::createComputeContext( (void)gpuIdxs; (void)logger; (void)openCLTunerFile; + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; (void)homeDataDirOverride; (void)openCLReTunePerBoardSize; diff --git a/cpp/neuralnet/nneval.cpp b/cpp/neuralnet/nneval.cpp index 0f957073e..0df52a318 100644 --- a/cpp/neuralnet/nneval.cpp +++ b/cpp/neuralnet/nneval.cpp @@ -66,6 +66,8 @@ NNEvaluator::NNEvaluator( int nnMutexPoolSizePowerofTwo, bool skipNeuralNet, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -83,6 +85,7 @@ NNEvaluator::NNEvaluator( requireExactNNLen(rExactNNLen), policySize(NNPos::getPolicySize(xLen,yLen)), inputsUseNHWC(iUseNHWC), + ortExecutionProvider(onnxRuntimeExecutionProvider), usingFP16Mode(useFP16Mode), usingNHWCMode(useNHWCMode), numThreads(numThr), @@ -145,8 +148,8 @@ NNEvaluator::NNEvaluator( inputsVersion = NNModelVersion::getInputsVersion(modelVersion); computeContext = NeuralNet::createComputeContext( 
gpuIdxs,logger,nnXLen,nnYLen, - openCLTunerFile,homeDataDirOverride,openCLReTunePerBoardSize, - usingFP16Mode,usingNHWCMode,loadedModel + openCLTunerFile,onnxOptModelFile,onnxRuntimeExecutionProvider, + homeDataDirOverride,openCLReTunePerBoardSize,usingFP16Mode,usingNHWCMode,loadedModel ); } else { @@ -224,6 +227,9 @@ int NNEvaluator::getNNXLen() const { int NNEvaluator::getNNYLen() const { return nnYLen; } +string NNEvaluator::getOnnxRuntimeExecutionProvider() const{ + return ortExecutionProvider; +} enabled_t NNEvaluator::getUsingFP16Mode() const { return usingFP16Mode; } diff --git a/cpp/neuralnet/nneval.h b/cpp/neuralnet/nneval.h index 35a6b4d31..b5b60c027 100644 --- a/cpp/neuralnet/nneval.h +++ b/cpp/neuralnet/nneval.h @@ -89,6 +89,8 @@ class NNEvaluator { int nnMutexPoolSizePowerofTwo, bool debugSkipNeuralNet, const std::string& openCLTunerFile, + const std::string& onnxOptModelFile, + const std::string& onnxRuntimeExecutionProvider, const std::string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, @@ -113,6 +115,7 @@ class NNEvaluator { int getNumServerThreads() const; int getNNXLen() const; int getNNYLen() const; + std::string getOnnxRuntimeExecutionProvider() const; enabled_t getUsingFP16Mode() const; enabled_t getUsingNHWCMode() const; @@ -172,6 +175,7 @@ class NNEvaluator { const bool requireExactNNLen; const int policySize; const bool inputsUseNHWC; + const std::string ortExecutionProvider; const enabled_t usingFP16Mode; const enabled_t usingNHWCMode; int numThreads; diff --git a/cpp/neuralnet/nninterface.h b/cpp/neuralnet/nninterface.h index a9c53e509..249230abe 100644 --- a/cpp/neuralnet/nninterface.h +++ b/cpp/neuralnet/nninterface.h @@ -36,6 +36,16 @@ namespace NeuralNet { //Print available backend devices void printDevices(); + void printDevices(const std::string& ortExecutionProvider); + #if defined(USE_ORT_CUDA) || defined(USE_ORT_TENSORRT) + void printCUDADevices(); + #endif + #ifdef USE_ORT_DIRECTML + void printDirectMLDevices(); + #endif + #ifdef USE_ORT_MIGRAPHX + void printOpenCLDevices(); + #endif // Model I/O ----------------------------------------------------------------- @@ -59,6 +69,8 @@ namespace NeuralNet { int nnXLen, int nnYLen, const std::string& openCLTunerFile, + const std::string& onnxOptModelFile, + const std::string& onnxRuntimeExecutionProvider, const std::string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, diff --git a/cpp/neuralnet/openclbackend.cpp b/cpp/neuralnet/openclbackend.cpp index 8372142c5..9d60857e3 100644 --- a/cpp/neuralnet/openclbackend.cpp +++ b/cpp/neuralnet/openclbackend.cpp @@ -394,12 +394,17 @@ ComputeContext* NeuralNet::createComputeContext( int nnXLen, int nnYLen, const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, const string& homeDataDirOverride, bool openCLReTunePerBoardSize, enabled_t useFP16Mode, enabled_t useNHWCMode, const LoadedModel* loadedModel ) { + (void)onnxOptModelFile; + (void)onnxRuntimeExecutionProvider; + if(gpuIdxs.size() <= 0) throw StringError("NeuralNet::createComputeContext - specified no gpus to use"); diff --git a/cpp/neuralnet/ortbackend.cpp b/cpp/neuralnet/ortbackend.cpp new file mode 100644 index 000000000..7d656db7e --- /dev/null +++ b/cpp/neuralnet/ortbackend.cpp @@ -0,0 +1,851 @@ +#ifdef USE_ONNXRUNTIME_BACKEND + +#include "../neuralnet/nninterface.h" +#include "../neuralnet/nninputs.h" +#include "../neuralnet/nneval.h" +#include "../neuralnet/modelversion.h" +#include 
"../core/makedir.h" +#include "../dataio/homedata.h" + +#include "../external/half-2.1.0/include/half.hpp" +#include +#include + +#if defined(USE_ORT_CUDA) || defined(USE_ORT_TENSORRT) + #include + #include "../neuralnet/cudaincludes.h" +#endif +#ifdef USE_ORT_TENSORRT + #include +#endif +#ifdef USE_ORT_DIRECTML + #include + #include +#endif +#ifdef USE_ORT_MIGRAPHX + #include + #include "../neuralnet/openclincludes.h" + #include "../neuralnet/openclhelpers.h" +#endif + +using namespace std; + +//------------------------------------------------------------------------------ + +void NeuralNet::globalInitialize() { +} + +void NeuralNet::globalCleanup() { +} + +//------------------------------------------------------------------------------ + +// Model itself is loaded in ComputeHandle instead +struct LoadedModel { + ModelDesc modelDesc; + + // This is not optimal of course, we can probably tar .json and .onnx together? + // Or the ONNX file itself can be parsed. + LoadedModel(const string& fileName) { + modelDesc.name = fileName; + modelDesc.version = 8; + modelDesc.numInputChannels = 22; + modelDesc.numInputGlobalChannels = 19; + modelDesc.numValueChannels = 3; + modelDesc.numOwnershipChannels = 1; + modelDesc.numScoreValueChannels = 4; + } + + LoadedModel() = delete; + LoadedModel(const LoadedModel&) = delete; + LoadedModel& operator=(const LoadedModel&) = delete; +}; + +LoadedModel* NeuralNet::loadModelFile(const string& file) { + LoadedModel* loadedModel = new LoadedModel(file); + return loadedModel; +} + +void NeuralNet::freeLoadedModel(LoadedModel* loadedModel) { + delete loadedModel; +} + +string NeuralNet::getModelName(const LoadedModel* loadedModel) { + return loadedModel->modelDesc.name; +} + +int NeuralNet::getModelVersion(const LoadedModel* loadedModel) { + return loadedModel->modelDesc.version; +} + +Rules NeuralNet::getSupportedRules(const LoadedModel* loadedModel, const Rules& desiredRules, bool& supported) { + return loadedModel->modelDesc.getSupportedRules(desiredRules, supported); +} + +//------------------------------------------------------------------------------ + +std::unique_ptr < Ort::Env> env = nullptr; + +struct Model { + string name; + int version; + int numInputChannels; + int numInputGlobalChannels; + int numValueChannels; + int numScoreValueChannels; + int numOwnershipChannels; + + Ort::Session* session; + + Model( + const ModelDesc* desc, + int gpuIdx, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, + const string& homeDataDirOverride + ) { + name = desc->name; + version = desc->version; + numInputChannels = desc->numInputChannels; + numInputGlobalChannels = desc->numInputGlobalChannels; + numValueChannels = desc->numValueChannels; + numScoreValueChannels = desc->numScoreValueChannels; + numOwnershipChannels = desc->numOwnershipChannels; + + auto envLocal = std::make_unique(ORT_LOGGING_LEVEL_ERROR, "Default"); + env = std::move(envLocal); + Ort::SessionOptions sf; + sf.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + sf.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + string dir = HomeData::getHomeDataDir(true, homeDataDirOverride); + MakeDir::make(dir); + string optModelPath = dir + "/" + onnxOptModelFile; +#ifdef _WIN32 + std::wstring optModelFile = std::wstring(optModelPath.begin(), optModelPath.end()); + sf.SetOptimizedModelFilePath(optModelFile.data()); +#else + sf.SetOptimizedModelFilePath(optModelPath.data()); +#endif + + if(onnxRuntimeExecutionProvider == "CUDA") { + #ifdef USE_ORT_CUDA + 
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with CUDA support."); + #endif + } + else if(onnxRuntimeExecutionProvider == "TensorRT") { + #ifdef USE_ORT_TENSORRT + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, gpuIdx)); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with TensorRT support."); + #endif + } + else if(onnxRuntimeExecutionProvider == "DirectML") { + #ifdef USE_ORT_DIRECTML + sf.DisableMemPattern(); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with DirectML support."); + #endif + } + else if(onnxRuntimeExecutionProvider == "MIGraphX") { + #ifdef USE_ORT_MIGRAPHX + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sf, gpuIdx)); + #else + throw StringError("KataGo was not compiled with MIGraphX support."); + #endif + } + else { + throw StringError("Invalid ONNXRuntime backend"); + } + +#ifdef _WIN32 + std::wstring modelName = std::wstring(name.begin(), name.end()); + session = new Ort::Session(*env, modelName.data(), sf); +#else + session = new Ort::Session(*env, name.data(), sf); +#endif + + Ort::AllocatorWithDefaultOptions allocator; + + // input nodes + numInputNodes = session->GetInputCount(); + assert(numInputNodes == 2); + + for(int inputIdx = 0; inputIdx < numInputNodes; inputIdx++) { + inputNodeNames.emplace_back(session->GetInputName(inputIdx, allocator)); + } + + // output nodes + numOutputNodes = session->GetOutputCount(); + + for(int outputIdx = 0; outputIdx < numOutputNodes; outputIdx++) { + outputNodeNames.emplace_back(session->GetOutputName(outputIdx, allocator)); + } + } + + bool getUsingFP16() { + Ort::TypeInfo typeInfo = session->GetInputTypeInfo(0); + auto tensorInfo = typeInfo.GetTensorTypeAndShapeInfo(); + auto type = tensorInfo.GetElementType(); + return type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; + } + + vector evaluate(vector& inputTensors) { + auto outputTensors = session->Run( + Ort::RunOptions{nullptr}, + inputNodeNames.data(), + inputTensors.data(), + inputTensors.size(), + outputNodeNames.data(), + outputNodeNames.size() + ); + + return outputTensors; + } + + Model() = delete; + Model(const Model&) = delete; + Model& operator=(const Model&) = delete; + + ~Model() { + session->release(); + } + +private: + size_t numInputNodes; + size_t numOutputNodes; + vector inputNodeNames; + vector outputNodeNames; +}; + +//------------------------------------------------------------------------------ + +struct ComputeContext { + int nnXLen; + int nnYLen; + enabled_t usingFP16; + string onnxOptModelFile; + string onnxRuntimeExecutionProvider; + string homeDataDirOverride; + + ComputeContext( + int nnX, + int nnY, + const string& optModelFile, + const string& runtimeExecutionProvider, + const string& homeDataDir, + enabled_t useFP16 + ) { + nnXLen = nnX; + nnYLen = nnY; + onnxOptModelFile = optModelFile; + onnxRuntimeExecutionProvider = runtimeExecutionProvider; + homeDataDirOverride = homeDataDir; + usingFP16 = useFP16; + } +}; + +ComputeContext* NeuralNet::createComputeContext( + const std::vector& gpuIdxs, + Logger* logger, + int nnXLen, + int nnYLen, + const string& openCLTunerFile, + const string& onnxOptModelFile, + const string& onnxRuntimeExecutionProvider, + const string& homeDataDirOverride, + bool openCLReTunePerBoardSize, + enabled_t useFP16Mode, + 
enabled_t useNHWCMode, + const LoadedModel* loadedModel +) { + (void)gpuIdxs; + (void)logger; + (void)openCLTunerFile; + (void)openCLReTunePerBoardSize; + (void)useNHWCMode; + (void)loadedModel; + + return new ComputeContext( + nnXLen, nnYLen, onnxOptModelFile, onnxRuntimeExecutionProvider, homeDataDirOverride, useFP16Mode + ); +} + +void NeuralNet::freeComputeContext(ComputeContext* computeContext) { + delete computeContext; +} + +//------------------------------------------------------------------------------ + +struct ComputeHandle { + int nnXLen; + int nnYLen; + int policySize; + bool usingFP16; + Model* model; + + ComputeHandle( + ComputeContext* context, + const LoadedModel* loadedModel, + int gpuIdx + ) { + nnXLen = context->nnXLen; + nnYLen = context->nnYLen; + policySize = NNPos::getPolicySize(nnXLen, nnYLen); + model = new Model( + &(loadedModel->modelDesc), + gpuIdx, + context->onnxOptModelFile, + context->onnxRuntimeExecutionProvider, + context->homeDataDirOverride + ); + usingFP16 = model->getUsingFP16(); + } + ~ComputeHandle() { + delete model; + } + + ComputeHandle() = delete; + ComputeHandle(const ComputeHandle&) = delete; + ComputeHandle& operator=(const ComputeHandle&) = delete; +}; + +ComputeHandle* NeuralNet::createComputeHandle( + ComputeContext* context, + const LoadedModel* loadedModel, + Logger* logger, + int maxBatchSize, + bool requireExactNNLen, + bool inputsUseNHWC, + int gpuIdxForThisThread, + int serverThreadIdx +) { + (void)maxBatchSize; + (void)requireExactNNLen; + (void)inputsUseNHWC; + (void)serverThreadIdx; + + auto deviceStr = [&]() { + if(gpuIdxForThisThread < 0) + return string(""); + return " Device " + Global::intToString(gpuIdxForThisThread); + }; + + if(logger != NULL) { + logger->write("ONNXRuntime backend thread " + Global::intToString(serverThreadIdx) + ":" + deviceStr() + " Model version " + Global::intToString(loadedModel->modelDesc.version)); + logger->write("ONNXRuntime backend thread " + Global::intToString(serverThreadIdx) + ":" + deviceStr() + " Model name: " + loadedModel->modelDesc.name); + } + + ComputeHandle* handle = new ComputeHandle(context, loadedModel, gpuIdxForThisThread); + + if(logger != NULL) { + if(context->onnxRuntimeExecutionProvider == "CUDA") { + logger->write("ONNXRuntime: CUDA backend"); + } + else if(context->onnxRuntimeExecutionProvider == "TensorRT") { + logger->write("ONNXRuntime: TensorRT backend"); + } + else if(context->onnxRuntimeExecutionProvider == "DirectML") { + logger->write("ONNXRuntime: DirectML backend"); + } + else if(context->onnxRuntimeExecutionProvider == "MIGraphX") { + logger->write("ONNXRuntime: MIGraphX backend"); + } + else { + throw StringError("Invalid ONNXRuntime backend"); + } + } + + return handle; +} + +void NeuralNet::freeComputeHandle(ComputeHandle* handle) { + delete handle; +} + +//------------------------------------------------------------------------------ + +void NeuralNet::printDevices() { + +} + +#if defined(USE_ORT_CUDA) || defined(USE_ORT_TENSORRT) +void NeuralNet::printCUDADevices() { + int numDevices = 0; + cudaGetDeviceCount(&numDevices); + for(int i = 0; i vAdapters; + + if(FAILED(CreateDXGIFactory(__uuidof(IDXGIFactory), (void**)&pFactory))) + { + throw StringError("Unable to create IDXGIFactory."); + } + + for(int i = 0; pFactory->EnumAdapters(i, &pAdapter) != DXGI_ERROR_NOT_FOUND; ++i) { + vAdapters.push_back(pAdapter); + } + + if(pFactory) { + pFactory->Release(); + } + + for(int i = 0; i < vAdapters.size(); i++) { + DXGI_ADAPTER_DESC adapterDescription; + 
vAdapters[i]->GetDesc(&adapterDescription); + wstring wsDeviceName(adapterDescription.Description); + string deviceName(wsDeviceName.begin(), wsDeviceName.end()); + if(deviceName != "Microsoft Basic Render Driver"){ + cout << "Found DirectML device " << i << ": " << deviceName.c_str() << endl; + } + } +} +#endif + +#ifdef USE_ORT_MIGRAPHX +void NeuralNet::printOpenCLDevices() { + vector devices = DeviceInfo::getAllDeviceInfosOnSystem(NULL); + for(int i = 0; imodelDesc; + + int xSize = nnXLen; + int ySize = nnYLen; + maxBatchSize = maxBatchSz; + + singleBinaryInputElts = (size_t)m.numInputChannels * xSize * ySize; + singleGlobalInputElts = (size_t)m.numInputGlobalChannels; + singlePolicyResultElts = (size_t)(1 + xSize * ySize); + singleValueResultElts = (size_t)m.numValueChannels; + singleScoreResultElts = (size_t)m.numScoreValueChannels; + singleOwnershipResultElts = (size_t)m.numOwnershipChannels * xSize * ySize; + + assert(NNModelVersion::getNumSpatialFeatures(m.version) == m.numInputChannels); + assert(NNModelVersion::getNumGlobalFeatures(m.version) == m.numInputGlobalChannels); + + userBinaryInputBuffer = new float[(size_t)maxBatchSize * singleBinaryInputElts]; + userGlobalInputBuffer = new float[(size_t)maxBatchSize * singleGlobalInputElts]; + policyResults = new float[(size_t)maxBatchSize * singlePolicyResultElts]; + valueResults = new float[(size_t)maxBatchSize * singleValueResultElts]; + scoreResults = new float[(size_t)maxBatchSize * singleScoreResultElts]; + ownershipResults = new float[(size_t)maxBatchSize * singleOwnershipResultElts]; + + userBinaryInputBufferFP16 = new uint16_t[(size_t)maxBatchSize * singleBinaryInputElts]; + userGlobalInputBufferFP16 = new uint16_t[(size_t)maxBatchSize * singleGlobalInputElts]; + policyResultsFP16 = new uint16_t[(size_t)maxBatchSize * singlePolicyResultElts]; + valueResultsFP16 = new uint16_t[(size_t)maxBatchSize * singleValueResultElts]; + scoreResultsFP16 = new uint16_t[(size_t)maxBatchSize * singleScoreResultElts]; + ownershipResultsFP16 = new uint16_t[(size_t)maxBatchSize * singleOwnershipResultElts]; + } + + void copyInputFloatToHalf(size_t batchSize) { + for(int i = 0; i < batchSize * singleBinaryInputElts; i++) { + userBinaryInputBufferFP16[i] = half_float::detail::float2half(userBinaryInputBuffer[i]); + } + for(int i = 0; i < batchSize * singleGlobalInputElts; i++) { + userGlobalInputBufferFP16[i] = half_float::detail::float2half(userGlobalInputBuffer[i]); + } + } + + void copyOutputHalfToFloat(size_t batchSize) { + for(int i = 0; i < batchSize * singlePolicyResultElts; i++) { + policyResults[i] = half_float::detail::half2float(policyResultsFP16[i]); + } + for(int i = 0; i < batchSize * singleValueResultElts; i++) { + valueResults[i] = half_float::detail::half2float(valueResultsFP16[i]); + } + for(int i = 0; i < batchSize * singleScoreResultElts; i++) { + scoreResults[i] = half_float::detail::half2float(scoreResultsFP16[i]); + } + for(int i = 0; i < batchSize * singleOwnershipResultElts; i++) { + ownershipResults[i] = half_float::detail::half2float(ownershipResultsFP16[i]); + } + } + + ~InputBuffers() { + delete[] userBinaryInputBuffer; + delete[] userGlobalInputBuffer; + delete[] policyResults; + delete[] valueResults; + delete[] scoreResults; + delete[] ownershipResults; + delete[] userBinaryInputBufferFP16; + delete[] userGlobalInputBufferFP16; + delete[] policyResultsFP16; + delete[] valueResultsFP16; + delete[] scoreResultsFP16; + delete[] ownershipResultsFP16; + } + + InputBuffers() = delete; + InputBuffers(const InputBuffers&) 
= delete; + InputBuffers& operator=(const InputBuffers&) = delete; +}; + +InputBuffers* NeuralNet::createInputBuffers(const LoadedModel* loadedModel, int maxBatchSize, int nnXLen, int nnYLen) { + return new InputBuffers(loadedModel,maxBatchSize,nnXLen,nnYLen); +} + +void NeuralNet::freeInputBuffers(InputBuffers* inputBuffers) { + delete inputBuffers; +} + +//------------------------------------------------------------------------------ + +void NeuralNet::getOutput( + ComputeHandle* gpuHandle, + InputBuffers* inputBuffers, + int numBatchEltsFilled, + NNResultBuf** inputBufs, + int symmetry, + vector& outputs +) { + assert(numBatchEltsFilled <= inputBuffers->maxBatchSize); + assert(numBatchEltsFilled > 0); + int batchSize = numBatchEltsFilled; + int nnXLen = gpuHandle->nnXLen; + int nnYLen = gpuHandle->nnYLen; + int version = gpuHandle->model->version; + bool usingFP16 = gpuHandle->usingFP16; + + int numSpatialFeatures = NNModelVersion::getNumSpatialFeatures(version); + int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(version); + assert(numSpatialFeatures == gpuHandle->model->numInputChannels); + assert(numSpatialFeatures * nnXLen * nnYLen == inputBuffers->singleBinaryInputElts); + assert(numGlobalFeatures == inputBuffers->singleGlobalInputElts); + + for(int nIdx = 0; nIdx < batchSize; nIdx++) { + float* rowSpatialInput = inputBuffers->userBinaryInputBuffer + (inputBuffers->singleBinaryInputElts * nIdx); + float* rowGlobalInput = inputBuffers->userGlobalInputBuffer + (inputBuffers->singleGlobalInputElts * nIdx); + + const float* rowGlobal = inputBufs[nIdx]->rowGlobal; + const float* rowSpatial = inputBufs[nIdx]->rowSpatial; + std::copy(rowGlobal, rowGlobal + numGlobalFeatures, rowGlobalInput); + SymmetryHelpers::copyInputsWithSymmetry(rowSpatial, rowSpatialInput, 1, nnYLen, nnXLen, numSpatialFeatures, false, symmetry); + } + + const int policySize = nnXLen * nnYLen + 1; + const int valueSize = gpuHandle->model->numValueChannels; + const int scoreSize = gpuHandle->model->numScoreValueChannels; + const int ownershipSize = nnXLen * nnYLen; + + assert(valueSize == 3); + assert(gpuHandle->model->numOwnershipChannels == 1); + + // input + vector> inputNodeShape(2); + vector inputNodeSizes(2); + vector inputTensors; + + for(int i = 0; i < 2; i++) { + Ort::TypeInfo typeInfo = gpuHandle->model->session->GetInputTypeInfo(i); + auto tensorInfo = typeInfo.GetTensorTypeAndShapeInfo(); + + // input node dimensions + inputNodeShape[i] = tensorInfo.GetShape(); + // This is -1, so should be manually assigned + inputNodeShape[i][0] = (int64_t)batchSize; + } + assert(inputNodeShape[0].size() == 4); + assert(inputNodeShape[0][1] == numSpatialFeatures); + // Dynamic input shape for onnx models without masking + inputNodeShape[0][2] = nnYLen; + inputNodeShape[0][3] = nnXLen; + assert(inputNodeShape[1].size() == 2); + assert(inputNodeShape[1][1] == numGlobalFeatures); + + inputNodeSizes[0] = (int64_t)(batchSize * inputBuffers->singleBinaryInputElts); + inputNodeSizes[1] = (int64_t)(batchSize * inputBuffers->singleGlobalInputElts); + + auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + + if(!usingFP16) { + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userBinaryInputBuffer, + inputNodeSizes[0], + inputNodeShape[0].data(), + inputNodeShape[0].size() + )); + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userGlobalInputBuffer, + inputNodeSizes[1], + inputNodeShape[1].data(), + inputNodeShape[1].size() 
+ )); + } + else { + inputBuffers->copyInputFloatToHalf(batchSize); + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userBinaryInputBufferFP16, + inputNodeSizes[0] * sizeof(uint16_t), + inputNodeShape[0].data(), + inputNodeShape[0].size(), + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 + )); + inputTensors.emplace_back(Ort::Value::CreateTensor( + memoryInfo, + inputBuffers->userGlobalInputBufferFP16, + inputNodeSizes[1] * sizeof(uint16_t), + inputNodeShape[1].data(), + inputNodeShape[1].size(), + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 + )); + } + + vector inputVector(inputNodeSizes[1]); + for(int i = 0; i < inputNodeSizes[1]; i++) { + inputVector[i] = (inputBuffers->userGlobalInputBuffer[i]); + } + + // Evaluate + auto outputTensors = gpuHandle->model->evaluate(inputTensors); + + // collect outputs to vectors + if(!usingFP16){ + auto policy = outputTensors[0].GetTensorMutableData(); + auto value = outputTensors[1].GetTensorMutableData(); + auto score= outputTensors[2].GetTensorMutableData(); + auto ownership = outputTensors[3].GetTensorMutableData(); + std::copy(policy, policy + batchSize * policySize, inputBuffers->policyResults); + std::copy(value, value + batchSize * valueSize, inputBuffers->valueResults); + std::copy(score, score + batchSize * scoreSize, inputBuffers->scoreResults); + std::copy(ownership, ownership + batchSize * ownershipSize, inputBuffers->ownershipResults); + } + else { + auto policy = outputTensors[0].GetTensorMutableData(); + auto value = outputTensors[1].GetTensorMutableData(); + auto score = outputTensors[2].GetTensorMutableData(); + auto ownership = outputTensors[3].GetTensorMutableData(); + std::copy(policy, policy + batchSize * policySize, inputBuffers->policyResultsFP16); + std::copy(value, value + batchSize * valueSize, inputBuffers->valueResultsFP16); + std::copy(score, score + batchSize * scoreSize, inputBuffers->scoreResultsFP16); + std::copy(ownership, ownership + batchSize * ownershipSize, inputBuffers->ownershipResultsFP16); + inputBuffers->copyOutputHalfToFloat(batchSize); + } + + for(int row = 0; row < batchSize; row++) { + NNOutput* output = outputs[row]; + assert(output->nnXLen == nnXLen); + assert(output->nnYLen == nnYLen); + + // Policy + const float* policySrcBuf = inputBuffers->policyResults + row * policySize; + float* policyProbs = output->policyProbs; + + //These are not actually correct, the client does the postprocessing to turn them into + //policy probabilities and white game outcome probabilities + //Also we don't fill in the nnHash here either + SymmetryHelpers::copyOutputsWithSymmetry(policySrcBuf, policyProbs, 1, nnYLen, nnXLen, symmetry); + policyProbs[policySize-1] = policySrcBuf[policySize-1]; + + // Value + output->whiteWinProb = inputBuffers->valueResults[row * valueSize]; + output->whiteLossProb = inputBuffers->valueResults[row * valueSize + 1]; + output->whiteNoResultProb = inputBuffers->valueResults[row * valueSize + 2]; + + // Score + if(version >= 8) { + assert(scoreSize == 4); + output->whiteScoreMean = inputBuffers->scoreResults[row * scoreSize]; + output->whiteScoreMeanSq = inputBuffers->scoreResults[row * scoreSize + 1]; + output->whiteLead = inputBuffers->scoreResults[row * scoreSize + 2]; + output->varTimeLeft = inputBuffers->scoreResults[row * scoreSize + 3]; + } + else if(version >= 4) { + assert(scoreSize== 2); + output->whiteScoreMean = inputBuffers->scoreResults[row * scoreSize]; + output->whiteScoreMeanSq = inputBuffers->scoreResults[row * scoreSize + 1]; + output->whiteLead = 
output->whiteScoreMean; + output->varTimeLeft = 0; + } + else if(version >= 3) { + assert(scoreSize == 1); + output->whiteScoreMean = inputBuffers->scoreResults[row * scoreSize]; + //Version 3 neural nets don't have any second moment output, implicitly already folding it in, so we just use the mean squared + output->whiteScoreMeanSq = output->whiteScoreMean * output->whiteScoreMean; + output->whiteLead = output->whiteScoreMean; + output->varTimeLeft = 0; + } + else { + ASSERT_UNREACHABLE; + } + + // Ownership + //As above, these are NOT actually from white's perspective, but rather the player to move. + //As usual the client does the postprocessing. + if(output->whiteOwnerMap != NULL) { + const float* ownershipSrcBuf = inputBuffers->ownershipResults + row * ownershipSize; + SymmetryHelpers::copyOutputsWithSymmetry(ownershipSrcBuf, output->whiteOwnerMap, 1, nnYLen, nnXLen, symmetry); + } + } +} + +//------------------------------------------------------------------------------ + +bool NeuralNet::testEvaluateConv( + const ConvLayerDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const vector& inputBuffer, + vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)outputBuffer; + return false; +} + +bool NeuralNet::testEvaluateBatchNorm( + const BatchNormLayerDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const vector& inputBuffer, + const vector& maskBuffer, + vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)maskBuffer; + (void)outputBuffer; + return false; +} + +bool NeuralNet::testEvaluateResidualBlock( + const ResidualBlockDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const std::vector& inputBuffer, + const std::vector& maskBuffer, + std::vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)maskBuffer; + (void)outputBuffer; + return false; +} + +bool NeuralNet::testEvaluateGlobalPoolingResidualBlock( + const GlobalPoolingResidualBlockDesc* desc, + int desiredBatchSize, + int nnXLen, + int nnYLen, + bool useFP16, + bool useNHWC, + const vector& inputBuffer, + const vector& maskBuffer, + vector& outputBuffer +) { + (void)desc; + (void)desiredBatchSize; + (void)nnXLen; + (void)nnYLen; + (void)useFP16; + (void)useNHWC; + (void)inputBuffer; + (void)maskBuffer; + (void)outputBuffer; + return false; +} + +#endif // USE_ONNXRUNTIME_BACKEND \ No newline at end of file diff --git a/cpp/program/gtpconfig.cpp b/cpp/program/gtpconfig.cpp index 1ad392c79..dc035254a 100644 --- a/cpp/program/gtpconfig.cpp +++ b/cpp/program/gtpconfig.cpp @@ -4,7 +4,7 @@ using namespace std; static const string gtpBase = R"%%( -# Logs and files-------------------------------------------------------------------------- +# Logs and files---------------------------------------------------------------- # Where to output log? logDir = gtp_logs # Each run of KataGo will log to a separate file in this dir @@ -18,7 +18,7 @@ logToStderr = false # Optionally override where KataGo will attempt to save things like openCLTuner files and other cached data. 
# homeDataDir = DIRECTORY -# Analysis------------------------------------------------------------------------------------ +# Analysis---------------------------------------------------------------------- # Configure the maximum length of analysis printed out by lz-analyze and other places. # Controls the number of moves after the first move in a variation. @@ -35,7 +35,7 @@ logToStderr = false # analysisWideRootNoise = 0.0 -# Default rules------------------------------------------------------------------------------------ +# Default rules----------------------------------------------------------------- # See https://lightvector.github.io/KataGo/rules.html for a description of the rules. # These rules are defaults and can be changed mid-run by several custom GTP commands. # See https://github.com/lightvector/KataGo/blob/master/docs/GTP_Extensions.md for those commands. @@ -52,9 +52,9 @@ logToStderr = false $$WHITE_HANDICAP_BONUS -# Bot behavior--------------------------------------------------------------------------------------- +# Bot behavior------------------------------------------------------------------ -# Resignation ------------- +# Resignation ------------------------- # Resignation occurs if for at least resignConsecTurns in a row, # the winLossUtility (which is on a [-1,1] scale) is below resignThreshold. @@ -64,7 +64,7 @@ resignConsecTurns = 3 # Uncomment to make katago not resign close games, behind by fewer than this many points # resignMinScoreDifference = 10 -# Handicap ------------- +# Handicap ---------------------------- # Assume that if black makes many moves in a row right at the start of the game, then the game is a handicap game. # This is necessary on some servers and for some GUIs and also when initializing from many SGF files, which may @@ -93,13 +93,13 @@ resignConsecTurns = 3 # playoutDoublingAdvantagePla = BLACK # playoutDoublingAdvantagePla = WHITE -# Misc Behavior -------------------- +# Misc Behavior ----------------------- # Uncomment and set to true to make KataGo avoid a particular joseki that some KataGo nets misevaluate, # and also to improve opening diversity versus some particular other bots that like to play it all the time. # avoidMYTDaggerHack = false -# Search limits----------------------------------------------------------------------------------- +# Search limits----------------------------------------------------------------- # For all of "maxVisits", "maxPlayouts", "maxTime", search will still try to follow GTP time controls and may make a move # faster than the specified max if GTP tells it that it is playing under a clock as well in the current game. @@ -128,7 +128,7 @@ searchFactorAfterTwoPass = 0.25 searchFactorWhenWinning = 0.40 searchFactorWhenWinningThreshold = 0.95 -# GPU Settings------------------------------------------------------------------------------- +# GPU Settings------------------------------------------------------------------ # Maximum number of positions to send to a single GPU at once. # The default value here is roughly equal to numSearchThreads, but you can specify it manually @@ -145,8 +145,14 @@ nnMutexPoolSizePowerOfTwo = $$NN_MUTEX_POOL_SIZE_POWER_OF_TWO $$MULTIPLE_GPUS +# ONNXRuntime Backend Settings-------------------------------------------------- +# Execution provider for the ONNXRuntime backend. 
+# Currently available options for this binary are: +# $$ONNXRUNTIME_AVAILABLE_EXECUTION_PROVIDERS +onnxOptModelFile = opt_model.onnx +onnxRuntimeExecutionProvider = $$ONNXRUNTIME_EXECUTION_PROVIDER -# Internal params------------------------------------------------------------------------------ +# Internal params--------------------------------------------------------------- # Uncomment and edit any of the below values to change them from their default. # How big to make the mutex pool for search synchronization @@ -161,6 +167,7 @@ string GTPConfig::makeConfig( int64_t maxPlayouts, double maxTime, double maxPonderTime, + string configOnnxRuntimeExecutionProvider, std::vector deviceIdxs, int nnCacheSizePowerOfTwo, int nnMutexPoolSizePowerOfTwo, @@ -214,6 +221,31 @@ string GTPConfig::makeConfig( replace("$$NN_CACHE_SIZE_POWER_OF_TWO", Global::intToString(nnCacheSizePowerOfTwo)); replace("$$NN_MUTEX_POOL_SIZE_POWER_OF_TWO", Global::intToString(nnMutexPoolSizePowerOfTwo)); +#ifdef USE_ONNXRUNTIME_BACKEND + vector availableExecutionProviders; + #ifdef USE_ORT_CUDA + availableExecutionProviders.push_back("CUDA"); + #endif + #ifdef USE_ORT_TENSORRT + availableExecutionProviders.push_back("TensorRT"); + #endif + #ifdef USE_ORT_DIRECTML + availableExecutionProviders.push_back("DirectML"); + #endif + #ifdef USE_ORT_MIGRAPHX + availableExecutionProviders.push_back("MIGraphX"); + #endif + string providers = ""; + for(int i = 0; i < availableExecutionProviders.size(); i++) { + providers += availableExecutionProviders[i]; + if(i < availableExecutionProviders.size() - 1){ + providers += ", "; + } + } + replace("$$ONNXRUNTIME_AVAILABLE_EXECUTION_PROVIDERS", providers); + replace("$$ONNXRUNTIME_EXECUTION_PROVIDER", configOnnxRuntimeExecutionProvider); +#endif + if(deviceIdxs.size() <= 0) { replace("$$MULTIPLE_GPUS", ""); } @@ -227,6 +259,9 @@ string GTPConfig::makeConfig( #endif #ifdef USE_OPENCL_BACKEND replacement += "openclDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n"; +#endif +#ifdef USE_ONNXRUNTIME_BACKEND + replacement += "onnxruntimeDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n"; #endif } replace("$$MULTIPLE_GPUS", replacement); diff --git a/cpp/program/gtpconfig.h b/cpp/program/gtpconfig.h index f70e329b5..4a290c5c6 100644 --- a/cpp/program/gtpconfig.h +++ b/cpp/program/gtpconfig.h @@ -11,6 +11,7 @@ namespace GTPConfig { int64_t maxPlayouts, double maxTime, double maxPonderTime, + std::string configOnnxRuntimeExecutionProvider, std::vector deviceIdxs, int nnCacheSizePowerOfTwo, int nnMutexPoolSizePowerOfTwo, diff --git a/cpp/program/setup.cpp b/cpp/program/setup.cpp index 58feba29a..7bcf1da27 100644 --- a/cpp/program/setup.cpp +++ b/cpp/program/setup.cpp @@ -52,6 +52,8 @@ vector Setup::initializeNNEvaluators( string backendPrefix = "opencl"; #elif defined(USE_EIGEN_BACKEND) string backendPrefix = "eigen"; + #elif defined(USE_ONNXRUNTIME_BACKEND) + string backendPrefix = "onnxruntime"; #else string backendPrefix = "dummybackend"; #endif @@ -64,6 +66,8 @@ vector Setup::initializeNNEvaluators( cfg.markAllKeysUsedWithPrefix("opencl"); if(backendPrefix != "eigen") cfg.markAllKeysUsedWithPrefix("eigen"); + if(backendPrefix != "onnxruntime") + cfg.markAllKeysUsedWithPrefix("onnxruntime"); if(backendPrefix != "dummybackend") cfg.markAllKeysUsedWithPrefix("dummybackend"); @@ -108,7 +112,7 @@ vector Setup::initializeNNEvaluators( requireExactNNLen = cfg.getBool("requireMaxBoardSize"); } - bool inputsUseNHWC = 
backendPrefix == "opencl" ? false : true; + bool inputsUseNHWC = (backendPrefix == "opencl" || backendPrefix == "onnxruntime") ? false : true; if(cfg.contains(backendPrefix+"InputsUseNHWC"+idxStr)) inputsUseNHWC = cfg.getBool(backendPrefix+"InputsUseNHWC"+idxStr); else if(cfg.contains("inputsUseNHWC"+idxStr)) @@ -206,6 +210,14 @@ vector Setup::initializeNNEvaluators( bool openCLReTunePerBoardSize = false; if(cfg.contains("openclReTunePerBoardSize")) openCLReTunePerBoardSize = cfg.getBool("openclReTunePerBoardSize"); + + string onnxOptModelFile; + if(cfg.contains("onnxOptModelFile")) + onnxOptModelFile = cfg.getString("onnxOptModelFile"); + + string onnxRuntimeExecutionProvider; + if(cfg.contains("onnxRuntimeExecutionProvider")) + onnxRuntimeExecutionProvider = cfg.getString("onnxRuntimeExecutionProvider"); enabled_t useFP16Mode = enabled_t::Auto; if(cfg.contains(backendPrefix+"UseFP16-"+idxStr)) @@ -294,6 +306,8 @@ vector Setup::initializeNNEvaluators( nnMutexPoolSizePowerOfTwo, debugSkipNeuralNet, openCLTunerFile, + onnxOptModelFile, + onnxRuntimeExecutionProvider, homeDataDirOverride, openCLReTunePerBoardSize, useFP16Mode, diff --git a/cpp/tests/testsearch.cpp b/cpp/tests/testsearch.cpp index 34071d9e0..2e33f9bd4 100644 --- a/cpp/tests/testsearch.cpp +++ b/cpp/tests/testsearch.cpp @@ -132,6 +132,8 @@ static NNEvaluator* startNNEval( bool openCLReTunePerBoardSize = false; const string& modelName = modelFile; const string openCLTunerFile = ""; + const string onnxOptModelFile = ""; + const string onnxRuntimeExecutionProvider = "DirectML"; const string homeDataDirOverride = ""; int numNNServerThreadsPerModel = 1; bool nnRandomize = false; @@ -156,6 +158,8 @@ static NNEvaluator* startNNEval( nnMutexPoolSizePowerOfTwo, debugSkipNeuralNet, openCLTunerFile, + onnxOptModelFile, + onnxRuntimeExecutionProvider, homeDataDirOverride, openCLReTunePerBoardSize, useFP16 ? enabled_t::True : enabled_t::False, diff --git a/cpp/tests/testtrainingwrite.cpp b/cpp/tests/testtrainingwrite.cpp index 856159db4..d588ccb38 100644 --- a/cpp/tests/testtrainingwrite.cpp +++ b/cpp/tests/testtrainingwrite.cpp @@ -24,6 +24,8 @@ static NNEvaluator* startNNEval( int nnMutexPoolSizePowerOfTwo = 12; bool debugSkipNeuralNet = modelFile == "/dev/null"; const string openCLTunerFile = ""; + const string onnxOptModelFile = ""; + const string onnxRuntimeExecutionProvider = ""; const string homeDataDirOverride = ""; bool openCLReTunePerBoardSize = false; int numNNServerThreadsPerModel = 1; @@ -43,6 +45,8 @@ static NNEvaluator* startNNEval( nnMutexPoolSizePowerOfTwo, debugSkipNeuralNet, openCLTunerFile, + onnxOptModelFile, + onnxRuntimeExecutionProvider, homeDataDirOverride, openCLReTunePerBoardSize, useFP16 ? enabled_t::True : enabled_t::False,