diff --git a/appveyor.yml b/appveyor.yml index bc351a5ca3..affe4d08bb 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,23 +2,24 @@ version: '{build}' configuration: Release platform: x64 image: -- Visual Studio 2017 +- Visual Studio 2019 environment: matrix: - NAME: gpu-nvidia-cudnn - NAME: gpu-nvidia-cuda - - NAME: gpu-dx12 - - NAME: gpu-opencl +# - NAME: gpu-dx12 +# - NAME: gpu-opencl - NAME: cpu-dnnl - NAME: cpu-openblas - - NAME: onednn +# - NAME: onednn - NAME: onnx-dml + - NAME: onnx-trt - NAME: android for: - matrix: only: - - NAME: gpu-opencl +# - NAME: gpu-opencl - NAME: cpu-dnnl skip_non_tags: true clone_folder: c:\projects\lc0 @@ -30,6 +31,7 @@ install: - cmd: set BLAS=false - cmd: set ONEDNN=false - cmd: set ONNX_DML=false +- cmd: set ONNX_TRT=false - cmd: set GTEST=false - cmd: set ANDROID=false - cmd: IF %NAME%==android set ANDROID=true @@ -43,11 +45,12 @@ install: - cmd: IF %NAME%==cpu-openblas set GTEST=true - cmd: IF %NAME%==onednn set ONEDNN=true - cmd: IF %NAME%==onnx-dml set ONNX_DML=true +- cmd: IF %NAME%==onnx-trt set ONNX_TRT=true - cmd: set NET=753723 - cmd: set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET=791556 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET_HASH=f404e156ceb2882470fd8c032b8754af0fa0b71168328912eaef14671a256e34 -- cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64 +#- cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64 - cmd: set DNNL_NAME=dnnl_win_1.5.0_cpu_vcomp - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/oneapi-src/oneDNN/releases/download/v1.5/dnnl_win_1.5.0_cpu_vcomp.zip - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_1.5.0_cpu_vcomp.zip -oC:\cache @@ -57,6 +60,9 @@ install: - cmd: IF %NAME%==onnx-dml IF NOT EXIST 
C:\cache\onnxruntime-win-x64-dml-1.13.1 appveyor DownloadFile https://github.com/borg323/onnxruntime/releases/download/v1.13.1/onnxruntime-win-x64-dml-1.13.1.zip - cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 7z x onnxruntime-win-x64-dml-1.13.1.zip -oC:\cache - cmd: IF %NAME%==onnx-dml set ONNX_NAME=onnxruntime-win-x64-dml-1.13.1 +- cmd: IF %NAME%==onnx-trt IF NOT EXIST C:\cache\onnxruntime-win-x64-gpu-1.22.0 appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/onnxruntime-win-x64-gpu-1.22.0.zip +- cmd: IF %NAME%==onnx-trt IF NOT EXIST C:\cache\onnxruntime-win-x64-gpu-1.22.0 7z x onnxruntime-win-x64-gpu-1.22.0.zip -oC:\cache +- cmd: IF %NAME%==onnx-trt set ONNX_NAME=onnxruntime-win-x64-gpu-1.22.0 - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS appveyor DownloadFile https://sjeng.org/ftp/OpenBLAS-0.3.3-win-oldthread.zip - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS 7z x OpenBLAS-0.3.3-win-oldthread.zip -oC:\cache\OpenBLAS - cmd: IF %OPENCL%==true nuget install opencl-nug -Version 0.777.77 -OutputDirectory C:\cache @@ -65,26 +71,26 @@ install: - cmd: IF %ISPC%==true IF NOT EXIST C:\cache\ispc-v1.13.0-windows appveyor DownloadFile https://github.com/ispc/ispc/releases/download/v1.13.0/ispc-v1.13.0-windows.zip - cmd: IF %ISPC%==true IF NOT EXIST C:\cache\ispc-v1.13.0-windows 7z x ispc-v1.13.0-windows.zip -oC:\cache\ispc-v1.13.0-windows - cmd: IF %ISPC%==true set PATH=C:\cache\ispc-v1.13.0-windows\bin;%PATH% -- cmd: set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0" +- cmd: set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1" - cmd: IF %CUDNN%==true IF NOT EXIST "%CUDA_PATH%\cuda" set CUDNN_INSTALL=1 -- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.nvidia.com/compute/cuda/10.0/Prod/network_installers/cuda_10.0.130_win10_network -- cmd: IF DEFINED CUDNN_INSTALL cuda_10.0.130_win10_network -s nvcc_10.0 
cublas_dev_10.0 cublas_10.0 cudart_10.0 -- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile http://developer.download.nvidia.com/compute/redist/cudnn/v7.4.2/cudnn-10.0-windows10-x64-v7.4.2.24.zip -- cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.0-windows10-x64-v7.4.2.24.zip -o"%CUDA_PATH%" -- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.243_win10_network.exe +- cmd: IF DEFINED CUDNN_INSTALL cuda_10.1.243_win10_network -s nvcc_10.1 cublas_dev_10.1 cublas_10.1 cudart_10.1 +- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/redist/cudnn/v7.5.1/cudnn-10.1-windows10-x64-v7.5.1.10.zip +- cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.1-windows10-x64-v7.5.1.10.zip -o"%CUDA_PATH%" +- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" - cmd: IF %CUDA%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 -- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe -- cmd: IF DEFINED CUDA_INSTALL cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL cuda_12.9.0_windows_network.exe -s nvcc_12.9 cublas_dev_12.9 cublas_12.9 cudart_12.9 documentation_12.9 - cmd: IF %CUDA%==true set PATH=%CUDA_PATH%\bin;%PATH% -- cmd: set PATH=C:\Python36;C:\Python36\scripts;%PATH% -- cmd: pip3 install --upgrade meson==0.55.3 -- cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.7.1 -- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" appveyor DownloadFile 
https://github.com/microsoft/mimalloc/archive/refs/tags/v1.7.1.zip -- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" 7z x v1.7.1.zip -oC:\cache\ -- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%"\out msbuild "%MIMALLOC_PATH%"\ide\vs2017\mimalloc-override.vcxproj /p:Configuration=Release /m -- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 appveyor DownloadFile https://dl.google.com/android/repository/android-ndk-r19c-windows-x86_64.zip -- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 7z x android-ndk-r19c-windows-x86_64.zip -oC:\ndk -- cmd: IF %NAME%==android set PATH=C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64\bin;%PATH% +- cmd: set PATH=C:\Python310;C:\Python310\scripts;%PATH% +#- cmd: pip3 install --upgrade meson==0.55.3 +- cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.8.7 +- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" appveyor DownloadFile https://github.com/microsoft/mimalloc/archive/refs/tags/v1.8.7.zip +- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" 7z x v1.8.7.zip -oC:\cache\ +- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%"\out msbuild "%MIMALLOC_PATH%"\ide\vs2019\mimalloc-override.vcxproj /p:Configuration=Release /m +- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 appveyor DownloadFile https://dl.google.com/android/repository/android-ndk-r27c-windows.zip +- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 7z x android-ndk-r27c-windows.zip -oC:\ndk +- cmd: IF %NAME%==android set PATH=C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64\bin;%PATH% - cmd: IF %NAME%==android sed "s/clang+*/&.cmd/" cross-files/aarch64-linux-android >crossfile-aarch64 - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 appveyor DownloadFile 
https://github.com/borg323/OpenBLAS/releases/download/android-0.3.27/openblas-android-aarch64.zip - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 7z x openblas-android-aarch64.zip -oC:\cache\OpenBLAS @@ -97,16 +103,19 @@ install: - cmd: touch -t 201801010000.00 c:\cache\%NET%.pb.gz - cmd: IF %GTEST%==true IF NOT EXIST C:\cache\syzygy mkdir C:\cache\syzygy - cmd: IF %GTEST%==true cd C:\cache\syzygy -- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK.rtb{w,z} -- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtb{w,z} -- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtb{w,z} +- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}vK.rtbz +- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtbz +- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtbz +- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}vK.rtbw +- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtbw +- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtbw - cmd: cd C:\projects\lc0 cache: - C:\cache - - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0' - - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1' + - 
'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1' + - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9' - C:\projects\lc0\subprojects\packagecache - - C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 + - C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 before_build: - cmd: git submodule update --init --recursive - cmd: IF %BLAS%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h @@ -124,7 +133,8 @@ before_build: - cmd: SET EXTRA= - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md - cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include -- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% +- cmd: IF %ONNX_TRT%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include +- cmd: IF %ANDROID%==false meson build --backend vs2019 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" 
-Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false build_script: diff --git a/cross-files/aarch64-linux-android b/cross-files/aarch64-linux-android index 4a55d838de..75e9e63de9 100644 --- a/cross-files/aarch64-linux-android +++ b/cross-files/aarch64-linux-android @@ -1,5 +1,5 @@ -# Tested with Android NDK r19c, default toolchain +# Tested with Android NDK r27c, default toolchain # Targeting API level 21 # Set the toolchain path on your environment @@ -17,8 +17,8 @@ cpp_link_args = ['-llog', '-static-libstdc++'] [binaries] c = 'aarch64-linux-android21-clang' cpp = 'aarch64-linux-android21-clang++' -ar = 'aarch64-linux-android-ar' -strip = 'aarch64-linux-android-strip' -ld = 'aarch64-linux-android-ld' -ranlib = 'aarch64-linux-android-ranlib' -as = 'aarch64-linux-android-as' +ar = 'llvm-ar' +strip = 'llvm-strip' +ld = 'ld' +ranlib = 'llvm-ranlib' +as = 'aarch64-linux-android21-clang' diff --git a/cross-files/armv7a-linux-android b/cross-files/armv7a-linux-android index 
16b3e93f90..3fed7aee8b 100644 --- a/cross-files/armv7a-linux-android +++ b/cross-files/armv7a-linux-android @@ -1,5 +1,5 @@ -# Tested with Android NDK r19c, default toolchain +# Tested with Android NDK r27c, default toolchain # Targeting API level 21 # When targeting API levels < 24 the build fails unless _FILE_OFFSET_BITS is unset. @@ -24,8 +24,8 @@ cpp_link_args = ['-llog', '-static-libstdc++'] [binaries] c = 'armv7a-linux-androideabi21-clang' cpp = 'armv7a-linux-androideabi21-clang++' -ar = 'arm-linux-androideabi-ar' -strip = 'arm-linux-androideabi-strip' -ld = 'arm-linux-androideabi-ld' -ranlib = 'arm-linux-androideabi-ranlib' -as = 'arm-linux-androideabi-as' +ar = 'llvm-ar' +strip = 'llvm-strip' +ld = 'ld' +ranlib = 'llvm-ranlib' +as = 'armv7a-linux-androideabi21-clang' diff --git a/dist/README-cuda.txt b/dist/README-cuda.txt index 8278cc53cb..4f35003cda 100644 --- a/dist/README-cuda.txt +++ b/dist/README-cuda.txt @@ -4,13 +4,16 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, specifically those of the LeelaChessZero project (https://lczero.org). -This binary uses CUDA and cuDNN dynamic link libraries copyrighted -by Nvidia corporation (http://www.nvidia.com), and redistributed as -permitted by the respective license file (see CUDA.txt section 2.2 -and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are -authorized to redistribute these libraries together with this -package as a whole but not individually. - +This binary uses CUDA dynamic link libraries copyrighted by Nvidia +corporation (http://www.nvidia.com), that can be redistributed as +permitted by the respective license file (see CUDA.txt section 2.2). +For size reasons you will have to get the required files by running +the included `install.cmd` script. If this fails you can get them by +downloading +<https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.37-archive.zip> and +<https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.0.13-archive.zip>, the required dynamic link libraries are in the respective `bin` +directories. 
You are authorized to redistribute these libraries +together with this package as a whole but not individually. License diff --git a/dist/README-cudnn.txt b/dist/README-cudnn.txt new file mode 100644 index 0000000000..8278cc53cb --- /dev/null +++ b/dist/README-cudnn.txt @@ -0,0 +1,38 @@ +Lc0 + +Lc0 is a UCI-compliant chess engine designed to play chess via +neural network, specifically those of the LeelaChessZero project +(https://lczero.org). + +This binary uses CUDA and cuDNN dynamic link libraries copyrighted +by Nvidia corporation (http://www.nvidia.com), and redistributed as +permitted by the respective license file (see CUDA.txt section 2.2 +and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are +authorized to redistribute these libraries together with this +package as a whole but not individually. + + +License + +Leela Chess is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Leela Chess is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Leela Chess. If not, see <http://www.gnu.org/licenses/>. + +Additional permission under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or +combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA +Toolkit and the NVIDIA CUDA Deep Neural Network library (or a +modified version of those libraries), containing parts covered by the +terms of the respective license agreement, the licensors of this +Program grant you additional permission to convey the resulting work. 
+ diff --git a/dist/install-cuda_12_9.cmd b/dist/install-cuda_12_9.cmd new file mode 100644 index 0000000000..e7bc785ff9 --- /dev/null +++ b/dist/install-cuda_12_9.cmd @@ -0,0 +1,41 @@ +@echo off +where /q tar +if errorlevel 1 goto error + +cd /d %~dp0 + +cls +echo Installing the CUDA dlls required by the Lc0 cuda backend. + +echo 1/4. Downloading cudart. +curl -# --ssl-no-revoke -o tmp_cudart.zip "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.37-archive.zip" +if errorlevel 1 goto error + +echo 2/4. Extracting files. +tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/bin/cudart64_12.dll >cudart64_12.dll +if errorlevel 1 goto error + +del /q tmp_cudart.zip + +echo 3/4. Downloading cublas. +curl -# --ssl-no-revoke -o tmp_cublas.zip "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.0.13-archive.zip" +if errorlevel 1 goto error + +echo 4/4. Extracting files. +tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublas64_12.dll >cublas64_12.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll +if errorlevel 1 goto error + +del /q tmp_cublas.zip + +echo Installation successful. +pause +exit /b + +:error +cls +echo Installation failed - see the README for an alternative approach. 
+pause + diff --git a/scripts/appveyor_android_build.cmd b/scripts/appveyor_android_build.cmd index 9f2f79665a..a9f3f01860 100644 --- a/scripts/appveyor_android_build.cmd +++ b/scripts/appveyor_android_build.cmd @@ -1,7 +1,7 @@ cd arm64-v8a ninja -aarch64-linux-android-strip lc0 +llvm-strip lc0 cd C:\projects\lc0 cd armeabi-v7a ninja -arm-linux-androideabi-strip lc0 +llvm-strip lc0 diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd index 36f98d8eef..f1b9ac0ed1 100644 --- a/scripts/appveyor_win_package.cmd +++ b/scripts/appveyor_win_package.cmd @@ -10,14 +10,13 @@ type "%MIMALLOC_PATH%"\LICENSE |more /P > dist\mimalloc-LICENSE 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%MIMALLOC_PATH%"\out\msvc-x64\Release\mimalloc-redirect.dll 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-readme.md 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-LICENSE -IF %CUDA%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip +IF %CUDNN%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip IF %NAME%==cpu-openblas 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\OpenBLAS\dist64\bin\libopenblas.dll IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.77\build\native\bin\OpenCL.dll IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_100.dll" "%CUDA_PATH%\bin\cublas64_100.dll" IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\cuda\bin\cudnn64_7.dll" -IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" 
"%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE @@ -39,8 +38,11 @@ IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_opencl.bat IF %DX%==true type scripts\check_dx.bat |more /P > dist\check_dx.bat IF %DX%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_dx.bat +IF %CUDA%==true IF %CUDNN%==false type dist\install-cuda_12_9.cmd |more /P > dist\install.cmd +IF %CUDA%==true IF %CUDNN%==false 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd IF %CUDA%==true copy "%CUDA_PATH%\EULA.txt" dist\CUDA.txt -IF %CUDA%==true type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDA%==true IF %CUDNN%==false type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDNN%==true type dist\README-cudnn.txt |more /P > dist\README.txt IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt .\dist\CUDA.txt IF %CUDNN%==true copy "%CUDA_PATH%\cuda\NVIDIA_SLA_cuDNN_Support.txt" dist\CUDNN.txt IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\CUDNN.txt diff --git a/src/mcts/params.cc b/src/mcts/params.cc index 5e65028583..394f941b08 100644 --- a/src/mcts/params.cc +++ b/src/mcts/params.cc @@ -518,7 +518,7 @@ void SearchParams::Populate(OptionsParser* options) { options->Add(kTemperatureCutoffMoveId, 0, 1000) = 0; options->Add(kTemperatureEndgameId, 0.0f, 100.0f) = 0.0f; options->Add(kTemperatureWinpctCutoffId, 0.0f, 100.0f) = 100.0f; - options->Add(kTemperatureVisitOffsetId, -1000.0f, 1000.0f) = + options->Add(kTemperatureVisitOffsetId, -1000.0f, 1000000.0f) = 0.0f; 
options->Add(kNoiseEpsilonId, 0.0f, 1.0f) = 0.0f; options->Add(kNoiseAlphaId, 0.0f, 10000000.0f) = 0.3f; diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc index 150787927b..645bc46825 100644 --- a/src/neural/onnx/network_onnx.cc +++ b/src/neural/onnx/network_onnx.cc @@ -46,6 +46,7 @@ #include "onnxruntime_cxx_api.h" #include "utils/bf16_utils.h" #include "utils/bititer.h" +#include "utils/commandline.h" #include "utils/exception.h" #include "utils/fp16_utils.h" #include "utils/logging.h" @@ -53,7 +54,7 @@ namespace lczero { namespace { -enum class OnnxProvider { CPU, CUDA, DML, ROCM }; +enum class OnnxProvider { CPU, CUDA, DML, ROCM, TRT }; class OnnxNetwork; @@ -83,8 +84,7 @@ class OnnxComputation : public NetworkComputation { class OnnxNetwork : public Network { public: OnnxNetwork(const WeightsFile& file, const OptionsDict& options, - OnnxProvider provider, int gpu, int threads, int batch_size, - int steps); + OnnxProvider provider); std::unique_ptr NewComputation() override { if (fp16_) { return std::make_unique>(this); @@ -103,6 +103,8 @@ class OnnxNetwork : public Network { } bool IsCpu() const override { return provider_ == OnnxProvider::CPU; } + Ort::SessionOptions GetOptions(int gpu, int threads, int batch_size); + Ort::Env onnx_env_; // Prepare sessions for this many multiples of the batch size; int steps_; @@ -123,8 +125,10 @@ class OnnxNetwork : public Network { bool bf16_; // The batch size to use, or -1 for variable. int batch_size_; + // The lower limit for variable batch size. + int min_batch_size_; static constexpr int max_batch_size_ = 1024; - // For conditional locking if running the DML provider. + // For conditional locking if running the DML/ROCM/TRT provider. 
OnnxProvider provider_; std::mutex lock_; }; @@ -259,8 +263,10 @@ Ort::Value OnnxComputation<DataType>::PrepareInputs(int start, int batch_size) { template <typename DataType> void OnnxComputation<DataType>::ComputeBlocking() { int batch_size = network_->batch_size_; - if (batch_size < 0) batch_size = raw_input_.size(); - + if (batch_size < 0) { + batch_size = std::max(static_cast<int>(raw_input_.size()), + network_->min_batch_size_); + } for (size_t i = 0; i < raw_input_.size();) { int step = (raw_input_.size() - i + batch_size - 1) / batch_size; if (step > network_->steps_) step = network_->steps_; @@ -272,7 +278,8 @@ void OnnxComputation<DataType>::ComputeBlocking() { // same to be true for the ROCm execution provider (at least for CNNs). // TODO: This may be a onnxruntime/ROCm bug, check onnxruntime 1.16 release. if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM) { + network_->provider_ == OnnxProvider::ROCM || + network_->provider_ == OnnxProvider::TRT) { network_->lock_.lock(); } network_->session_[step - 1].Run( @@ -280,15 +287,16 @@ void OnnxComputation<DataType>::ComputeBlocking() { network_->outputs_cstr_.data(), output_tensors_.data(), output_tensors_.size()); if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM) { + network_->provider_ == OnnxProvider::ROCM || + network_->provider_ == OnnxProvider::TRT) { network_->lock_.unlock(); } i += batch; } } -Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, - int batch_size) { +Ort::SessionOptions OnnxNetwork::GetOptions(int gpu, int threads, + int batch_size) { Ort::SessionOptions options; options.SetIntraOpNumThreads(threads); options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); @@ -301,7 +309,7 @@ Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, ->AddFreeDimensionOverrideByName(options, "batch", batch_size)); } - switch (provider) { + switch (provider_) { case OnnxProvider::DML: 
options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); options.DisableMemPattern(); @@ -312,6 +320,59 @@ Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, throw Exception("ONNX backend internal error."); #endif break; + case OnnxProvider::TRT: { + options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + + std::string cache_dir = CommandLine::BinaryDirectory() + "/trt_cache"; + std::map trt_options; + trt_options["device_id"] = std::to_string(gpu); + trt_options["trt_fp16_enable"] = fp16_ ? "1" : "0"; + trt_options["trt_int8_enable"] = "0"; + trt_options["trt_max_partition_iterations"] = "1000"; + trt_options["trt_min_subgraph_size"] = "1"; + trt_options["trt_engine_cache_enable"] = "1"; + trt_options["trt_engine_cache_prefix"] = + "Lc0_ONNX_TRT_batch_" + std::to_string(batch_size) + "_"; + trt_options["trt_engine_cache_path"] = cache_dir; + trt_options["trt_timing_cache_enable"] = "1"; + trt_options["trt_timing_cache_path"] = cache_dir; + trt_options["trt_layer_norm_fp32_fallback"] = "1"; + trt_options["trt_force_sequential_engine_build"] = "1"; + // Looks like we need I/O binding to enable this. 
+ // trt_options["trt_cuda_graph_enable"] = "1"; + if (batch_size < 0) { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(min_batch_size_) + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_) + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_ / 4) + "x112x8x8"; + } else { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size_) + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + + "x112x8x8"; + } + std::vector keys; + std::vector values; + for (const auto& [key, value] : trt_options) { + keys.push_back(key.c_str()); + values.push_back(value.c_str()); + } + + const auto& api = Ort::GetApi(); + OrtTensorRTProviderOptionsV2* trt_options_v2; + Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options_v2)); + Ort::ThrowOnError(api.UpdateTensorRTProviderOptions( + trt_options_v2, keys.data(), values.data(), keys.size())); + options.AppendExecutionProvider_TensorRT_V2(*trt_options_v2); + api.ReleaseTensorRTProviderOptions(trt_options_v2); + break; + } case OnnxProvider::ROCM: { OrtROCMProviderOptions rocm_options; rocm_options.device_id = gpu; @@ -338,30 +399,34 @@ Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, return options; } -OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict&, - OnnxProvider provider, int gpu, int threads, - int batch_size, int steps) +OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict& opts, + OnnxProvider provider) : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"), - steps_(steps), capabilities_{file.format().network_format().input(), file.format().network_format().output(), file.format().network_format().moves_left()}, 
fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16), bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16), - batch_size_(batch_size), provider_(provider) { + batch_size_ = + opts.GetOrDefault("batch", provider == OnnxProvider::DML ? 16 : -1); + steps_ = + opts.GetOrDefault("steps", provider == OnnxProvider::DML ? 4 : 1); + min_batch_size_ = opts.GetOrDefault( + "min_batch", provider == OnnxProvider::TRT ? 4 : 1); + int gpu = opts.GetOrDefault("gpu", 0); + int threads = + opts.GetOrDefault("threads", provider == OnnxProvider::CPU ? 1 : 0); + // Sanity checks. - if (batch_size_ < 0) steps_ = 1; - if (batch_size_ * steps > max_batch_size_) { + if (batch_size_ <= 0) { + batch_size_ = -1; // Variable batch size. + steps_ = 1; + } + if (batch_size_ * steps_ > max_batch_size_) { batch_size_ = max_batch_size_ / steps_; } - for (int step = 1; step <= steps_; step++) - session_.emplace_back( - onnx_env_, file.onnx_model().model().data(), - file.onnx_model().model().size(), - GetOptions(provider, gpu, threads, batch_size_ * step)); - const auto& md = file.onnx_model(); if (!md.has_input_planes()) { throw Exception("NN doesn't have input planes defined."); @@ -391,6 +456,11 @@ OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict&, std::transform(outputs_.begin(), outputs_.end(), std::back_inserter(outputs_cstr_), [](const auto& x) { return x.c_str(); }); + + for (int step = 1; step <= steps_; step++) + session_.emplace_back(onnx_env_, file.onnx_model().model().data(), + file.onnx_model().model().size(), + GetOptions(gpu, threads, batch_size_ * step)); } template @@ -398,22 +468,8 @@ std::unique_ptr MakeOnnxNetwork(const std::optional& w, const OptionsDict& opts) { if (!w) throw Exception("The ONNX backend requires a network file."); - int gpu = opts.GetOrDefault("gpu", 0); - - int batch_size = - opts.GetOrDefault("batch", kProvider == OnnxProvider::DML ? 
16 : -1); - - int steps = - opts.GetOrDefault("steps", kProvider == OnnxProvider::DML ? 4 : 1); - - int threads = - opts.GetOrDefault("threads", kProvider == OnnxProvider::CPU ? 1 : 0); - - if (batch_size <= 0) batch_size = -1; // Variable batch size. - if (w->has_onnx_model()) { - return std::make_unique(*w, opts, kProvider, gpu, threads, - batch_size, steps); + return std::make_unique(*w, opts, kProvider); } else { WeightsToOnnxConverterOptions converter_options; converter_options.opset = opts.GetOrDefault("opset", 17); @@ -428,20 +484,18 @@ std::unique_ptr MakeOnnxNetwork(const std::optional& w, opts.GetOrDefault("value_head", "winner"); std::string datatype; - if (opts.IsDefault("datatype")) { + if (opts.Exists("datatype")) { + datatype = opts.Get("datatype"); + } else { bool fp16 = opts.GetOrDefault( "fp16", kProvider == OnnxProvider::CPU ? false : true); datatype = fp16 ? "f16" : "f32"; - } else { - datatype = opts.Get("datatype"); } converter_options.data_type = WeightsToOnnxConverterOptions::StringToDataType(datatype); - converter_options.relax_op_types = false; auto converted = ConvertWeightsToOnnx(*w, converter_options); - return std::make_unique(converted, opts, kProvider, gpu, - threads, batch_size, steps); + return std::make_unique(converted, opts, kProvider); } } @@ -451,6 +505,7 @@ REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork, 64) #ifdef USE_DML REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork, 63) #endif +REGISTER_NETWORK("onnx-trt", MakeOnnxNetwork, 60) REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork, 61) REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork, 62)