diff --git a/appveyor.yml b/appveyor.yml index bc351a5ca3..affe4d08bb 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -2,23 +2,24 @@ version: '{build}' configuration: Release platform: x64 image: -- Visual Studio 2017 +- Visual Studio 2019 environment: matrix: - NAME: gpu-nvidia-cudnn - NAME: gpu-nvidia-cuda - - NAME: gpu-dx12 - - NAME: gpu-opencl +# - NAME: gpu-dx12 +# - NAME: gpu-opencl - NAME: cpu-dnnl - NAME: cpu-openblas - - NAME: onednn +# - NAME: onednn - NAME: onnx-dml + - NAME: onnx-trt - NAME: android for: - matrix: only: - - NAME: gpu-opencl +# - NAME: gpu-opencl - NAME: cpu-dnnl skip_non_tags: true clone_folder: c:\projects\lc0 @@ -30,6 +31,7 @@ install: - cmd: set BLAS=false - cmd: set ONEDNN=false - cmd: set ONNX_DML=false +- cmd: set ONNX_TRT=false - cmd: set GTEST=false - cmd: set ANDROID=false - cmd: IF %NAME%==android set ANDROID=true @@ -43,11 +45,12 @@ install: - cmd: IF %NAME%==cpu-openblas set GTEST=true - cmd: IF %NAME%==onednn set ONEDNN=true - cmd: IF %NAME%==onnx-dml set ONNX_DML=true +- cmd: IF %NAME%==onnx-trt set ONNX_TRT=true - cmd: set NET=753723 - cmd: set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET=791556 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET_HASH=f404e156ceb2882470fd8c032b8754af0fa0b71168328912eaef14671a256e34 -- cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64 +#- cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64 - cmd: set DNNL_NAME=dnnl_win_1.5.0_cpu_vcomp - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/oneapi-src/oneDNN/releases/download/v1.5/dnnl_win_1.5.0_cpu_vcomp.zip - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_1.5.0_cpu_vcomp.zip -oC:\cache @@ -57,6 +60,9 @@ install: - cmd: IF %NAME%==onnx-dml IF NOT EXIST 
C:\cache\onnxruntime-win-x64-dml-1.13.1 appveyor DownloadFile https://github.com/borg323/onnxruntime/releases/download/v1.13.1/onnxruntime-win-x64-dml-1.13.1.zip - cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 7z x onnxruntime-win-x64-dml-1.13.1.zip -oC:\cache - cmd: IF %NAME%==onnx-dml set ONNX_NAME=onnxruntime-win-x64-dml-1.13.1 +- cmd: IF %NAME%==onnx-trt IF NOT EXIST C:\cache\onnxruntime-win-x64-gpu-1.22.0 appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/onnxruntime-win-x64-gpu-1.22.0.zip +- cmd: IF %NAME%==onnx-trt IF NOT EXIST C:\cache\onnxruntime-win-x64-gpu-1.22.0 7z x onnxruntime-win-x64-gpu-1.22.0.zip -oC:\cache +- cmd: IF %NAME%==onnx-trt set ONNX_NAME=onnxruntime-win-x64-gpu-1.22.0 - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS appveyor DownloadFile https://sjeng.org/ftp/OpenBLAS-0.3.3-win-oldthread.zip - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS 7z x OpenBLAS-0.3.3-win-oldthread.zip -oC:\cache\OpenBLAS - cmd: IF %OPENCL%==true nuget install opencl-nug -Version 0.777.77 -OutputDirectory C:\cache @@ -65,26 +71,26 @@ install: - cmd: IF %ISPC%==true IF NOT EXIST C:\cache\ispc-v1.13.0-windows appveyor DownloadFile https://github.com/ispc/ispc/releases/download/v1.13.0/ispc-v1.13.0-windows.zip - cmd: IF %ISPC%==true IF NOT EXIST C:\cache\ispc-v1.13.0-windows 7z x ispc-v1.13.0-windows.zip -oC:\cache\ispc-v1.13.0-windows - cmd: IF %ISPC%==true set PATH=C:\cache\ispc-v1.13.0-windows\bin;%PATH% -- cmd: set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0" +- cmd: set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1" - cmd: IF %CUDNN%==true IF NOT EXIST "%CUDA_PATH%\cuda" set CUDNN_INSTALL=1 -- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.nvidia.com/compute/cuda/10.0/Prod/network_installers/cuda_10.0.130_win10_network -- cmd: IF DEFINED CUDNN_INSTALL cuda_10.0.130_win10_network -s nvcc_10.0 
cublas_dev_10.0 cublas_10.0 cudart_10.0 -- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile http://developer.download.nvidia.com/compute/redist/cudnn/v7.4.2/cudnn-10.0-windows10-x64-v7.4.2.24.zip -- cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.0-windows10-x64-v7.4.2.24.zip -o"%CUDA_PATH%" -- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/10.1/Prod/network_installers/cuda_10.1.243_win10_network.exe +- cmd: IF DEFINED CUDNN_INSTALL cuda_10.1.243_win10_network -s nvcc_10.1 cublas_dev_10.1 cublas_10.1 cudart_10.1 +- cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/redist/cudnn/v7.5.1/cudnn-10.1-windows10-x64-v7.5.1.10.zip +- cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.1-windows10-x64-v7.5.1.10.zip -o"%CUDA_PATH%" +- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" - cmd: IF %CUDA%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 -- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe -- cmd: IF DEFINED CUDA_INSTALL cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL cuda_12.9.0_windows_network.exe -s nvcc_12.9 cublas_dev_12.9 cublas_12.9 cudart_12.9 documentation_12.9 - cmd: IF %CUDA%==true set PATH=%CUDA_PATH%\bin;%PATH% -- cmd: set PATH=C:\Python36;C:\Python36\scripts;%PATH% -- cmd: pip3 install --upgrade meson==0.55.3 -- cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.7.1 -- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" appveyor DownloadFile 
https://github.com/microsoft/mimalloc/archive/refs/tags/v1.7.1.zip -- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" 7z x v1.7.1.zip -oC:\cache\ -- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%"\out msbuild "%MIMALLOC_PATH%"\ide\vs2017\mimalloc-override.vcxproj /p:Configuration=Release /m -- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 appveyor DownloadFile https://dl.google.com/android/repository/android-ndk-r19c-windows-x86_64.zip -- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 7z x android-ndk-r19c-windows-x86_64.zip -oC:\ndk -- cmd: IF %NAME%==android set PATH=C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64\bin;%PATH% +- cmd: set PATH=C:\Python310;C:\Python310\scripts;%PATH% +#- cmd: pip3 install --upgrade meson==0.55.3 +- cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.8.7 +- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" appveyor DownloadFile https://github.com/microsoft/mimalloc/archive/refs/tags/v1.8.7.zip +- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%" 7z x v1.8.7.zip -oC:\cache\ +- cmd: IF %ANDROID%==false IF NOT EXIST "%MIMALLOC_PATH%"\out msbuild "%MIMALLOC_PATH%"\ide\vs2019\mimalloc-override.vcxproj /p:Configuration=Release /m +- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 appveyor DownloadFile https://dl.google.com/android/repository/android-ndk-r27c-windows.zip +- cmd: IF %NAME%==android IF NOT EXIST C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 7z x android-ndk-r27c-windows.zip -oC:\ndk +- cmd: IF %NAME%==android set PATH=C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64\bin;%PATH% - cmd: IF %NAME%==android sed "s/clang+*/&.cmd/" cross-files/aarch64-linux-android >crossfile-aarch64 - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 appveyor DownloadFile 
https://github.com/borg323/OpenBLAS/releases/download/android-0.3.27/openblas-android-aarch64.zip - cmd: IF %NAME%==android IF NOT EXIST C:\cache\OpenBLAS\android-aarch64 7z x openblas-android-aarch64.zip -oC:\cache\OpenBLAS @@ -97,16 +103,19 @@ install: - cmd: touch -t 201801010000.00 c:\cache\%NET%.pb.gz - cmd: IF %GTEST%==true IF NOT EXIST C:\cache\syzygy mkdir C:\cache\syzygy - cmd: IF %GTEST%==true cd C:\cache\syzygy -- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK.rtb{w,z} -- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtb{w,z} -- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtb{w,z} +- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}vK.rtbz +- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtbz +- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-dtz/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtbz +- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}vK.rtbw +- cmd: IF %GTEST%==true IF NOT EXIST KQQvK.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}{P,N,R,B,Q}vK.rtbw +- cmd: IF %GTEST%==true IF NOT EXIST KQvKQ.rtbw curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5-wdl/K{P,N,R,B,Q}vK{P,N,R,B,Q}.rtbw - cmd: cd C:\projects\lc0 cache: - C:\cache - - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0' - - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1' + - 
'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1' + - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9' - C:\projects\lc0\subprojects\packagecache - - C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64 + - C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 before_build: - cmd: git submodule update --init --recursive - cmd: IF %BLAS%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h @@ -124,7 +133,8 @@ before_build: - cmd: SET EXTRA= - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md - cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include -- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% +- cmd: IF %ONNX_TRT%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include +- cmd: IF %ANDROID%==false meson build --backend vs2019 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" 
-Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false build_script: diff --git a/cross-files/aarch64-linux-android b/cross-files/aarch64-linux-android index 4a55d838de..75e9e63de9 100644 --- a/cross-files/aarch64-linux-android +++ b/cross-files/aarch64-linux-android @@ -1,5 +1,5 @@ -# Tested with Android NDK r19c, default toolchain +# Tested with Android NDK r27c, default toolchain # Targeting API level 21 # Set the toolchain path on your environment @@ -17,8 +17,8 @@ cpp_link_args = ['-llog', '-static-libstdc++'] [binaries] c = 'aarch64-linux-android21-clang' cpp = 'aarch64-linux-android21-clang++' -ar = 'aarch64-linux-android-ar' -strip = 'aarch64-linux-android-strip' -ld = 'aarch64-linux-android-ld' -ranlib = 'aarch64-linux-android-ranlib' -as = 'aarch64-linux-android-as' +ar = 'llvm-ar' +strip = 'llvm-strip' +ld = 'ld' +ranlib = 'llvm-ranlib' +as = 'aarch64-linux-android21-clang' diff --git a/cross-files/armv7a-linux-android b/cross-files/armv7a-linux-android index 
16b3e93f90..3fed7aee8b 100644 --- a/cross-files/armv7a-linux-android +++ b/cross-files/armv7a-linux-android @@ -1,5 +1,5 @@ -# Tested with Android NDK r19c, default toolchain +# Tested with Android NDK r27c, default toolchain # Targeting API level 21 # When targeting API levels < 24 the build fails unless _FILE_OFFSET_BITS is unset. @@ -24,8 +24,8 @@ cpp_link_args = ['-llog', '-static-libstdc++'] [binaries] c = 'armv7a-linux-androideabi21-clang' cpp = 'armv7a-linux-androideabi21-clang++' -ar = 'arm-linux-androideabi-ar' -strip = 'arm-linux-androideabi-strip' -ld = 'arm-linux-androideabi-ld' -ranlib = 'arm-linux-androideabi-ranlib' -as = 'arm-linux-androideabi-as' +ar = 'llvm-ar' +strip = 'llvm-strip' +ld = 'ld' +ranlib = 'llvm-ranlib' +as = 'armv7a-linux-androideabi21-clang' diff --git a/dist/README-cuda.txt b/dist/README-cuda.txt index 8278cc53cb..4f35003cda 100644 --- a/dist/README-cuda.txt +++ b/dist/README-cuda.txt @@ -4,13 +4,16 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, specifically those of the LeelaChessZero project (https://lczero.org). -This binary uses CUDA and cuDNN dynamic link libraries copyrighted -by Nvidia corporation (http://www.nvidia.com), and redistributed as -permitted by the respective license file (see CUDA.txt section 2.2 -and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are -authorized to redistribute these libraries together with this -package as a whole but not individually. - +This binary uses CUDA dynamic link libraries copyrighted by Nvidia +corporation (http://www.nvidia.com), that can be redistributed as +permitted by the respective license file (see CUDA.txt section 2.2). +For size reasons you will have to get the required files by running +the included `install.cmd` script. If this fails you can get them by +downloading +<https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.37-archive.zip> and +<https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.0.13-archive.zip>, the required dynamic link libraries are in the respective `bin` +directories. 
You are authorized to redistribute these libraries +together with this package as a whole but not individually. License diff --git a/dist/README-cudnn.txt b/dist/README-cudnn.txt new file mode 100644 index 0000000000..8278cc53cb --- /dev/null +++ b/dist/README-cudnn.txt @@ -0,0 +1,38 @@ +Lc0 + +Lc0 is a UCI-compliant chess engine designed to play chess via +neural network, specifically those of the LeelaChessZero project +(https://lczero.org). + +This binary uses CUDA and cuDNN dynamic link libraries copyrighted +by Nvidia corporation (http://www.nvidia.com), and redistributed as +permitted by the respective license file (see CUDA.txt section 2.2 +and CUDNN.txt section "CUDNN DISTRIBUTION" for details). You are +authorized to redistribute these libraries together with this +package as a whole but not individually. + + +License + +Leela Chess is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Leela Chess is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with Leela Chess. If not, see <http://www.gnu.org/licenses/>. + +Additional permission under GNU GPL version 3 section 7 + +If you modify this Program, or any covered work, by linking or +combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA +Toolkit and the NVIDIA CUDA Deep Neural Network library (or a +modified version of those libraries), containing parts covered by the +terms of the respective license agreement, the licensors of this +Program grant you additional permission to convey the resulting work. 
+ diff --git a/dist/install-cuda_12_9.cmd b/dist/install-cuda_12_9.cmd new file mode 100644 index 0000000000..e7bc785ff9 --- /dev/null +++ b/dist/install-cuda_12_9.cmd @@ -0,0 +1,41 @@ +@echo off +where /q tar +if errorlevel 1 goto error + +cd /d %~dp0 + +cls +echo Installing the CUDA dlls required by the Lc0 cuda backend. + +echo 1/4. Downloading cudart. +curl -# --ssl-no-revoke -o tmp_cudart.zip "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.37-archive.zip" +if errorlevel 1 goto error + +echo 2/4. Extracting files. +tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/bin/cudart64_12.dll >cudart64_12.dll +if errorlevel 1 goto error + +del /q tmp_cudart.zip + +echo 3/4. Downloading cublas. +curl -# --ssl-no-revoke -o tmp_cublas.zip "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.0.13-archive.zip" +if errorlevel 1 goto error + +echo 4/4. Extracting files. +tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublas64_12.dll >cublas64_12.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll +if errorlevel 1 goto error + +del /q tmp_cublas.zip + +echo Installation successful. +pause +exit /b + +:error +cls +echo Installation failed - see the README for an alternative approach. 
+pause + diff --git a/scripts/appveyor_android_build.cmd b/scripts/appveyor_android_build.cmd index 9f2f79665a..a9f3f01860 100644 --- a/scripts/appveyor_android_build.cmd +++ b/scripts/appveyor_android_build.cmd @@ -1,7 +1,7 @@ cd arm64-v8a ninja -aarch64-linux-android-strip lc0 +llvm-strip lc0 cd C:\projects\lc0 cd armeabi-v7a ninja -arm-linux-androideabi-strip lc0 +llvm-strip lc0 diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd index 36f98d8eef..f1b9ac0ed1 100644 --- a/scripts/appveyor_win_package.cmd +++ b/scripts/appveyor_win_package.cmd @@ -10,14 +10,13 @@ type "%MIMALLOC_PATH%"\LICENSE |more /P > dist\mimalloc-LICENSE 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%MIMALLOC_PATH%"\out\msvc-x64\Release\mimalloc-redirect.dll 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-readme.md 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\mimalloc-LICENSE -IF %CUDA%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip +IF %CUDNN%==true copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip IF %NAME%==cpu-openblas 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\OpenBLAS\dist64\bin\libopenblas.dll IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\%DNNL_NAME%\bin\dnnl.dll IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.77\build\native\bin\OpenCL.dll IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_100.dll" "%CUDA_PATH%\bin\cublas64_100.dll" IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\cuda\bin\cudnn64_7.dll" -IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" 
"%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE @@ -39,8 +38,11 @@ IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_opencl.bat IF %DX%==true type scripts\check_dx.bat |more /P > dist\check_dx.bat IF %DX%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_dx.bat +IF %CUDA%==true IF %CUDNN%==false type dist\install-cuda_12_9.cmd |more /P > dist\install.cmd +IF %CUDA%==true IF %CUDNN%==false 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd IF %CUDA%==true copy "%CUDA_PATH%\EULA.txt" dist\CUDA.txt -IF %CUDA%==true type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDA%==true IF %CUDNN%==false type dist\README-cuda.txt |more /P > dist\README.txt +IF %CUDNN%==true type dist\README-cudnn.txt |more /P > dist\README.txt IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt .\dist\CUDA.txt IF %CUDNN%==true copy "%CUDA_PATH%\cuda\NVIDIA_SLA_cuDNN_Support.txt" dist\CUDNN.txt IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\CUDNN.txt diff --git a/src/mcts/params.cc b/src/mcts/params.cc index 5e65028583..394f941b08 100644 --- a/src/mcts/params.cc +++ b/src/mcts/params.cc @@ -518,7 +518,7 @@ void SearchParams::Populate(OptionsParser* options) { options->Add(kTemperatureCutoffMoveId, 0, 1000) = 0; options->Add(kTemperatureEndgameId, 0.0f, 100.0f) = 0.0f; options->Add(kTemperatureWinpctCutoffId, 0.0f, 100.0f) = 100.0f; - options->Add(kTemperatureVisitOffsetId, -1000.0f, 1000.0f) = + options->Add(kTemperatureVisitOffsetId, -1000.0f, 1000000.0f) = 0.0f; 
options->Add(kNoiseEpsilonId, 0.0f, 1.0f) = 0.0f; options->Add(kNoiseAlphaId, 0.0f, 10000000.0f) = 0.3f; diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc index 150787927b..645bc46825 100644 --- a/src/neural/onnx/network_onnx.cc +++ b/src/neural/onnx/network_onnx.cc @@ -46,6 +46,7 @@ #include "onnxruntime_cxx_api.h" #include "utils/bf16_utils.h" #include "utils/bititer.h" +#include "utils/commandline.h" #include "utils/exception.h" #include "utils/fp16_utils.h" #include "utils/logging.h" @@ -53,7 +54,7 @@ namespace lczero { namespace { -enum class OnnxProvider { CPU, CUDA, DML, ROCM }; +enum class OnnxProvider { CPU, CUDA, DML, ROCM, TRT }; class OnnxNetwork; @@ -83,8 +84,7 @@ class OnnxComputation : public NetworkComputation { class OnnxNetwork : public Network { public: OnnxNetwork(const WeightsFile& file, const OptionsDict& options, - OnnxProvider provider, int gpu, int threads, int batch_size, - int steps); + OnnxProvider provider); std::unique_ptr NewComputation() override { if (fp16_) { return std::make_unique>(this); @@ -103,6 +103,8 @@ class OnnxNetwork : public Network { } bool IsCpu() const override { return provider_ == OnnxProvider::CPU; } + Ort::SessionOptions GetOptions(int gpu, int threads, int batch_size); + Ort::Env onnx_env_; // Prepare sessions for this many multiples of the batch size; int steps_; @@ -123,8 +125,10 @@ class OnnxNetwork : public Network { bool bf16_; // The batch size to use, or -1 for variable. int batch_size_; + // The lower limit for variable batch size. + int min_batch_size_; static constexpr int max_batch_size_ = 1024; - // For conditional locking if running the DML provider. + // For conditional locking if running the DML/ROCM/TRT provider. 
OnnxProvider provider_; std::mutex lock_; }; @@ -259,8 +263,10 @@ Ort::Value OnnxComputation<DataType>::PrepareInputs(int start, int batch_size) { template <typename DataType> void OnnxComputation<DataType>::ComputeBlocking() { int batch_size = network_->batch_size_; - if (batch_size < 0) batch_size = raw_input_.size(); - + if (batch_size < 0) { + batch_size = std::max(static_cast<int>(raw_input_.size()), + network_->min_batch_size_); + } for (size_t i = 0; i < raw_input_.size();) { int step = (raw_input_.size() - i + batch_size - 1) / batch_size; if (step > network_->steps_) step = network_->steps_; @@ -272,7 +278,8 @@ void OnnxComputation<DataType>::ComputeBlocking() { // same to be true for the ROCm execution provider (at least for CNNs). // TODO: This may be a onnxruntime/ROCm bug, check onnxruntime 1.16 release. if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM) { + network_->provider_ == OnnxProvider::ROCM || + network_->provider_ == OnnxProvider::TRT) { network_->lock_.lock(); } network_->session_[step - 1].Run( @@ -280,15 +287,16 @@ void OnnxComputation<DataType>::ComputeBlocking() { network_->outputs_cstr_.data(), output_tensors_.data(), output_tensors_.size()); if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM) { + network_->provider_ == OnnxProvider::ROCM || + network_->provider_ == OnnxProvider::TRT) { network_->lock_.unlock(); } i += batch; } } -Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, - int batch_size) { +Ort::SessionOptions OnnxNetwork::GetOptions(int gpu, int threads, + int batch_size) { Ort::SessionOptions options; options.SetIntraOpNumThreads(threads); options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); @@ -301,7 +309,7 @@ Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, ->AddFreeDimensionOverrideByName(options, "batch", batch_size)); } - switch (provider) { + switch (provider_) { case OnnxProvider::DML: 
options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); options.DisableMemPattern(); @@ -312,6 +320,59 @@ Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, throw Exception("ONNX backend internal error."); #endif break; + case OnnxProvider::TRT: { + options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + + std::string cache_dir = CommandLine::BinaryDirectory() + "/trt_cache"; + std::map trt_options; + trt_options["device_id"] = std::to_string(gpu); + trt_options["trt_fp16_enable"] = fp16_ ? "1" : "0"; + trt_options["trt_int8_enable"] = "0"; + trt_options["trt_max_partition_iterations"] = "1000"; + trt_options["trt_min_subgraph_size"] = "1"; + trt_options["trt_engine_cache_enable"] = "1"; + trt_options["trt_engine_cache_prefix"] = + "Lc0_ONNX_TRT_batch_" + std::to_string(batch_size) + "_"; + trt_options["trt_engine_cache_path"] = cache_dir; + trt_options["trt_timing_cache_enable"] = "1"; + trt_options["trt_timing_cache_path"] = cache_dir; + trt_options["trt_layer_norm_fp32_fallback"] = "1"; + trt_options["trt_force_sequential_engine_build"] = "1"; + // Looks like we need I/O binding to enable this. 
+ // trt_options["trt_cuda_graph_enable"] = "1"; + if (batch_size < 0) { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(min_batch_size_) + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_) + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_ / 4) + "x112x8x8"; + } else { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size_) + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + + "x112x8x8"; + } + std::vector keys; + std::vector values; + for (const auto& [key, value] : trt_options) { + keys.push_back(key.c_str()); + values.push_back(value.c_str()); + } + + const auto& api = Ort::GetApi(); + OrtTensorRTProviderOptionsV2* trt_options_v2; + Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options_v2)); + Ort::ThrowOnError(api.UpdateTensorRTProviderOptions( + trt_options_v2, keys.data(), values.data(), keys.size())); + options.AppendExecutionProvider_TensorRT_V2(*trt_options_v2); + api.ReleaseTensorRTProviderOptions(trt_options_v2); + break; + } case OnnxProvider::ROCM: { OrtROCMProviderOptions rocm_options; rocm_options.device_id = gpu; @@ -338,30 +399,34 @@ Ort::SessionOptions GetOptions(OnnxProvider provider, int gpu, int threads, return options; } -OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict&, - OnnxProvider provider, int gpu, int threads, - int batch_size, int steps) +OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict& opts, + OnnxProvider provider) : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"), - steps_(steps), capabilities_{file.format().network_format().input(), file.format().network_format().output(), file.format().network_format().moves_left()}, 
fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16), bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16), - batch_size_(batch_size), provider_(provider) { + batch_size_ = + opts.GetOrDefault("batch", provider == OnnxProvider::DML ? 16 : -1); + steps_ = + opts.GetOrDefault("steps", provider == OnnxProvider::DML ? 4 : 1); + min_batch_size_ = opts.GetOrDefault( + "min_batch", provider == OnnxProvider::TRT ? 4 : 1); + int gpu = opts.GetOrDefault("gpu", 0); + int threads = + opts.GetOrDefault("threads", provider == OnnxProvider::CPU ? 1 : 0); + // Sanity checks. - if (batch_size_ < 0) steps_ = 1; - if (batch_size_ * steps > max_batch_size_) { + if (batch_size_ <= 0) { + batch_size_ = -1; // Variable batch size. + steps_ = 1; + } + if (batch_size_ * steps_ > max_batch_size_) { batch_size_ = max_batch_size_ / steps_; } - for (int step = 1; step <= steps_; step++) - session_.emplace_back( - onnx_env_, file.onnx_model().model().data(), - file.onnx_model().model().size(), - GetOptions(provider, gpu, threads, batch_size_ * step)); - const auto& md = file.onnx_model(); if (!md.has_input_planes()) { throw Exception("NN doesn't have input planes defined."); @@ -391,6 +456,11 @@ OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict&, std::transform(outputs_.begin(), outputs_.end(), std::back_inserter(outputs_cstr_), [](const auto& x) { return x.c_str(); }); + + for (int step = 1; step <= steps_; step++) + session_.emplace_back(onnx_env_, file.onnx_model().model().data(), + file.onnx_model().model().size(), + GetOptions(gpu, threads, batch_size_ * step)); } template @@ -398,22 +468,8 @@ std::unique_ptr MakeOnnxNetwork(const std::optional& w, const OptionsDict& opts) { if (!w) throw Exception("The ONNX backend requires a network file."); - int gpu = opts.GetOrDefault("gpu", 0); - - int batch_size = - opts.GetOrDefault("batch", kProvider == OnnxProvider::DML ? 
16 : -1); - - int steps = - opts.GetOrDefault("steps", kProvider == OnnxProvider::DML ? 4 : 1); - - int threads = - opts.GetOrDefault("threads", kProvider == OnnxProvider::CPU ? 1 : 0); - - if (batch_size <= 0) batch_size = -1; // Variable batch size. - if (w->has_onnx_model()) { - return std::make_unique(*w, opts, kProvider, gpu, threads, - batch_size, steps); + return std::make_unique(*w, opts, kProvider); } else { WeightsToOnnxConverterOptions converter_options; converter_options.opset = opts.GetOrDefault("opset", 17); @@ -428,20 +484,18 @@ std::unique_ptr MakeOnnxNetwork(const std::optional& w, opts.GetOrDefault("value_head", "winner"); std::string datatype; - if (opts.IsDefault("datatype")) { + if (opts.Exists("datatype")) { + datatype = opts.Get("datatype"); + } else { bool fp16 = opts.GetOrDefault( "fp16", kProvider == OnnxProvider::CPU ? false : true); datatype = fp16 ? "f16" : "f32"; - } else { - datatype = opts.Get("datatype"); } converter_options.data_type = WeightsToOnnxConverterOptions::StringToDataType(datatype); - converter_options.relax_op_types = false; auto converted = ConvertWeightsToOnnx(*w, converter_options); - return std::make_unique(converted, opts, kProvider, gpu, - threads, batch_size, steps); + return std::make_unique(converted, opts, kProvider); } } @@ -451,6 +505,7 @@ REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork, 64) #ifdef USE_DML REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork, 63) #endif +REGISTER_NETWORK("onnx-trt", MakeOnnxNetwork, 60) REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork, 61) REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork, 62)