diff --git a/.circleci/config.yml b/.circleci/config.yml index 1c5bdbc82d..6404b6bb28 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,24 +9,26 @@ jobs: name: Install build tools command: | apt-get update - apt-get -y install git python3-pip gcc-10 g++-10 clang-12 zlib1g zlib1g-dev + apt-get -y install git python3-pip gcc-10 g++-10 clang-12 zlib1g zlib1g-dev wget pip3 install meson==0.63 pip3 install ninja - run: - name: "Pull Submodules" - command: git submodule update --init + name: Install onnxruntime + command: | + wget https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/onnxruntime-linux-x64-1.22.0.tgz -P /tmp + tar xzf /tmp/onnxruntime-linux-x64-1.22.0.tgz -C /tmp - run: name: Meson GCC environment: CC: gcc-10 CXX: g++-10 - command: meson build-gcc -Dgtest=false + command: meson build-gcc -Dgtest=false -Donnx_include=/tmp/onnxruntime-linux-x64-1.22.0/include -Donnx_libdir=/tmp/onnxruntime-linux-x64-1.22.0/lib - run: name: Meson Clang environment: CC: clang-12 CXX: clang++-12 - command: meson build-clang -Dgtest=false -Db_lto=false + command: meson build-clang -Dgtest=false -Db_lto=false -Donnx_include=/tmp/onnxruntime-linux-x64-1.22.0/include -Donnx_libdir=/tmp/onnxruntime-linux-x64-1.22.0/lib - run: name: Build GCC command: | @@ -39,13 +41,9 @@ jobs: ninja -j 4 "mac": macos: - xcode: 14.1.0 - resource_class: macos.m1.medium.gen1 + xcode: 14.3.1 steps: - checkout - - run: - name: "Pull Submodules" - command: git submodule update --init - run: name: Install build tools command: | @@ -71,43 +69,63 @@ jobs: command: lipo -create -o /tmp/lc0 build/lc0 build-arm/lc0 - store_artifacts: path: /tmp/lc0 - destination: lc0-macos_12.6.1 + destination: lc0-macos_13.2.1 - run: - name: Verify Workspace + name: Prepare Workspace command: | - mv /tmp/lc0 /tmp/lc0-macos_12.6.1 - ls -lah /tmp + mkdir -p workspace + mv /tmp/lc0 workspace - persist_to_workspace: - root: /tmp + root: workspace paths: - - lc0-macos_12.6.1 - + - lc0 + "mac latest": + macos: + xcode: 26.1.0 + steps: + - checkout + - run: + name: Install build tools + command: | + pip3 install meson + pip3 install ninja + - run: + name: Build lc0 arm + command: | + meson build-arm --buildtype=release -Dgtest=false -Dopencl=false + cd build-arm + ninja "upload-github-release": macos: - xcode: 14.1.0 + xcode: 14.3.1 steps: - attach_workspace: - at: /tmp + at: /tmp/workspace - run: name: Install GitHub CLI command: brew install gh - run: name: Verify Workspace command: | - ls -lah /tmp + ls -lah /tmp/workspace - run: name: Upload to GitHub Release command: | + mv /tmp/workspace/lc0 /tmp/lc0-$CIRCLE_TAG-macos_13.2.1 gh release upload \ "$CIRCLE_TAG" \ - /tmp/lc0-macos_12.6.1 \ - --clobber + /tmp/lc0-$CIRCLE_TAG-macos_13.2.1 \ + --clobber --repo LeelaChessZero/lc0 workflows: version: 2 builds: jobs: - build - - "mac" + - "mac": + filters: + tags: + only: /v[0-9]+(\.[0-9]+)*(\-.+)?/ + - "mac latest" - "upload-github-release": requires: - "mac" diff --git a/.gitmodules b/.gitmodules index 6575e63266..e69de29bb2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "libs/lczero-common"] - path = libs/lczero-common - url = https://github.com/LeelaChessZero/lczero-common.git diff --git a/AUTHORS b/AUTHORS index d1fabfddfe..b7d6010ae3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,6 +8,7 @@ almaudoh Aloril Andrew Grant Andy Olsen +Aniebiet Udoh Ankan Ankan Banerjee Anson Hu @@ -19,9 +20,11 @@ Boštjan Mejak Brandon Lin Brett Holman Carlo Wood +Chin-Chang Yang cn4750 Cong Contrad Namiseb (Bonan) +Copilot (bot) 
cwbriscoe danegraphics Daniel Monroe @@ -46,8 +49,10 @@ Francis Li Francois Francois Pays François Pays +Gabe Ganesh Krishnan GBeauregard +Gergely Fülöp Gian-Carlo Pascutto gmorenz Google LLC @@ -56,11 +61,16 @@ Hace Hans Ekbrand Henrik Forstén Ikko Eltociear Ashimine +Jack L Jack Thomson James Horsfall Thomas +jamie jjoshua2 John Newlin +john-sp +Julian-Dominik Helmsen Karl Kfoury +Kathleen Mcgrievy kiilas Kip Hamiltons Kovax @@ -73,6 +83,7 @@ Martin Martin Senft masterkni6 masterkni666 +Menkib Mike Roberts Naphthalin nathan-lc0 @@ -85,13 +96,17 @@ Pan patrik-ha PaulJeFi Pratik Dixit +psykose QxC4eva +Rafal Bielski Raj Reece H. Dunn Ron Wolf Sami Kiminki +Sherman Siu Shreyas Kapur shtayerc +Shukant Pal Simon slash students @@ -108,6 +123,7 @@ Valentin Valeriy Huz Victor Popovici Videodr0me +Viet-Anh Tran Viren6 Yan Zhang -zz4032 \ No newline at end of file +zz4032 diff --git a/README.md b/README.md index a56da72740..0ce7a2a125 100644 --- a/README.md +++ b/README.md @@ -7,33 +7,28 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, s ## Downloading source -Lc0 can be acquired either via a git clone or an archive download from GitHub. Be aware that there is a required submodule which isn't included in source archives. +Lc0 can be acquired either via a git clone or an archive download from GitHub. -For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.31`), which is equivalent to using the latest version tag. +For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.32`), which is equivalent to using the latest version tag. Versioning follows the Semantic Versioning guidelines, with major, minor and patch sections. The training server enforces game quality using the versions output by the client and engine. - Download using git: ```shell -git clone -b release/0.31 --recurse-submodules https://github.com/LeelaChessZero/lc0.git +git clone -b release/0.32 https://github.com/LeelaChessZero/lc0.git ``` If you have cloned already an old version, fetch, view and checkout a new branch: ```shell git fetch --all git branch --all -git checkout -t remotes/origin/release/0.31 +git checkout -t remotes/origin/release/0.32 ``` - -If you prefer to download an archive, you need to also download and place the submodule: - * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.31) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.31) archive is also available) +If you prefer to download an archive: + * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.32) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.32) archive is also available) * Extract - * Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz)) - * Move the second archive into the first archive's `libs/lczero-common/` folder and extract - * The final form should look like `/libs/lczero-common/proto/` Having successfully acquired Lc0 via either of these methods, proceed to the build section below and follow the instructions for your OS. 
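If you are unsure which version you ended up with after the steps above, a quick check with standard git commands can confirm it (a sketch; the expected branch name assumes the `release/0.32` example used above):
```shell
cd lc0
# Print the branch that is currently checked out (expected: release/0.32).
git branch --show-current
# Print the nearest version tag, e.g. v0.32.0.
git describe --tags
```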
@@ -42,13 +37,11 @@ Having successfully acquired Lc0 via either of the bui
Building should be easier now than it was in the past. Please report any problems you have.
-Aside from the git submodule, lc0 requires the Meson build system and at least one backend library for evaluating the neural network, as well as the required `zlib`. (`gtest` is optionally used for the test suite.) If your system already has this library installed, they will be used; otherwise Meson will generate its own copy of the two (a "subproject"), which in turn requires that git is installed (yes, separately from cloning the actual lc0 repository). Meson also requires python and Ninja.
+Building lc0 requires the Meson build system and at least one backend library for evaluating the neural network, as well as a few other libraries. If your system already has these libraries installed, they will be used; otherwise Meson will generate its own copy (a "subproject"), which in turn requires that git is installed (yes, separately from cloning the actual lc0 repository). Meson also requires python and Ninja.
-Backend support includes (in theory) any CBLAS-compatible library for CPU usage, such as OpenBLAS or Intel's DNNL or MKL. For GPUs, OpenCL and CUDA+cudnn are supported, while DX-12 can be used in Windows 10 with latest drivers.
+Backend support includes (in theory) any CBLAS-compatible library for CPU usage, but OpenBLAS or Intel's DNNL are the main ones. For GPUs, the following are supported: CUDA (with optional cuDNN), various flavors of onnxruntime, and Apple's Metal Performance Shaders. There is also experimental SYCL support for AMD and Intel GPUs.
-Finally, lc0 requires a compiler supporting C++17. Minimal versions seem to be g++ v8.0, clang v5.0 (with C++17 stdlib) or Visual Studio 2017.
-
-*Note* that cuda checks the compiler version and stops even with newer compilers, and to work around this we have added the `nvcc_ccbin` build option. This is more of an issue with new Linux versions, but you can get around it by using an earlier version of gcc just for cuda. As an example, adding `-Dnvcc_ccbin=g++-9` to the `build.sh` command line will use g++-9 with cuda instead of the system compiler.
+Finally, lc0 requires a compiler supporting C++20. Minimal versions tested are g++ v10.0, clang v12.0 and Visual Studio 2019 version 16.11.
Given those basics, the OS and backend specific instructions are below.
@@ -56,160 +49,125 @@ Given those basics, the OS and backend specific instructions are below.
#### Generic
-1. Install backend:
-   - If you want to use NVidia graphics cards Install [CUDA](https://developer.nvidia.com/cuda-zone) and [cuDNN](https://developer.nvidia.com/cudnn).
-   - If you want to use AMD graphics cards install OpenCL.
-   - if you want OpenBLAS version Install OpenBLAS (`libopenblas-dev`).
+1. Install backend (also read the detailed instructions in later sections):
+   - If you want to use NVidia graphics cards, install [CUDA](https://developer.nvidia.com/cuda-zone) (and optionally [cuDNN](https://developer.nvidia.com/cudnn)).
+   - If you want to use AMD or Intel graphics cards, you can try SYCL.
+   - If you want BLAS, install either OpenBLAS or DNNL.
2. Install ninja build (`ninja-build`), meson, and (optionally) gtest (`libgtest-dev`).
3. Go to `lc0/`
4. Run `./build.sh`
5. `lc0` will be in `lc0/build/release/` directory
-6. Unzip a [neural network](https://lczero.org/play/networks/bestnets/) in the same directory as the binary.
+6.
Download a [neural network](https://lczero.org/play/networks/bestnets/) to the same directory as the binary (no need to unpack it).
If you want to build with a different compiler, pass the `CC` and `CXX` environment variables:
+```shell
+CC=clang CXX=clang++ ./build.sh
+```
-    CC=clang-6.0 CXX=clang++-6.0 ./build.sh
-
-#### Note on installing CUDA on Ubuntu
-
-Nvidia provides .deb packages. CUDA will be installed in `/usr/local/cuda-10.0` and requires 3GB of diskspace.
-If your `/usr/local` partition doesn't have that much space left you can create a symbolic link before
-doing the install; for example: `sudo ln -s /opt/cuda-10.0 /usr/local/cuda-10.0`
-
-The instructions given on the nvidia website tell you to finish with `apt install cuda`. However, this
-might not work (missing dependencies). In that case use `apt install cuda-10-0`. Afterwards you can
-install the meta package `cuda` which will cause an automatic upgrade to a newer version when that
-comes available (assuming you use `Installer Type deb (network)`, if you'd want that (just cuda-10-0 will
-stay at version 10). If you don't know what to do, only install cuda-10-0.
-
-cuDNN exists of two packages, the Runtime Library and the Developer Library (both a .deb package).
+#### Ubuntu 20.04
-Before you can download the latter you need to create a (free) "developer" account with nvidia for
-which at least a legit email address is required (their website says: The e-mail address is not made public
-and will only be used if you wish to receive a new password or wish to receive certain news or notifications
-by e-mail.). Further they ask for a name, date of birth (not visible later on), country, organisation ("LeelaZero"
-if you have none), primary industry segment ("Other"/none) and which development areas you are interested
-in ("Deep Learning").
+For Ubuntu 20.04 you need meson, ninja and gcc-10 before performing the steps above. The following should work:
+```shell
+apt-get update
+apt-get -y install git python3-pip gcc-10 g++-10 zlib1g zlib1g-dev
+pip3 install meson
+pip3 install ninja
+CC=gcc-10 CXX=g++-10 INSTALL_PREFIX=~/.local ./build.sh
+```
-#### Ubuntu 18.04
+Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
-For Ubuntu 18.04 you need the latest version of meson, libstdc++-8-dev, and clang-6.0 before performing the steps above:
+### Windows
-    sudo apt-get install libstdc++-8-dev clang-6.0 ninja-build pkg-config
-    pip3 install meson --user
-    CC=clang-6.0 CXX=clang++-6.0 INSTALL_PREFIX=~/.local ./build.sh
+Here are the brief instructions for CUDA/cuDNN; for details and other options see `windows-build.md` and the instructions in the following sections.
-Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
+1. Install Microsoft Visual Studio (2019 version 16.11 or later)
+2. Install [CUDA](https://developer.nvidia.com/cuda-zone)
+3. (Optionally install [cuDNN](https://developer.nvidia.com/cudnn)).
+4. Install Python3 if you didn't install it with Visual Studio.
+5. Install Meson: `pip3 install --upgrade meson`
+6. If `CUDA_PATH` is not set (run the `set` command to see the full list of variables), edit `build.cmd` and set `CUDA_PATH` to your CUDA directory
+* If you also want cuDNN, set `CUDNN_PATH` to your cuDNN directory (not needed if it is the same as `CUDA_PATH`).
-#### Ubuntu 16.04
+7. Run `build.cmd`. It will ask permission to delete the build directory, then generate MSVS project and pause.
-For Ubuntu 16.04 you need the latest version of meson, ninja, clang-6.0, and libstdc++-8:
+Then either:
-    wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-    sudo apt-add-repository 'deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main'
-    sudo add-apt-repository ppa:ubuntu-toolchain-r/test
-    sudo apt-get update
-    sudo apt-get install clang-6.0 libstdc++-8-dev
-    pip3 install meson ninja --user
-    CC=clang-6.0 CXX=clang++-6.0 INSTALL_PREFIX=~/.local ./build.sh
+8. Hit `Enter` to build it.
+9. Resulting binary will be `build/lc0.exe`
-Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
+Or:
-#### openSUSE (all versions)
+8. Open generated solution `build/lc0.sln` in Visual Studio and build it yourself.
-Instructions, packages and tools for building on openSUSE are at [openSUSE_install.md](openSUSE_install.md)
+### Mac
-#### Docker
+You will need Xcode and python3 installed. Then you need to install some required packages through Terminal:
-Use https://github.com/vochicong/lc0-docker
-to run latest releases of lc0 and the client inside a Docker container.
+1. Install meson: `pip3 install meson`
+2. Install ninja: `pip3 install ninja`
+Now download the lc0 source, if you haven't already done so, following the instructions earlier in the page.
-### Windows
+3. Go to the lc0 directory.
+4. Run `./build.sh -Dgtest=false`
-Here are the brief instructions for CUDA/CuDNN, for details and other options see `windows-build.md`.
+The compiled Lc0 will be in `build/release`
-0. Install Microsoft Visual Studio (2017 or later)
-1. Install [CUDA](https://developer.nvidia.com/cuda-zone)
-2. Install [cuDNN](https://developer.nvidia.com/cudnn).
-3. Install Python3
-4. Install Meson: `pip3 install --upgrade meson`
-5. Edit `build.cmd`:
+Starting with v0.32.0, we are also offering a pre-compiled version that can be downloaded from the [release page](https://github.com/LeelaChessZero/lc0/releases).
-* Set `CUDA_PATH` with your CUDA directory
-* Set `CUDNN_PATH` with your cuDNN directory (may be the same with CUDA_PATH)
+### CUDA
-6. Run `build.cmd`. It will ask permission to delete the build directory, then generate MSVS project and pause.
+CUDA can be downloaded and installed following the instructions at https://developer.nvidia.com/cuda-zone. The build in most cases will pick it up with no further action. However, if the cuda compiler (`nvcc`) is not found, you can call the build like this: `PATH=/usr/local/cuda/bin:$PATH ./build.sh`, replacing the path with the correct one for `nvcc`.
-Then either:
+*Note* that CUDA uses the system compiler and stops if it doesn't recognize the version, even if newer. This is more of an issue with new Linux versions, but you can get around it with the `nvcc_ccbin` build option to specify a different compiler just for cuda. As an example, adding `-Dnvcc_ccbin=g++-11` to the build command line will use g++-11 with cuda instead of the system compiler.
-7. Hit `Enter` to build it.
-8. Resulting binary will be `build/lc0.exe`
+### ONNX
-Or.
+Lc0 offers several ONNX-based backends, namely onnx-cpu, onnx-cuda, onnx-trt, onnx-rocm and on Windows onnx-dml, utilizing the execution providers offered by onnxruntime.
-7. Open generated solution `build/lc0.sln` in Visual Studio and build yourself.
+Some Linux systems are starting to offer onnxruntime packages, so after installing such a package there is a good chance the Lc0 build will pick it up with no further action required.
Otherwise you can set the `onnx_libdir` and `onnx_include` build options to point to the onnxruntime libraries and include directories respectively. The same options are used if you unpack a package downloaded from the onnxruntime releases page (https://github.com/microsoft/onnxruntime/releases).
-### Mac
+For Windows, we offer pre-compiled packages for onnx-dml and onnx-trt; see the included README for installation instructions.
-First you need to install some required packages through Terminal:
-1. Install brew as per the instructions at https://brew.sh/
-2. Install python3: `brew install python3`
-3. Install meson: `brew install meson`
-4. Install ninja: `brew install ninja`
-5. (For Mac OS 10.14 Mojave, or if the other step 5 fails):
-   * Install developer tools: ``xcode-select --install``
-   * When using Mojave install SDK headers: `installer -pkg /Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg -target /` (if this doesn't work, use `sudo installer` instead of just `installer`.)
+### SYCL
-Or.
+*Note* that SYCL support is new in v0.32.0 and as such is still considered experimental.
-5. (For MacOS 10.15 Catalina, or if the other step 5 fails):
-   * Install Xcode command-line tools: ``xcode-select --install``
-   * Install "XCode Developer Tools" through the app store. (First one on the list of Apps if searched.)
-   * Associate the SDK headers in XCode with a command: export CPATH=\`xcrun --show-sdk-path\`/usr/include
-
-Now download the lc0 source, if you haven't already done so, following the instructions earlier in the page.
+You will need the Intel "oneAPI DPC++/C++ Compiler", "DPC++ Compatibility Tool" and (for an Intel GPU) "oneAPI Math Kernel Library (oneMKL)" or (for an AMD GPU) hipBLAS.
-6. Go to the lc0 directory.
-7. Run `./build.sh -Dgtest=false` (needs step 5)
+The Intel tools can be found in either the "oneAPI Base Toolkit" or "C++ Essentials" packages that can be downloaded from Intel's website, while hipBLAS can be downloaded from AMD as part of ROCm.
-### Raspberry Pi
+The compiler for C code is icx, while for C++ code it is icx on Windows but icpx on Linux.
-You'll need to be running the latest Raspberry Pi OS "buster".
+To build Lc0 with SYCL you need to set the `sycl` build option using `-Dsycl=l0` (that is el zero) for an Intel GPU or `-Dsycl=amd` for (you guessed it) an AMD GPU.
-1. Install OpenBLAS
+You may also have to set the `dpct_include` option to point to the DPC++ Compatibility Tool includes, the `onemkl_include` similarly for the oneMKL includes, or `hip_libdirs` and `hip_include` to the AMD HIP libraries and includes respectively.
+On Linux, a typical session would go like this:
```shell
-git clone https://github.com/xianyi/OpenBLAS.git
-cd OpenBLAS/
-make
-sudo make PREFIX=/usr install
-cd ..
+. /opt/intel/oneapi/setvars.sh --include-intel-llvm
+CC=icx CXX=icpx AR=llvm-ar ./build.sh release -Dgtest=false -Dsycl=l0
```
+The first line is to initialize the build environment and is only needed once per session, while the build line may need modification as described above.
-2. Install Meson
+On Windows you will have to build using `ninja`, which is provided by Visual Studio if you install the CMake component. We provide a `build-sycl.cmd` script that should build just fine for an Intel GPU. This script has not yet been tested with an AMD GPU, so some editing will be required.
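For an AMD GPU on Linux, the invocation is similar; the sketch below is an untested example where the ROCm paths and the gfx identifier are assumptions that must be adjusted to match your installation:
```shell
# Initialize the oneAPI build environment first (once per session), as above.
. /opt/intel/oneapi/setvars.sh --include-intel-llvm
# Hypothetical ROCm location; point hip_include/hip_libdirs at your actual install.
# The amd_gfx identifier (e.g. 90a or 1100) may be detected automatically on some setups.
CC=icx CXX=icpx AR=llvm-ar ./build.sh release -Dgtest=false -Dsycl=amd \
    -Dhip_include=/opt/rocm/include -Dhip_libdirs=/opt/rocm/lib -Damd_gfx=1100
```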
-
```shell
-pip install meson
-pip install ninja
-```
+You can also install the [oneAPI DPC++/C++ Compiler Runtime](https://www.intel.com/content/www/us/en/developer/articles/tool/compilers-redistributable-libraries-by-version.html) so you can run Lc0 without needing to initialize the build environment every time.
-3. Install compiler and standard libraries
+### BLAS
-```shell
-sudo apt install clang-6.0 libstdc++-8-dev
-```
+Lc0 can also run (a bit slowly) on CPU, using matrix multiplication functions from a BLAS library. By default OpenBLAS is used if available, as it seems to offer good performance on a wide range of processors. If your system doesn't offer an OpenBLAS package (e.g. `libopenblas-dev`), or you have a recent processor, you can use DNNL instead. To use DNNL you have to pass `-Ddnnl=true` to the build and specify the directory where it was installed using the `-Ddnnl_dir=` option. For Macs, the Accelerate library will be used.
-4. Clone lc0 and compile
+If the "Intel Implicit SPMD Program Compiler" (`ispc`) is installed, some performance-critical functions will use vectorized code for faster execution.
-```shell
-git clone https://github.com/LeelaChessZero/lc0.git
-cd lc0
-git submodule update --init --recursive
-CC=clang-6.0 CXX=clang++-6.0 ./build.sh -Ddefault_library=static
-```
+*Note* that Lc0 is not able to control the number of threads with all BLAS libraries. Some libraries try to exploit cores aggressively, in which case it may be best to leave the threads set to the default (i.e. automatic) setting.
+
+## Getting help
-5. The resulting binary will be in build/release
+If there is an issue or the above instructions were not clear, you can always ask for help. The fastest way is to ask in the help channel of our [Discord chat](http://lc0.org/chat), but you can also open a [GitHub issue](https://github.com/LeelaChessZero/lc0/issues) (after checking the issue hasn't already been reported).

## Python bindings
@@ -240,8 +198,8 @@ along with Leela Chess. If not, see .
### Additional permission under GNU GPL version 3 section 7 -_The source files of Lc0 with the exception of the BLAS and OpenCL -backends (all files in the `blas` and `opencl` sub-directories) have +_The source files of Lc0 with the exception of the BLAS, OpenCL and SYCL +backends (all files in the `blas`, `opencl` and `sycl` sub-directories) have the following additional permission, as allowed under GNU GPL version 3 section 7:_ diff --git a/appveyor.yml b/appveyor.yml index fa4ea670a3..e68f9f136e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -6,20 +6,24 @@ image: environment: matrix: - NAME: gpu-nvidia-cudnn - - NAME: gpu-nvidia-cuda + - NAME: gpu-nvidia-cuda12 # - NAME: gpu-dx12 # - NAME: gpu-opencl - NAME: cpu-dnnl - NAME: cpu-openblas # - NAME: onednn - - NAME: onnx-dml + - NAME: onnx - NAME: android + - NAME: gpu-nvidia-cuda11 for: - matrix: only: + - NAME: gpu-nvidia-cudnn + - NAME: gpu-nvidia-cuda11 # - NAME: gpu-opencl - NAME: cpu-dnnl + - NAME: cpu-openblas skip_non_tags: true clone_folder: c:\projects\lc0 install: @@ -29,20 +33,21 @@ install: - cmd: set OPENCL=false - cmd: set BLAS=false - cmd: set ONEDNN=false -- cmd: set ONNX_DML=false +- cmd: set ONNX=false - cmd: set GTEST=false - cmd: set ANDROID=false - cmd: IF %NAME%==android set ANDROID=true - cmd: IF %NAME%==gpu-nvidia-cudnn set CUDNN=true - cmd: IF %NAME%==gpu-nvidia-cudnn set CUDA=true -- cmd: IF %NAME%==gpu-nvidia-cuda set CUDA=true +- cmd: IF %NAME%==gpu-nvidia-cuda11 set CUDA=true +- cmd: IF %NAME%==gpu-nvidia-cuda12 set CUDA=true - cmd: IF %NAME%==gpu-dx12 set DX=true - cmd: IF %NAME%==gpu-opencl set OPENCL=true - cmd: IF %NAME%==cpu-dnnl set BLAS=true - cmd: IF %NAME%==cpu-openblas set BLAS=true -- cmd: IF %NAME%==cpu-openblas set GTEST=true - cmd: IF %NAME%==onednn set ONEDNN=true -- cmd: IF %NAME%==onnx-dml set ONNX_DML=true +- cmd: IF %NAME%==onnx set ONNX=true +- cmd: IF %NAME%==onnx set GTEST=true - cmd: set NET=753723 - cmd: set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET=791556 @@ -54,9 +59,12 @@ install: - cmd: IF %NAME%==onednn set DNNL_NAME=dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp - cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/borg323/oneDNN/releases/download/v2.7.2/dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip - cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip -oC:\cache -- cmd: IF %NAME%==onnx-dml set ONNX_NAME=onnxruntime-win-x64-dml-1.22 -- cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\%ONNX_NAME% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/Microsoft.ML.OnnxRuntime.DirectML.1.22.0.nupkg -- cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\%ONNX_NAME% 7z x Microsoft.ML.OnnxRuntime.DirectML.1.22.0.nupkg -oC:\cache\%ONNX_NAME% +- cmd: IF %NAME%==onnx set ONNX_NAME=onnxruntime-win-x64-dml-1.22.1 +- cmd: IF %NAME%==onnx set ONNX_NAME_TWO=onnxruntime-win-x64-gpu-1.22.1 +- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.1/Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg +- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME% 7z x Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg -oC:\cache\%ONNX_NAME% +- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME_TWO% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.1/onnxruntime-win-x64-gpu-1.22.1.zip +- cmd: IF 
%NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME_TWO% 7z x onnxruntime-win-x64-gpu-1.22.1.zip -oC:\cache - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS appveyor DownloadFile https://sjeng.org/ftp/OpenBLAS-0.3.3-win-oldthread.zip - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS 7z x OpenBLAS-0.3.3-win-oldthread.zip -oC:\cache\OpenBLAS - cmd: IF %OPENCL%==true nuget install opencl-nug -Version 0.777.77 -OutputDirectory C:\cache @@ -71,11 +79,21 @@ install: - cmd: IF DEFINED CUDNN_INSTALL cuda_10.1.243_win10_network -s nvcc_10.1 cublas_dev_10.1 cublas_10.1 cudart_10.1 - cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/redist/cudnn/v7.5.1/cudnn-10.1-windows10-x64-v7.5.1.10.zip - cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.1-windows10-x64-v7.5.1.10.zip -o"%CUDA_PATH%" -- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF %NAME%==gpu-nvidia-cuda11 set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF %NAME%==gpu-nvidia-cuda12 set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" +- cmd: IF %NAME%==onnx set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" - cmd: IF %CUDA%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 -- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe -- cmd: IF DEFINED CUDA_INSTALL cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF %ONNX%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda11 appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda11 cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda12 appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda12 cuda_12.9.0_windows_network.exe -s nvcc_12.9 cublas_dev_12.9 cublas_12.9 curand_dev_12.9 cudart_12.9 documentation_12.9 +- cmd: IF %NAME%==gpu-nvidia-cuda12 IF NOT EXIST C:\cache\cutlass-2.11.0 appveyor DownloadFile https://github.com/NVIDIA/cutlass/archive/refs/tags/v2.11.0.zip +- cmd: IF %NAME%==gpu-nvidia-cuda12 IF NOT EXIST C:\cache\cutlass-2.11.0 7z x v2.11.0.zip -oC:\cache\ +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==onnx appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==onnx cuda_12.9.0_windows_network.exe -s nvcc_12.9 cudart_12.9 - cmd: IF %CUDA%==true set PATH=%CUDA_PATH%\bin;%PATH% +- cmd: IF %ONNX%==true set PATH=%CUDA_PATH%\bin;%PATH% - cmd: set PATH=C:\Python310;C:\Python310\scripts;%PATH% #- cmd: pip3 install --upgrade meson==0.55.3 - cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.8.7 @@ -108,10 +126,10 @@ cache: - C:\cache - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1' - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1' + - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9 -> appveyor.yml' - C:\projects\lc0\subprojects\packagecache - 
C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 before_build: -- cmd: git submodule update --init --recursive - cmd: IF %BLAS%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h - cmd: IF %ANDROID%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h - cmd: SET BUILD_BLAS=%BLAS% @@ -126,8 +144,9 @@ before_build: - cmd: IF %CUDA%==true SET F16C=false - cmd: SET EXTRA= - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md -- cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\runtimes\win-x64\native\ -Donnx_include=C:\cache\%ONNX_NAME%\build\native\include -- cmd: IF %ANDROID%==false meson build --backend vs2019 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% +- cmd: IF %ONNX%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\runtimes\win-x64\native\ -Donnx_include=C:\cache\%ONNX_NAME%\build\native\include -Ddefault_backend=onnx-trt -Dplain_cuda=false +- cmd: IF %NAME%==gpu-nvidia-cuda12 SET EXTRA=-Db_vscrt=md -Dcutlass=true -Dcutlass_include=C:\cache\cutlass-2.11.0\include +- cmd: IF %ANDROID%==false meson build --backend vs2019 --buildtype release -Dgtest=false -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false build_script: @@ -139,7 +158,7 @@ after_build: - cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true call scripts\appveyor_android_package.cmd - cmd: cd C:\projects\lc0 artifacts: - - path: build/lc0.exe + - path: /build/lc0*.exe/ name: lc0-$(NAME) - path: arm64-v8a/lc0 name: lc0-android-arm64-v8a @@ -169,6 +188,7 @@ deploy: test_script: 
- cmd: IF %GTEST%==true cd build - cmd: IF %GTEST%==true xcopy /s /i C:\cache\syzygy syzygy +- cmd: IF %GTEST%==true IF %ONNX%==true copy %PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll - cmd: IF %GTEST%==true meson test --print-errorlogs - cmd: cd C:\projects\lc0 on_finish: diff --git a/build.cmd b/build.cmd index 071105b1f8..262b5ee00e 100644 --- a/build.cmd +++ b/build.cmd @@ -2,7 +2,7 @@ setlocal rem 1. Set the following for the options you want to build. -set CUDNN=true +set CUDNN=false set CUDA=true set DX12=false set OPENCL=false @@ -11,6 +11,7 @@ set DNNL=false set OPENBLAS=false set EIGEN=false set TEST=false +set CUTLASS=true if "%CUDA%"=="true" ( if not defined CUDA_PATH ( @@ -71,6 +72,7 @@ meson setup build --backend %backend% --buildtype release -Ddx=%DX12% -Dcudnn=%C -Dmkl_include="%MKL_PATH%\include" -Dmkl_libdirs="%MKL_PATH%\lib\intel64" -Ddnnl_dir="%DNNL_PATH%" ^ -Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^ -Dopenblas_include="%OPENBLAS_PATH%\include" -Dopenblas_libdirs="%OPENBLAS_PATH%\lib" ^ +-Dcutlass="%CUTLASS%" ^ -Ddefault_library=static if errorlevel 1 exit /b @@ -80,4 +82,4 @@ pause cd build msbuild /m /p:Configuration=Release /p:Platform=x64 /p:WholeProgramOptimization=true ^ -/p:PreferredToolArchitecture=x64 lc0.sln /filelogger \ No newline at end of file +/p:PreferredToolArchitecture=x64 lc0.sln /filelogger diff --git a/build.sh b/build.sh index fa30e5c3df..8eb935c926 100755 --- a/build.sh +++ b/build.sh @@ -24,7 +24,7 @@ if [ -f "${BUILDDIR}/build.ninja" ] then "${MESON}" configure "${BUILDDIR}" -Dbuildtype="${BUILDTYPE}" -Dprefix="${INSTALL_PREFIX:-/usr/local}" "$@" else - "${MESON}" "${BUILDDIR}" --buildtype "${BUILDTYPE}" --prefix "${INSTALL_PREFIX:-/usr/local}" "$@" + "${MESON}" setup "${BUILDDIR}" --buildtype "${BUILDTYPE}" --prefix "${INSTALL_PREFIX:-/usr/local}" "$@" fi "${MESON}" compile -C "${BUILDDIR}" diff --git a/changelog.txt b/changelog.txt index 5d208674ac..cdfec68116 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,4 +1,101 @@ -v0.31.0-rc1 (2024-03-25) +v0.32.0 (2025-08-21) +~~~~~~~ +* Support for building with cuda 13. +* README update. +* Build system improvements. + +v0.32.0-rc2 (2025-08-12) +~~~~~~~ +* Fix for onnx-trt bug, where the wrong network could be used from the cache. +* Added code to detect RPE nets and give an error instead of bad results. +* Better instructions in the readme and install script for onnx-trt. +* Made `UCI_ShowWDL` again off by default again as some GUIs have issues. +* Fixed a long standing issue when compiled with `-ffast-math` (or `icx -O3`). +* Several improvements to the sycl backend. +* Several improvements to the metal backend. +* Refactored the rescorer code and training data header to make them usable by + external tools. +* Relaxed cuda/cudnn version checks so that no warnings are shown for mismatched + versions that are supported. +* Several build system updates. +* Assorted small fixes and improvements. + +v0.32.0-rc1 (2025-07-18) +~~~~~~~ +The code has been reorganized and undergone major changes. Therefore this +changelog will be less detailed and describe the changes in major groups. +* We have a new search API that allows search algorithms to co-exist. Currently + available are `classic` (the default), `dag-preview` (more later), + `valuehead` and `policyhead`. The default algorithm can be changed either at + build time by the `default_search` option or by renaming the executable to + include the algorithm name (e.g. lc0-valuehead). 
+* We also have a new backend interface that is chess-oriented and not tied to
+  the network architecture. The existing backends still use the old interface
+  through a wrapper.
+* The source code is reorganized, with a more logical directory structure.
+* The original search was ported to the new search and backend interfaces and
+  is renamed to `classic`. This has allowed some streamlining and
+  simplifications.
+* The `dag-preview` search is the DAG algorithm that lived in a separate branch
+  up to now. It hasn't been as well tested, which is why it has "preview" in its
+  name for now; it lives in the `src/search/dag_classic` directory.
+* The `valuehead` search replaces `ValueOnly` mode and selects the move with the
+  best value head evaluation.
+* The `policyhead` search is equivalent to a single-node search, selecting the
+  best move using just the policy head.
+* The new `default_backend` build option allows overriding the fixed priority
+  for the backend used by default.
+* The new `native_arch` build option can be used to override the `-march=native`
+  compiler default for Linux release builds, to help with distribution package creation.
+* We have a new `sycl` backend that will work with amd, intel and nvidia gpus.
+* There is also a new `onnx-trt` backend, using tensorrt on nvidia gpus.
+* Support for simple/normal/pro mode in the options was cleaned up, using a common
+  mechanism.
+* Added the `wait` uci extension command to allow running simple tests from the
+  command line.
+* Removed the `fen` uci extension command as it was unnecessarily complicating
+  things.
+* Some preliminary fp8 support was added for onnx and xla. This is not
+  functional, just there to make experimentation easier.
+* Several build system changes and improvements.
+* We now generate binaries for cuda 12, onnx-trt and macos.
+* Support for using lc0 with openbench.
+* New `bench` mode for a quicker benchmark.
+* Assorted small fixes and improvements.
+
+v0.31.2 (2024-10-20)
+~~~~~~~
+* Updated the WDL_mu centipawn fallback.
+* Fix for build issues with newer Linux c++ libraries.
+* Fix for an XLA Mish bug.
+* Minor README.md update.
+
+v0.31.1 (2024-08-11)
+~~~~~~~
+* Make WDL_mu score type work as intended.
+* Fix macos CI builds.
+
+v0.31.0 (2024-06-16)
+~~~~~~~
+* No changes from rc3.
+
+v0.31.0-rc3 (2024-05-29)
+~~~~~~~
+* The `WDLDrawRateTarget` option now accepts the value 0 (new default) to retain
+  raw WDL values if `WDLCalibrationElo` is set to 0 (default).
+* Improvements to the verbose move stats if `WDLEvalObjectivity` is used.
+* The centipawn score is displayed by default for old nets without WDL output.
+* Some build system improvements.
+
+v0.31.0-rc2 (2024-04-16)
+~~~~~~~
+* Changed cuda compilation options to use `-arch=native` or `-arch=all-major`
+  if no specific version is requested, with a fallback for older cuda versions
+  that don't support those options.
+* Updated android builds to use openblas 0.3.27.
+* A few small fixes.
+
+v0.31.0-rc1 (2024-03-25)
~~~~~~~
* The blas, cuda, eigen, metal and onnx backends now have support for multihead network architecture and can run BT3/BT4 nets.
@@ -39,6 +136,9 @@ natively higher draw rates.
* Made the WDL Rescale sharpness limit configurable via the `--wdl-max-s` hidden option.
+* The search task workers can be set automatically, to either 0 for cpu backends
+  or up to 4 depending on the number of cpu cores. This is enabled by
+  `--task-workers=-1` (the new default).
* Several assorted fixes and code cleanups.
v0.30.0 (2023-07-21)
diff --git a/dist/README-onnx-trt.txt b/dist/README-onnx-trt.txt new file mode 100644 index 0000000000..8a50b2689e --- /dev/null +++ b/dist/README-onnx-trt.txt @@ -0,0 +1,88 @@
+# Lc0
+
+Lc0 is a UCI-compliant chess engine designed to play chess via
+neural network, specifically those of the LeelaChessZero project
+(https://lczero.org).
+
+# Installation
+
+Summary: run `install.cmd` and follow the instructions.
+
+To run this version you will also need several dll files from NVIDIA's
+CUDA, cuDNN and TensorRT. Those dlls can either be on the system path
+from a separate installation of these libraries, or can be placed
+directly in the Lc0 folder. Either way, you will get an error message
+for any that aren't found.
+
+The dlls needed are the following:
+
+1. CUDA
+* cublas64_12.dll
+* cublasLt64_12.dll
+* cudart64_12.dll
+* cufft64_11.dll
+
+2. cuDNN
+* cudnn64_9.dll
+* cudnn_graph64_9.dll
+
+3. TensorRT
+* nvinfer_10.dll
+* nvinfer_builder_resource_10.dll
+* nvinfer_plugin_10.dll
+* nvonnxparser_10.dll
+
+The install.cmd script included in this package will download the
+CUDA and cuDNN files needed and will open the TensorRT download page
+using your browser. If it fails, you can download the files manually
+using the following addresses; the dlls are in the `bin` directory
+in the CUDA/cuDNN zips and the `lib` directory in the TensorRT zip.
+
+* https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.79-archive.zip
+* https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.1.4-archive.zip
+* https://developer.download.nvidia.com/compute/cuda/redist/libcufft/windows-x86_64/libcufft-windows-x86_64-11.4.1.4-archive.zip
+* https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.11.0.98_cuda12-archive.zip
+* https://developer.nvidia.com/tensorrt/download/10x#trt1012
+
+The TensorRT link will take you to the download page; after
+registering, go to the "TensorRT 10.12 GA for x86_64 Architecture"
+section and get the "TensorRT 10.12 GA for Windows 10, 11,
+Server 2022 and CUDA 12.0 to 12.9 ZIP Package".
+
+Finally, if Lc0 still won't run, get the latest Visual C++
+redistributable from: https://aka.ms/vs/17/release/vc_redist.x64.exe
+
+# Running
+
+When running Lc0 with a new network file, it will take some time to
+create the optimized model to use. This is normal. The model will be
+cached for future runs in the `trt_cache` folder, so next time it will
+be faster. If you want to experiment, you can rename the `trt_cache`
+folder and rerun; sometimes TensorRT will generate a different model
+that may be faster. Moreover, if you are having issues, you can
+delete/rename the cache and rerun.
+
+# License
+
+Leela Chess is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Leela Chess is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Leela Chess. If not, see .
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+modified version of those libraries), containing parts covered by the
+terms of the respective license agreement, the licensors of this
+Program grant you additional permission to convey the resulting work.
+
diff --git a/dist/install-cuda_12_9.cmd b/dist/install-cuda_12_9.cmd new file mode 100644 index 0000000000..c5a253093b --- /dev/null +++ b/dist/install-cuda_12_9.cmd @@ -0,0 +1,43 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+echo Installing the CUDA dlls required by the Lc0 cuda backend.
+
+echo 1/4. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.37-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/4. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/bin/cudart64_12.dll >cudart64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/LICENSE >CUDA.txt
+
+del /q tmp_cudart.zip
+
+echo 3/4. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.0.13-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/4. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublas64_12.dll >cublas64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll
+if errorlevel 1 goto error
+
+del /q tmp_cublas.zip
+
+echo Installation successful.
+pause
+exit /b
+
+:error
+cls
+echo Installation failed - you will have to download cuda 12.9 yourself.
+pause
+
diff --git a/dist/install-trt.cmd b/dist/install-trt.cmd new file mode 100644 index 0000000000..3538c30b66 --- /dev/null +++ b/dist/install-trt.cmd @@ -0,0 +1,99 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+
+echo This script will download and install the CUDA/cuDNN/tensorRT dlls required by the Lc0 onnx-trt backend.
+echo(
+echo If you are using a metered internet connection, be aware that the download will be around 3 GB.
+echo(
+pause
+
+echo Installing the CUDA dlls required by the Lc0 onnx-trt backend.
+
+echo 1/6. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.79-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/6. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.79-archive/bin/cudart64_12.dll >cudart64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.79-archive/LICENSE >CUDA.txt
+
+del /q tmp_cudart.zip
+
+echo 3/6. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.1.4-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/6. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.1.4-archive/bin/cublas64_12.dll >cublas64_12.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.1.4-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll +if errorlevel 1 goto error + +del /q tmp_cublas.zip + +echo 5/6. Downloading cufft. +curl -# --ssl-no-revoke -o tmp_cufft.zip https://developer.download.nvidia.com/compute/cuda/redist/libcufft/windows-x86_64/libcufft-windows-x86_64-11.4.1.4-archive.zip" +if errorlevel 1 goto error + +echo 6/6. Extracting files. +tar -xzOf tmp_cufft.zip libcufft-windows-x86_64-11.4.1.4-archive/bin/cufft64_11.dll >cufft64_11.dll +if errorlevel 1 goto error + +del /q tmp_cufft.zip + +echo Installing the cuDNN dlls required by the Lc0 onnx-trt backend. + +echo 1/2. Downloading cudnn. +curl -# --ssl-no-revoke -o tmp_cudnn.zip https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.11.0.98_cuda12-archive.zip" +if errorlevel 1 goto error + +echo 2/2. Extracting files. +tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/bin/cudnn64_9.dll >cudnn64_9.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/bin/cudnn_graph64_9.dll >cudnn_graph64_9.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/LICENSE >CUDNN.txt + +del /q tmp_cudnn.zip + +echo Installing the tensorRT dlls required by the Lc0 onnx-trt backend. + +echo 1/2. Downloading tensorRT. +curl -# --ssl-no-revoke -o tmp_tensorrt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.12.0/zip/TensorRT-10.12.0.36.Windows.win10.cuda-12.9.zip" +if errorlevel 1 goto error + +echo 2/2. Extracting files. +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_10.dll >nvinfer_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_builder_resource_10.dll >nvinfer_builder_resource_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_plugin_10.dll >nvinfer_plugin_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvonnxparser_10.dll >nvonnxparser_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/doc/Readme.txt >TENSORRT.txt + +del /q tmp_tensorrt.zip + +pause +exit /b + +:error +cls +echo Installation failed - see the README for alternative download instructions. +pause + diff --git a/libs/lczero-common b/libs/lczero-common deleted file mode 160000 index 55e1b382ef..0000000000 --- a/libs/lczero-common +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 55e1b382efadd57903e37f2a2e29caef3ea85799 diff --git a/meson.build b/meson.build index fe10b0e977..63613fb618 100644 --- a/meson.build +++ b/meson.build @@ -16,21 +16,11 @@ project('lc0', 'cpp', default_options : ['cpp_std=c++20', 'b_ndebug=if-release', 'warning_level=3', 'b_lto=true', 'b_vscrt=mt'], - meson_version: '>=0.55') + meson_version: '>=0.57') -cc = meson.get_compiler('cpp') - -if not cc.has_header('optional') or not cc.has_header('string_view') - error('Lc0 requires a compiler supporting C++17, for example g++ v8.0, ' + - 'clang v5.0 or later (with C++17 stdlib) and Visual Studio 2017 or ' + - 'later.') -endif +fs = import('fs') -if not cc.has_header('charconv') - warning('Your compiler or library does not have full C++17 support. ' + - 'See the README for compilers that are known to be working. 
' + - 'This will become an error in the future.') -endif +cc = meson.get_compiler('cpp') if cc.get_id() == 'clang' # Thread safety annotation @@ -38,7 +28,9 @@ if cc.get_id() == 'clang' endif if cc.get_id() != 'msvc' if get_option('buildtype') == 'release' - add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp') + if get_option('native_arch') + add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp') + endif endif endif if cc.get_id() == 'msvc' @@ -70,37 +62,19 @@ includes += include_directories('third_party', is_system: true) compile_proto = find_program('scripts/compile_proto.py') gen = generator(compile_proto, output: ['@BASENAME@.pb.h'], arguments : [ - '--proto_path=@CURRENT_SOURCE_DIR@/libs/lczero-common', + '--proto_path=@CURRENT_SOURCE_DIR@', '--cpp_out=@BUILD_DIR@', '@INPUT@']) -# Handle submodules. -git = find_program('git', required: false) -if run_command('scripts/checkdir.py', 'libs/lczero-common/proto', check : false).returncode() != 0 - if git.found() - if run_command(git, 'status', check : false).returncode() == 0 - message('updating git submodule libs/lczero-common') - run_command(git, 'submodule', 'update', '--init', '--recursive', check : false) - else - message('cloning lczero-common.git into libs/lczero-common') - run_command(git, 'clone', '--depth=1', - 'https://github.com/LeelaChessZero/lczero-common.git', - 'libs/lczero-common/', check : false) - endif - else - error('Please install git to automatically fetch submodules or download the archives manually from GitHub.') - endif -endif - pb_files = [ 'src/utils/protomessage.cc', - gen.process('libs/lczero-common/proto/net.proto', - preserve_path_from : meson.current_source_dir() + '/libs/lczero-common/') + gen.process('proto/net.proto', preserve_path_from : meson.current_source_dir()) ] common_files += pb_files # Extract git short revision. short_rev = 'unknown' +git = find_program('git', required: false) if git.found() r = run_command(git, 'rev-parse', '--short', 'HEAD', check : false) if r.returncode() == 0 @@ -142,17 +116,11 @@ elif get_option('malloc') != '' endif # ONNX and HLO protobufs. 
-gen_proto_src = generator(compile_proto, output: ['@BASENAME@.pb.h'], - arguments : [ - '--proto_path=@CURRENT_SOURCE_DIR@/src', - '--cpp_out=@BUILD_DIR@', - '@INPUT@']) - -files += gen_proto_src.process('src/neural/onnx/onnx.proto', - preserve_path_from : meson.current_source_dir() + '/src/') +files += gen.process('proto/onnx.proto', + preserve_path_from : meson.current_source_dir()) -files += gen_proto_src.process('src/neural/xla/hlo.proto', - preserve_path_from : meson.current_source_dir() + '/src/') +files += gen.process('proto/hlo.proto', + preserve_path_from : meson.current_source_dir()) ############################################################################# ## Main files @@ -239,6 +207,11 @@ files += [ includes += include_directories('src') +deps += dependency('absl_flat_hash_map', + include_type: 'system', + fallback: ['abseil-cpp', 'absl_container_dep'], + default_options : ['warning_level=0', 'cpp_std=c++20']) + deps += dependency('threads') ############################################################################# @@ -259,9 +232,6 @@ if get_option('dag_classic') 'src/search/dag_classic/search.cc', 'src/search/dag_classic/wrapper.cc', ] - - absl = subproject('abseil-cpp', default_options : ['warning_level=0', 'cpp_std=c++20']) - deps += absl.get_variable('absl_container_dep').as_system() endif ############################################################################# @@ -348,7 +318,13 @@ if get_option('build_backends') endif - deps += dependency('eigen3', fallback: ['eigen', 'eigen_dep']).as_system() + eigen_dep = dependency('eigen3') + # Check for needed header, bad dependency seen in the widl. + if eigen_dep.found() and cc.has_header('Eigen/Core', dependencies: eigen_dep) + deps += eigen_dep.as_system() + else + deps += subproject('eigen').get_variable('eigen_dep').as_system() + endif ispc = find_program('ispc', required: false) ispc_arch = 'x86-64' @@ -476,48 +452,45 @@ if get_option('build_backends') ## cuDNN ## ~~~~~ cudnn_libdirs = get_option('cudnn_libdirs') + nvcc_paths = [] + foreach p : cudnn_libdirs + nvcc_paths += fs.parent(p) + '/bin/nvcc' + endforeach + nvcc_paths += ['nvcc', '/usr/local/cuda/bin/nvcc', '/opt/cuda/bin/nvcc'] + message('Looking for nvcc in: ' + ', '.join(nvcc_paths)) cu_blas = cc.find_library('cublas', dirs: cudnn_libdirs, required: false) cu_dnn = cc.find_library('cudnn', dirs: cudnn_libdirs, required: false) cu_dart = cc.find_library('cudart', dirs: cudnn_libdirs, required: false) - nvcc = find_program('nvcc', '/usr/local/cuda/bin/nvcc', '/opt/cuda/bin/nvcc', + nvcc = find_program(nvcc_paths, required: false) - - if (get_option('cudnn') or get_option('plain_cuda')) and cu_blas.found() and cu_dart.found() and nvcc.found() - deps += [cu_blas, cu_dart] - cuda_files = ['src/neural/backends/cuda/layers.cc'] - if get_option('cudnn') and cu_dnn.found() - deps += cu_dnn - cuda_files += 'src/neural/backends/cuda/network_cudnn.cc' - cuda_files += 'src/neural/backends/cuda/network_cuda.cc' # To support newer nets. 
- add_project_arguments('-DUSE_CUDNN', language : 'cpp') - elif get_option('plain_cuda') - cuda_files += 'src/neural/backends/cuda/network_cuda.cc' - endif + nvcc_ok = false + if get_option('nvcc') and nvcc.found() foreach d : get_option('cudnn_include') if run_command('scripts/checkdir.py', d, check : false).returncode() == 0 includes += include_directories(d, is_system: true) endif endforeach - includes += include_directories('src/neural/backends/cuda/') - - cuda_arguments = ['-c', '@INPUT@', '-o', '@OUTPUT@', + nvcc_arguments = ['-c', '@INPUT@', '-o', '@OUTPUT@', '-I', meson.current_source_dir() + '/src'] nvcc_help = run_command(nvcc, '-h', check : false).stdout() if host_machine.system() == 'windows' if get_option('b_vscrt') == 'mt' - cuda_arguments += ['-Xcompiler', '-MT'] + nvcc_arguments += ['-Xcompiler', '-MT'] elif get_option('b_vscrt') == 'mtd' - cuda_arguments += ['-Xcompiler', '-MTd'] + nvcc_arguments += ['-Xcompiler', '-MTd'] elif get_option('b_vscrt') == 'mdd' or (get_option('b_vscrt') == 'from_buildtype' and get_option('buildtype') == 'debug') - cuda_arguments += ['-Xcompiler', '-MDd'] + nvcc_arguments += ['-Xcompiler', '-MDd'] elif get_option('b_vscrt') != 'none' - cuda_arguments += ['-Xcompiler', '-MD'] + nvcc_arguments += ['-Xcompiler', '-MD'] endif else - cuda_arguments += ['--std=c++14', '-Xcompiler', '-fPIC'] + nvcc_arguments += ['--std=c++17', '-Xcompiler', '-fPIC'] + if get_option('debug') + nvcc_arguments += ['-g'] + endif endif if get_option('nvcc_ccbin') != '' - cuda_arguments += ['-ccbin=' + get_option('nvcc_ccbin')] + nvcc_arguments += ['-ccbin=' + get_option('nvcc_ccbin')] endif cuda_cc = get_option('cc_cuda') # Unfortunately option cuda_cc is reserved. nvcc_extra_args = [] @@ -543,26 +516,68 @@ if get_option('build_backends') endif endif foreach x : get_option('cudnn_include') - cuda_arguments += ['-I', x] + nvcc_arguments += ['-I', x] endforeach if host_machine.system() == 'windows' outputname = '@BASENAME@.obj' else outputname = '@BASENAME@.o' endif + nvcc_ok = true + + max_cuda = 0 + nvcc_dryrun = run_command(nvcc, '--dryrun', nvcc_extra_args, 'foo.cu', check : false).stderr() + foreach x : nvcc_dryrun.split() + if x.contains('-D__CUDA_ARCH__=') + arch = x.substring(16).to_int() + if arch > max_cuda + max_cuda = arch + endif + endif + endforeach + endif + if (get_option('cudnn') or get_option('plain_cuda')) and cu_dart.found() and cu_blas.found() and nvcc_ok + deps += [cu_blas, cu_dart] + cuda_files = ['src/neural/backends/cuda/layers.cc'] + if get_option('cudnn') and cu_dnn.found() + deps += cu_dnn + cuda_files += 'src/neural/backends/cuda/network_cudnn.cc' + cuda_files += 'src/neural/backends/cuda/network_cuda.cc' # To support newer nets. 
+ add_project_arguments('-DUSE_CUDNN', language : 'cpp') + elif get_option('plain_cuda') + cuda_files += 'src/neural/backends/cuda/network_cuda.cc' + endif + includes += include_directories('src/neural/backends/cuda/') files += cuda_files + + if get_option('cutlass') and max_cuda >= 800 + add_project_arguments('-DUSE_CUTLASS', language : 'cpp') + nvcc_arguments += ['-DUSE_CUTLASS'] + if get_option('cutlass_include') != '' + nvcc_arguments += ['-I', get_option('cutlass_include')] + else + nvcc_arguments += ['-I', subproject('cutlass').get_variable('include_directory')] + endif + nvcc_arguments += ['-isystem=@CURRENT_SOURCE_DIR@/third_party'] + files += custom_target('cuda cutlass code', + input : 'src/neural/backends/cuda/cutlass_kernels.cu', + output : outputname, + command : [nvcc, nvcc_extra_args, nvcc_arguments] + ) + endif + files += custom_target('cuda fp32 code', input : 'src/neural/backends/cuda/common_kernels.cu', output : outputname, depend_files: 'src/neural/backends/cuda/winograd_helper.inc', - command : [nvcc, nvcc_extra_args, cuda_arguments] + command : [nvcc, nvcc_extra_args, nvcc_arguments] ) files += custom_target('cuda fp16 code', input : 'src/neural/backends/cuda/fp16_kernels.cu', output : outputname, depend_files: 'src/neural/backends/cuda/winograd_helper.inc', - command : [nvcc, nvcc_extra_args, cuda_arguments] + command : [nvcc, nvcc_extra_args, nvcc_arguments] ) has_backends = true endif @@ -602,24 +617,47 @@ if get_option('build_backends') ## ~~~~~~~~~~ ## ONNX ## ~~~~~~~~~~ - if get_option('onnx_libdir') != '' and get_option('onnx_include') != '' - deps += cc.find_library('onnxruntime', dirs: get_option('onnx_libdir'), - required: true) - includes += include_directories(get_option('onnx_include'), is_system: true) + onnxruntime = cc.find_library('onnxruntime', dirs: get_option('onnx_libdir'), + required: false) + if get_option('onnx') and onnxruntime.found() + deps += onnxruntime + onnx_inc_dir = get_option('onnx_include') + if fs.is_dir(onnx_inc_dir + '/onnxruntime/core/session') + # Top level of source dir. + onnx_inc_dir += '/onnxruntime/core/session' + elif fs.is_dir(onnx_inc_dir + '/onnxruntime') + onnx_inc_dir += '/onnxruntime' + endif + includes += include_directories(onnx_inc_dir, is_system: true) cc.has_header('onnxruntime_cxx_api.h', required: true, - args: '-I' + get_option('onnx_include')) - if not cc.has_header('cpu_provider_factory.h', - args: '-I' + get_option('onnx_include')) - cc.has_header('../providers/cpu/cpu_provider_factory.h', required: true, - args: '-I' + get_option('onnx_include')) - includes += include_directories(get_option('onnx_include') + '/../providers/cpu', - is_system: true) + include_directories: includes) + files += 'src/neural/backends/onnx/network_onnx.cc' + onnx_conf = configuration_data() + if cc.has_header('dml_provider_factory.h', required: false, + include_directories: includes) + # The header is not actually needed, used here to detect DML onnxruntime. 
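Note on the onnx_include handling a few lines above: the option may point directly at the directory containing onnxruntime_cxx_api.h, at an installed prefix with an onnxruntime/ subdirectory, or at the top level of an onnxruntime source checkout. A rough Python sketch of that resolution order (illustration only; the helper name resolve_onnx_include is hypothetical, and the build does this with fs.is_dir in Meson):

import os

def resolve_onnx_include(onnx_include: str) -> str:
    # Prefer the source-tree layout, then the installed-prefix layout,
    # and otherwise trust the directory as given.
    for sub in ("onnxruntime/core/session", "onnxruntime"):
        candidate = os.path.join(onnx_include, sub)
        if os.path.isdir(candidate):
            return candidate
    return onnx_include

# e.g. resolve_onnx_include("/usr/include/onnxruntime") or an onnxruntime source checkout path.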
+ onnx_conf.set('USE_DML', true) endif - files += 'src/neural/backends/network_onnx.cc' if cc.find_library('onnxruntime_providers_rocm', dirs: get_option('onnx_libdir'), required: false).found() - add_project_arguments('-DUSE_ROCM', language : 'cpp') + onnx_conf.set('USE_ROCM', true) + endif + if cc.find_library('onnxruntime_providers_migraphx', + dirs: get_option('onnx_libdir'), required: false).found() + onnx_conf.set('USE_MIGRAPHX', true) endif + if cu_dart.found() and nvcc_ok + onnx_conf.set('USE_ONNX_CUDART', true) + deps += cu_dart + files += custom_target('cuda onnx code', + input : 'src/neural/backends/onnx/onnx_kernels.cu', + output : outputname, + command : [nvcc, nvcc_extra_args, nvcc_arguments] + ) + else + warning('No CUDA support available. Using compatibility implementation for onnx-trt and onnx-cuda.') + endif + configure_file(output : 'onnx_conf.h', configuration : onnx_conf) has_backends = true endif @@ -632,7 +670,7 @@ if get_option('build_backends') modules : ['Foundation', 'Metal', 'MetalPerformanceShaders', 'MetalPerformanceShadersGraph'], required: get_option('metal')) - if (metal_frameworks.found() and add_languages('objc', 'objcpp')) + if metal_frameworks.found() and add_languages('objc', 'objcpp', native: false) deps += metal_frameworks files += [ @@ -644,6 +682,13 @@ if get_option('build_backends') has_backends = true add_project_arguments('-fobjc-arc', language : 'objc') add_project_arguments('-fobjc-arc', language : 'objcpp') + + # Minimum MacOS version = 12.6.1 + macos_min_version = '12.6' + add_project_arguments( + '-mmacosx-version-min=' + macos_min_version, + language: ['c', 'cpp', 'objc', 'objcpp'] + ) endif ## ~~~~~~~~ @@ -682,15 +727,53 @@ if get_option('build_backends') deps += cc.find_library('mkl_core', required: true) deps += cc.find_library('OpenCL', required: true) elif get_option('sycl') == 'amd' - deps += cc.find_library('hipblas', required: true) - deps += cc.find_library('amdhip64', required: true) + hip_libdirs = get_option('hip_libdirs') + hip_args = [] + foreach hip_include : get_option('hip_include') + if run_command('scripts/checkdir.py', hip_include, check : false).returncode() == 0 + includes += include_directories(hip_include, is_system: true) + hip_args += '-I' + hip_include + endif + endforeach + deps += cc.find_library('hipblas', dirs: hip_libdirs, required: true) + cc.has_header('hipblas/hipblas.h', required: true, args: hip_args) + deps += cc.find_library('amdhip64', dirs: hip_libdirs, required: true) + cc.has_header('hip/hip_runtime.h', required: true, args: hip_args) add_project_arguments('-DUSE_HIPBLAS=ON', language : 'cpp') add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp') - if get_option('amd_gfx') == '' - error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. 90a, 1100 or similar)') + amd_gfx = get_option('amd_gfx') + if amd_gfx == '' + amd_gfx = [] + agent_enum = find_program('rocm_agent_enumerator', '/opt/rocm/bin/rocm_agent_enumerator', + required: false) + if not agent_enum.found() + warning( '\'rocm_agent_enumerator\' not found. AMD GPU detection doesn\'t work. 
You can install rocminfo or set -Damd_gfx.') + elif meson.version().version_compare('<1.2.0') + warning( 'Automatic AMD GPU detection requires Meson 1.2.0') + else + agents = run_command(agent_enum, check : false).stdout() + agent_list = agents.splitlines() + foreach agent : agent_list + if agent.startswith('gfx') + amd_gfx += 'amd_gpu_' + agent + else + error( '\'' + agent_enum.full_path() + '\' unexpected output: ' + agent) + endif + endforeach + if amd_gfx.length() == 0 + warning( '\'' + agent_enum.full_path() + '\' failed to detect any AMD GPUs in the system.') + else + message( 'Detected AMD GPU cores: ' + ','.join(amd_gfx)) + endif + endif + else + amd_gfx = ['amd_gpu_' + amd_gfx] + endif + if amd_gfx.length() == 0 + error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. gfx90a, gfx1100 or similar)') endif - add_project_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp') - add_project_link_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp') + add_project_arguments('-fsycl-targets=' + ','.join(amd_gfx), language : 'cpp') + add_project_link_arguments('-fsycl-targets=' + ','.join(amd_gfx), language : 'cpp') else deps += cc.find_library('cublas', required: true) deps += cc.find_library('cudart', required: true) @@ -707,6 +790,7 @@ if get_option('build_backends') # For sycl under windows we need to link using icx to generate the device code. # This script edits build.ninja for this and for an icx dependency issue. meson.add_postconf_script('scripts/sycl_build_hack.py') + add_project_link_arguments('-rtlib=compiler-rt', language : 'cpp') endif endif @@ -737,15 +821,53 @@ endif deps += dependency('zlib', fallback: ['zlib', 'zlib_dep']) endif + trace_lib = get_option('trace_library') + trace_config = configuration_data() + + common_files += 'src/utils/trace.cc' + ## ~~~~~~~~ + ## perfetto + ## ~~~~~~~~ + if trace_lib == 'perfetto' + perfetto_dep = dependency('perfetto', required: true, + fallback: ['perfetto', 'dep_perfetto']) + deps += perfetto_dep + trace_config.set('USE_PERFETTO_TRACE', 1) + endif + + ## ~~~~ + ## nvtx + ## ~~~~ + if trace_lib == 'nvtx' + nvtx_includes = get_option('cudnn_include') + nvtx_header_found = false + foreach d : nvtx_includes + if run_command('scripts/checkdir.py', d, check : false).returncode() == 0 + if cc.has_header('nvtx3/nvtx3.hpp', args: '-I' + d) + includes += include_directories(d) + nvtx_header_found = true + break + endif + endif + endforeach + if not nvtx_header_found + error('nvtx3/nvtx3.hpp header not found in cudnn_include paths') + endif + # This could support other tracing apis like systemtap. 
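Note, returning to the SYCL/AMD changes earlier in this hunk: when -Damd_gfx is not given, the build now shells out to rocm_agent_enumerator (shipped with rocminfo) and turns every reported gfx* agent into an amd_gpu_gfx* entry of the -fsycl-targets= flag. A rough standalone Python equivalent (the function name detect_sycl_amd_targets is illustrative; the /opt/rocm fallback path mirrors the Meson code):

import shutil
import subprocess

def detect_sycl_amd_targets():
    tool = shutil.which("rocm_agent_enumerator") or "/opt/rocm/bin/rocm_agent_enumerator"
    out = subprocess.run([tool], capture_output=True, text=True).stdout
    gfx = ["amd_gpu_" + agent for agent in out.split() if agent.startswith("gfx")]
    # An empty list means no AMD GPU was detected; the build then errors out
    # and asks for an explicit -Damd_gfx value.
    return "-fsycl-targets=" + ",".join(gfx) if gfx else None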
+ trace_config.set('USE_NVTX_TRACE', 1) + endif + configure_file(output : 'trace_config.h', + configuration : trace_config) + ## ~~~~~~~~ ## Profiler ## ~~~~~~~~ if get_option('buildtype') != 'release' - deps += cc.find_library('libprofiler', + deps += cc.find_library('profiler', dirs: ['/usr/local/lib'], required: false) endif - deps += cc.find_library('libatomic', required: false) + deps += cc.find_library('atomic', required: false) ############################################################################# ## Main Executable @@ -759,6 +881,10 @@ if not get_option('f16c') add_project_arguments('-DNO_F16C', language : 'cpp') endif +if cc.has_type('_Float16') + add_project_arguments('-DHAS_FLOAT16', language : 'cpp') +endif + if not get_option('pext') add_project_arguments('-DNO_PEXT', language : 'cpp') endif @@ -767,10 +893,19 @@ if get_option('embed') add_project_arguments('-DEMBED', language : 'cpp') endif +default_search_h = configuration_data() if get_option('default_search') != '' - add_project_arguments('-DDEFAULT_SEARCH=' + - get_option('default_search'), language : 'cpp') + default_search_h.set_quoted('DEFAULT_SEARCH', get_option('default_search')) +endif +configure_file(output : 'default_search.h', + configuration : default_search_h) + +default_backend_h = configuration_data() +if get_option('default_backend') != '' + default_backend_h.set_quoted('DEFAULT_BACKEND', get_option('default_backend')) endif +configure_file(output : 'default_backend.h', + configuration : default_backend_h) if get_option('lc0') files += common_files @@ -783,10 +918,10 @@ endif ############################################################################# if get_option('rescorer') - deps += subproject('gaviotatb').get_variable('gaviotatb_dep') + gaviota_dep = subproject('gaviotatb').get_variable('gaviotatb_dep') executable('rescorer', 'src/rescorer_main.cc', [common_files, 'src/trainingdata/rescorer.cc'], - include_directories: includes, dependencies: deps, install: true) + include_directories: includes, dependencies: [deps, gaviota_dep], install: true) endif ############################################################################# @@ -796,13 +931,18 @@ endif if get_option('gtest') gtest = dependency('gtest', fallback: ['gtest', 'gtest_dep']) gmock = dependency('gmock', fallback: ['gtest', 'gmock_dep']) - lc0_lib = library('lc0_lib', files, include_directories: includes, dependencies: deps) + lc0_lib = library('lc0_lib', common_files, include_directories: includes, dependencies: deps) test('ChessBoard', executable('chessboard_test', 'src/chess/board_test.cc', include_directories: includes, link_with: lc0_lib, dependencies: gtest ), args: '--gtest_output=xml:chessboard.xml', timeout: 90) + test('FP16', + executable('fp16_test', 'src/utils/fp16_utils_test.cc', + include_directories: includes, link_with: lc0_lib, dependencies: gtest + ), args: '--gtest_output=xml:fp16.xml', timeout: 90) + test('HashCat', executable('hashcat_test', 'src/utils/hashcat_test.cc', include_directories: includes, link_with: lc0_lib, dependencies: gtest @@ -830,7 +970,8 @@ if get_option('gtest') ), args: '--gtest_output=xml:encoder.xml', timeout: 90) test('EngineTest', - executable('engine_test', 'src/engine_test.cc', pb_files, + executable('engine_test', 'src/engine_test.cc', 'src/engine.cc', + 'src/neural/memcache.cc', pb_files, include_directories: includes, link_with: lc0_lib, dependencies: [gtest, gmock]), args: '--gtest_output=xml:engine_test.xml', timeout: 90) endif diff --git a/meson_options.txt b/meson_options.txt 
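Note, before the meson_options.txt changes: several compile-time settings that used to be passed as bare -D arguments (the default search, the default backend, the ONNX provider selection, the trace backend) are now emitted into small generated headers (onnx_conf.h, trace_config.h, default_search.h, default_backend.h) via configuration_data() plus configure_file(). A rough Python emulation of what such a template-less configure_file call produces (illustration only; write_config_header is a hypothetical helper and Meson's exact header formatting may differ):

def write_config_header(path, entries):
    lines = ["/* Illustration of a Meson-generated configuration header. */"]
    for name, value in entries.items():
        if value is True:
            lines.append(f"#define {name}")            # conf.set(name, true)
        elif value is False:
            lines.append(f"#undef {name}")
        elif isinstance(value, str):
            lines.append(f'#define {name} "{value}"')  # conf.set_quoted(name, value)
        else:
            lines.append(f"#define {name} {value}")    # conf.set(name, 1)
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")

# e.g. configuring with -Ddefault_backend=onnx-trt roughly corresponds to:
# write_config_header("default_backend.h", {"DEFAULT_BACKEND": "onnx-trt"})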
index 6f941d0c42..ec5c53917a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,6 +43,11 @@ option('cudnn_include', value: ['/opt/cuda/include/', '/usr/local/cuda/include/', '/usr/lib/cuda/include/'], description: 'Paths to cudnn include directory') +option('cutlass_include', + type: 'string', + value: '', + description: 'Paths to cutlass include directory') + option('build_backends', type: 'boolean', value: true, @@ -68,6 +73,11 @@ option('native_cuda', value: true, description: 'build cuda code for native arch only (if supported)') +option('native_arch', + type: 'boolean', + value: true, + description: 'build code for native arch only') + option('cudnn', type: 'boolean', value: false, @@ -78,6 +88,11 @@ option('plain_cuda', value: true, description: 'Enable CUDA backend') +option('cutlass', + type: 'boolean', + value: true, + description: 'Enable cutlass lib for cuda backend. Only supports Ampere+ right now') + option('opencl', type: 'boolean', value: false, @@ -181,16 +196,21 @@ option('cc_cuda', option('amd_gfx', type: 'string', value: '', - description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=90a for gfx90a') + description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=gfx90a for gfx90a') + +option('onnx', + type: 'boolean', + value: true, + description: 'Enable ONNX backends') option('onnx_libdir', type: 'string', - value: '', + value: '/usr/lib/', description: 'Paths to ONNX runtime libraries') option('onnx_include', type: 'string', - value: '', + value: '/usr/include/onnxruntime/', description: 'Paths to ONNX runtime includes') option('xla', @@ -204,6 +224,22 @@ option('sycl', value: 'off', description: 'Enable SYCL backend') +option('hip_libdirs', + type: 'array', + value: ['/opt/rocm/lib'], + description: 'Paths to AMD HIP libraries') + +option('hip_include', + type: 'array', + value: ['/opt/rocm/include'], + description: 'Path to AMD HIP includes') + +option('trace_library', + type: 'combo', + choices: ['off', 'perfetto', 'nvtx'], + value: 'off', + description: 'Enable trace library support') + option('lc0', type: 'boolean', value: true, @@ -219,7 +255,17 @@ option('default_search', value: '', description: 'Default search algorithm to use, e.g. -Ddefault_search=classic') +option('default_backend', + type: 'string', + value: '', + description: 'Default backend to use, e.g. -Ddefault_backend=onnx-trt') + option('dag_classic', type: 'boolean', value: true, description: 'Enable dag-classic search algorithm') + +option('nvcc', + type: 'boolean', + value: true, + description: 'Use nvcc: required for cuda, optional for onnx') diff --git a/src/neural/xla/hlo.proto b/proto/hlo.proto similarity index 100% rename from src/neural/xla/hlo.proto rename to proto/hlo.proto diff --git a/proto/net.proto b/proto/net.proto new file mode 100644 index 0000000000..961a73992a --- /dev/null +++ b/proto/net.proto @@ -0,0 +1,411 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2018 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ +syntax = "proto2"; + +package pblczero; + +message EngineVersion { + optional uint32 major = 1; + optional uint32 minor = 2; + optional uint32 patch = 3; +} + +message Weights { + message Layer { + optional float min_val = 1; + optional float max_val = 2; + optional bytes params = 3; + enum Encoding { + UNKNOWN_ENCODING = 0; + LINEAR16 = 1; + FLOAT16 = 2; + BFLOAT16 = 3; + FLOAT32 = 4; + } + optional Encoding encoding = 4; + repeated uint32 dims = 5; + } + + message ConvBlock { + optional Layer weights = 1; + optional Layer biases = 2; + optional Layer bn_means = 3; + optional Layer bn_stddivs = 4; + optional Layer bn_gammas = 5; + optional Layer bn_betas = 6; + } + + message SEunit { + // Squeeze-excitation unit (https://arxiv.org/abs/1709.01507) + // weights and biases of the two fully connected layers. + optional Layer w1 = 1; + optional Layer b1 = 2; + optional Layer w2 = 3; + optional Layer b2 = 4; + } + + message Residual { + optional ConvBlock conv1 = 1; + optional ConvBlock conv2 = 2; + optional SEunit se = 3; + } + + message Smolgen { + // For NETWORK_ATTENTIONBODY_WITH_HEADFORMAT. + optional Layer compress = 1; + optional Layer dense1_w = 2; + optional Layer dense1_b = 3; + optional Layer ln1_gammas = 4; + optional Layer ln1_betas = 5; + optional Layer dense2_w = 6; + optional Layer dense2_b = 7; + optional Layer ln2_gammas = 8; + optional Layer ln2_betas = 9; + } + + message MHA { + optional Layer q_w = 1; + optional Layer q_b = 2; + optional Layer k_w = 3; + optional Layer k_b = 4; + optional Layer v_w = 5; + optional Layer v_b = 6; + optional Layer dense_w = 7; + optional Layer dense_b = 8; + optional Smolgen smolgen = 9; + + optional Layer rpe_q = 10; + optional Layer rpe_k = 11; + optional Layer rpe_v = 12; + + // reserved 13 - 22 for int8 quantization + } + + message FFN { + optional Layer dense1_w = 1; + optional Layer dense1_b = 2; + optional Layer dense2_w = 3; + optional Layer dense2_b = 4; + // reserved 5 - 10 for int8 quantization + } + + message EncoderLayer { + optional MHA mha = 1; + optional Layer ln1_gammas = 2; + optional Layer ln1_betas = 3; + optional FFN ffn = 4; + optional Layer ln2_gammas = 5; + optional Layer ln2_betas = 6; + } + + message PolicyHead { + optional Layer ip_pol_w = 1; + optional Layer ip_pol_b = 2; + optional Layer ip2_pol_w = 3; // "wq" in policy attention + optional Layer ip2_pol_b = 4; + optional Layer ip3_pol_w = 5; // "wk" in policy attention + optional Layer ip3_pol_b = 6; + optional Layer ip4_pol_w = 7; // "ppo" in policy attention + + // Optional policy encoders for policy head. + repeated EncoderLayer pol_encoder = 8; + optional uint32 pol_headcount = 9; + + // Convolutions for legacy policy head. 
+ optional ConvBlock policy1 = 10; + optional ConvBlock policy = 11; + } + + message ValueHead { + optional Layer ip_val_w = 1; // "embedding" for attention body value + optional Layer ip_val_b = 2; + optional Layer ip1_val_w = 3; + optional Layer ip1_val_b = 4; + optional Layer ip2_val_w = 5; + optional Layer ip2_val_b = 6; + optional Layer ip_val_err_w = 7; + optional Layer ip_val_err_b = 8; + optional Layer ip_val_cat_w = 9; + optional Layer ip_val_cat_b = 10; + + // Legacy value head support. + optional ConvBlock value = 11; + } + + message PolicyHeadMap { + required string key = 1; // name of the policy head + required PolicyHead value = 2; + } + + message PolicyHeads { + optional Layer ip_pol_w = 1; // "embedding" in policy attention + optional Layer ip_pol_b = 2; + optional PolicyHead vanilla = 3; + optional PolicyHead optimistic_st = 4; + optional PolicyHead soft = 5; + optional PolicyHead opponent = 6; + // map policy_head_map = 7; + repeated PolicyHeadMap policy_head_map = 7; + } + + message ValueHeadMap { + required string key = 1; // name of the value head + required ValueHead value = 2; + } + + message ValueHeads { + optional ValueHead winner = 1; + optional ValueHead q = 2; + optional ValueHead st = 3; + // map value_head_map = 4; + repeated ValueHeadMap value_head_map = 4; + } + + // Input convnet. + optional ConvBlock input = 1; + + // Residual tower. + repeated Residual residual = 2; + + // Embedding layer for attention body encoders + // (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + + optional Layer ip_emb_preproc_w = 37; + optional Layer ip_emb_preproc_b = 38; + + optional Layer ip_emb_w = 25; + optional Layer ip_emb_b = 26; + + optional Layer ip_emb_ln_gammas = 39; + optional Layer ip_emb_ln_betas = 40; + + + + // Input gating (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + optional Layer ip_mult_gate = 33; + optional Layer ip_add_gate = 34; + + optional FFN ip_emb_ffn = 41; + optional Layer ip_emb_ffn_ln_gammas = 42; + optional Layer ip_emb_ffn_ln_betas = 43; + + // Encoder stack (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + repeated EncoderLayer encoder = 27; + optional uint32 headcount = 28; + + // Policy encoder stack + // The ffn activation up to and including NETWORK_SE_WITH_HEADFORMAT is SELU, + // otherwise it follows the ffn activation setting. + repeated EncoderLayer pol_encoder = 21; + optional uint32 pol_headcount = 24; + + // Policy head + // Extra convolution for AZ-style policy head + optional ConvBlock policy1 = 11; + optional ConvBlock policy = 3; + optional Layer ip_pol_w = 4; // "embedding" in policy attention + optional Layer ip_pol_b = 5; + // For policy attention, up to and including NETWORK_SE_WITH_HEADFORMAT the + // "embedding" activation is SELU, otherwise it is the default activation. 
+ optional Layer ip2_pol_w = 17; // "wq" in policy attention + optional Layer ip2_pol_b = 18; + optional Layer ip3_pol_w = 19; // "wk" in policy attention + optional Layer ip3_pol_b = 20; + optional Layer ip4_pol_w = 22; // "ppo" in policy attention + + // Value head + optional ConvBlock value = 6; + optional Layer ip_val_w = 29; // "embedding" for attention body value + optional Layer ip_val_b = 30; + optional Layer ip1_val_w = 7; + optional Layer ip1_val_b = 8; + optional Layer ip2_val_w = 9; + optional Layer ip2_val_b = 10; + + optional ValueHeads value_heads = 44; + optional PolicyHeads policy_heads = 45; + + // Moves left head + optional ConvBlock moves_left = 12; + optional Layer ip_mov_w = 31; // "embedding" for attention body moves left + optional Layer ip_mov_b = 32; + optional Layer ip1_mov_w = 13; + optional Layer ip1_mov_b = 14; + optional Layer ip2_mov_w = 15; + optional Layer ip2_mov_b = 16; + + // Global smolgen weights (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + optional Layer smolgen_w = 35; + optional Layer smolgen_b = 36; +} + +message TrainingParams { + optional uint32 training_steps = 1; + optional float learning_rate = 2; + optional float mse_loss = 3; + optional float policy_loss = 4; + optional float accuracy = 5; + optional string lc0_params = 6; +} + +message NetworkFormat { + // Format to encode the input planes with. Used by position encoder. + enum InputFormat { + INPUT_UNKNOWN = 0; + INPUT_CLASSICAL_112_PLANE = 1; + INPUT_112_WITH_CASTLING_PLANE = 2; + INPUT_112_WITH_CANONICALIZATION = 3; + INPUT_112_WITH_CANONICALIZATION_HECTOPLIES = 4; + INPUT_112_WITH_CANONICALIZATION_HECTOPLIES_ARMAGEDDON = 132; + INPUT_112_WITH_CANONICALIZATION_V2 = 5; + INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON = 133; + } + optional InputFormat input = 1; + + // Output format of the NN. Used by search code to interpret results. + enum OutputFormat { + OUTPUT_UNKNOWN = 0; + OUTPUT_CLASSICAL = 1; + OUTPUT_WDL = 2; + } + optional OutputFormat output = 2; + + // Network architecture. Used by backends to build the network. + enum NetworkStructure { + // Networks without PolicyFormat or ValueFormat specified + NETWORK_UNKNOWN = 0; + NETWORK_CLASSICAL = 1; + NETWORK_SE = 2; + // Networks with PolicyFormat and ValueFormat specified + NETWORK_CLASSICAL_WITH_HEADFORMAT = 3; + NETWORK_SE_WITH_HEADFORMAT = 4; + NETWORK_ONNX = 5; + NETWORK_ATTENTIONBODY_WITH_HEADFORMAT = 6; + NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT = 7; + NETWORK_AB_LEGACY_WITH_MULTIHEADFORMAT = 134; + } + optional NetworkStructure network = 3; + + // Policy head architecture + enum PolicyFormat { + POLICY_UNKNOWN = 0; + POLICY_CLASSICAL = 1; + POLICY_CONVOLUTION = 2; + POLICY_ATTENTION = 3; + } + optional PolicyFormat policy = 4; + + // Value head architecture + enum ValueFormat { + VALUE_UNKNOWN = 0; + VALUE_CLASSICAL = 1; + VALUE_WDL = 2; + VALUE_PARAM = 3; + } + optional ValueFormat value = 5; + + // Moves left head architecture + enum MovesLeftFormat { + MOVES_LEFT_NONE = 0; + MOVES_LEFT_V1 = 1; + } + optional MovesLeftFormat moves_left = 6; + + enum ActivationFunction { + ACTIVATION_DEFAULT = 0; + ACTIVATION_MISH = 1; + ACTIVATION_RELU = 2; + ACTIVATION_NONE = 3; + ACTIVATION_TANH = 4; + ACTIVATION_SIGMOID = 5; + ACTIVATION_SELU = 6; + ACTIVATION_SWISH = 7; + ACTIVATION_RELU_2 = 8; + ACTIVATION_SOFTMAX = 9; + } + + // Activation used everywhere except head outputs or otherwise specified. 
+ enum DefaultActivation { + DEFAULT_ACTIVATION_RELU = 0; + DEFAULT_ACTIVATION_MISH = 1; + } + optional DefaultActivation default_activation = 7; + + optional ActivationFunction smolgen_activation = 8; + optional ActivationFunction ffn_activation = 9; + + enum InputEmbeddingFormat { + INPUT_EMBEDDING_NONE = 0; + INPUT_EMBEDDING_PE_MAP = 1; + INPUT_EMBEDDING_PE_DENSE = 2; + } + optional InputEmbeddingFormat input_embedding = 10; +} + +message Format { + enum Encoding { + UNKNOWN = 0; + LINEAR16 = 1; + } + // Any encoding specified in a Layer overides this. + optional Encoding weights_encoding = 1; + // If network_format is missing, it's assumed to have + // INPUT_CLASSICAL_112_PLANE / OUTPUT_CLASSICAL / NETWORK_CLASSICAL format. + optional NetworkFormat network_format = 2; +} + +message OnnxModel { + enum DataType { + UNKNOWN_DATATYPE = 0; + FLOAT = 1; + FLOAT16 = 10; + BFLOAT16 = 16; + } + + // Serialized OnnxProto model. + optional bytes model = 1; + optional DataType data_type = 2; + // Name of the input tensor to populate. + optional string input_planes = 3; + // Names of the output tensors to get results from. + // If some feature is not present, corresponding values are not set. + optional string output_value = 4; + optional string output_wdl = 5; + optional string output_policy = 6; + optional string output_mlh = 7; +} + +message Net { + optional fixed32 magic = 1; + optional string license = 2; + optional EngineVersion min_version = 3; + optional Format format = 4; + optional TrainingParams training_params = 5; + // Either weights or onnx_model is set, but not both. + optional Weights weights = 10; + optional OnnxModel onnx_model = 11; +} diff --git a/src/neural/onnx/onnx.proto b/proto/onnx.proto similarity index 100% rename from src/neural/onnx/onnx.proto rename to proto/onnx.proto diff --git a/scripts/appveyor_win_build.cmd b/scripts/appveyor_win_build.cmd index 43ab5f211a..00e739d567 100644 --- a/scripts/appveyor_win_build.cmd +++ b/scripts/appveyor_win_build.cmd @@ -1,5 +1,5 @@ SET PGO=false -IF %APPVEYOR_REPO_TAG%==true IF %DX%==false IF %ONNX_DML%==false SET PGO=true +IF %APPVEYOR_REPO_TAG%==true IF %DX%==false IF %ONNX%==false SET PGO=true IF %PGO%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" IF EXIST build\lc0.pdb del build\lc0.pdb IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGInstrument /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" @@ -19,3 +19,12 @@ IF %PGO%==true ( ) cd .. IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /p:DebugInformationFormat=ProgramDatabase /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" +IF %NAME%==onnx ( + ren build\lc0.exe lc0-trt.exe + meson configure build -Ddefault_backend= -Dcudnn_libdirs= -Dgtest=%GTEST% + # This is needed as a separate step. 
+ msbuild "C:\projects\lc0\build\lc0.sln" /target:REGEN + IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /p:DebugInformationFormat=ProgramDatabase /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" + IF %PGO%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" + ren build\lc0.exe lc0-dml.exe +) diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd index eb36adf26c..eaf1ba73b7 100644 --- a/scripts/appveyor_win_package.cmd +++ b/scripts/appveyor_win_package.cmd @@ -1,6 +1,6 @@ 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip %APPVEYOR_BUILD_FOLDER%\build\lc0.exe -IF %NAME%==gpu-nvidia-cuda appveyor DownloadFile "https://github.com/LeelaChessZero/lczero-client/releases/latest/download/lc0-training-client.exe" -IF %NAME%==gpu-nvidia-cuda 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-training-client.exe +IF %NAME%==gpu-nvidia-cuda12 appveyor DownloadFile "https://github.com/LeelaChessZero/lczero-client/releases/latest/download/lc0-training-client.exe" +IF %NAME%==gpu-nvidia-cuda12 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-training-client.exe type COPYING |more /P > dist\COPYING 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\COPYING 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip c:\cache\%NET%.pb.gz @@ -17,24 +17,48 @@ IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\ IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.77\build\native\bin\OpenCL.dll IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_101.dll" "%CUDA_PATH%\bin\cublas64_10.dll" "%CUDA_PATH%\bin\cublasLt64_10.dll" IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\cuda\bin\cudnn64_7.dll" -IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" -IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE -IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS -IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE -IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS -IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE -IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS -IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE -IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS -IF %ONNX_DML%==true type dist\README-onnx-dml.txt |more /P > dist\README.txt -IF %ONNX_DML%==true type dist\install-dml.cmd |more /P > dist\install.cmd -IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-DML-LICENSE -IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-DML-ThirdPartyNotices.txt -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll" -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt -IF %ONNX_DML%==true 7z a 
lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-LICENSE -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-ThirdPartyNotices.txt +IF %NAME%==gpu-nvidia-cuda11 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" +IF %NAME%==gpu-nvidia-cuda12 ( + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_12.dll" "%CUDA_PATH%\bin\cublas64_12.dll" "%CUDA_PATH%\bin\cublasLt64_12.dll" + type dist\install-cuda_12_9.cmd |more /P > dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip .\dist\install.cmd +) +IF %NAME%==cpu-dnnl ( + copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE + copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS +) +IF %NAME%==onednn ( + copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE + copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS +) +IF %ONNX%==true ( + copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-LICENSE + copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-ThirdPartyNotices.txt + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll" + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-LICENSE + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-ThirdPartyNotices.txt + copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip + ren lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip %APPVEYOR_BUILD_FOLDER%\build\lc0-dml.exe + 7z rn lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip lc0-dml.exe lc0.exe + type dist\README-onnx-dml.txt |more /P > dist\README.txt + type dist\install-dml.cmd |more /P > dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip .\dist\README.txt + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip .\dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip %APPVEYOR_BUILD_FOLDER%\build\lc0-trt.exe + 7z rn lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip lc0-trt.exe lc0.exe + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime_providers_shared.dll" + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME_TWO%\lib\onnxruntime_providers_cuda.dll" + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME_TWO%\lib\onnxruntime_providers_tensorrt.dll" + type dist\README-onnx-trt.txt |more /P > dist\README.txt + type dist\install-trt.cmd |more /P > dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip .\dist\README.txt + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip .\dist\install.cmd +) IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat IF %OPENCL%==true 7z a 
lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_opencl.bat IF %DX%==true type scripts\check_dx.bat |more /P > dist\check_dx.bat diff --git a/scripts/compile_proto.py b/scripts/compile_proto.py index bbbdfb7342..c6a81996d9 100755 --- a/scripts/compile_proto.py +++ b/scripts/compile_proto.py @@ -29,62 +29,73 @@ import os import re import sys +from typing import Any VARINT_TYPES = { - 'int32': 'std::int32_t', - 'int64': 'std::int64_t', - 'uint32': 'std::uint32_t', - 'uint64': 'std::uint64_t', - 'sint32': 'std::uint32_t', - 'sint64': 'std::uint64_t', - 'bool': 'bool', + "int32": "std::int32_t", + "int64": "std::int64_t", + "uint32": "std::uint32_t", + "uint64": "std::uint64_t", + "sint32": "std::uint32_t", + "sint64": "std::uint64_t", + "bool": "bool", } FIXED64_TYPES = { - 'fixed64': 'std::uint64_t', - 'sfixed64': 'std::int64_t', - 'double': 'double', + "fixed64": "std::uint64_t", + "sfixed64": "std::int64_t", + "double": "double", } FIXED32_TYPES = { - 'fixed32': 'std::uint32_t', - 'sfixed32': 'std::int32_t', - 'float': 'float', + "fixed32": "std::uint32_t", + "sfixed32": "std::int32_t", + "float": "float", } BYTES_TYPES = { - 'string': 'std::string_view', - 'bytes': 'std::string_view', + "string": "std::string_view", + "bytes": "std::string_view", } -ZIGZAG_TYPES = set(['sint32', 'sint64']) -FLOAT_TYPES = set(['float', 'double']) +ZIGZAG_TYPES = set(["sint32", "sint64"]) +FLOAT_TYPES = set(["float", "double"]) TYPES = {**VARINT_TYPES, **FIXED32_TYPES, **FIXED64_TYPES, **BYTES_TYPES} RESERVED_WORDS = [ - 'enum', - 'message', - 'optional', - 'package', - 'repeated', - 'required', - 'reserved', - 'syntax', - 'to', + "enum", + "message", + "optional", + "package", + "repeated", + "required", + "reserved", + "syntax", + "to", ] + list(TYPES.keys()) -GRAMMAR = ([(r'%s\b' % x, x) - for x in RESERVED_WORDS] + [('\\' + x, x) for x in '=;{}.,'] + [ - (r'/\*.*?\*/', None), # /* Comment */ - (r'//.*?$', None), # // Comment - (r'\s+', None), # Whitespace - (r'$', 'EOF'), - (r'"((?:[^"\\]|\\.)*)"', 'string'), - (r'\d+', 'number'), - (r'(\w+)', 'identifier'), - ]) +GRAMMAR = ( + [(r"%s\b" % x, x) for x in RESERVED_WORDS] + + [("\\" + x, x) for x in "=;{}.,[]"] + + [ + (r"/\*.*?\*/", None), # /* Comment */ + (r"//.*?$", None), # // Comment + (r"\s+", None), # Whitespace + (r"$", "EOF"), + (r'"((?:[^"\\]|\\.)*)"', "string"), + ( + r"[-+]?(?:[0-9]*\.[0-9]+(?:[eE][-+]?[0-9]+)?|[0-9]+[eE][-+]?[0-9]+)", + "fnumber", + ), + (r"[-+]?\d+", "number"), + (r"(\w+)", "identifier"), + ] +) + +ALLOWED_ATTRIBUTES = { + "default", +} class Lexer: - def __init__(self, text): self.text = text self.grammar = [(re.compile(x, re.S + re.M), y) for x, y in GRAMMAR] @@ -92,31 +103,31 @@ def __init__(self, text): self.cur_offset = 0 def Pick(self): - '''Picks the last token in queue. Doesn't advance the queue.''' + """Picks the last token in queue. Doesn't advance the queue.""" if self.cur_token is None: self.cur_token = self.NextToken() return self.cur_token def Consume(self, expected_token, value=None, group=0): - '''Gets the token from the queue and advances the queue. + """Gets the token from the queue and advances the queue. If @expected_token if of wrong type, or @value is not equal to regexes @group, throws an error. 
- ''' + """ token, match = self.Pick() if expected_token != token: - self.Error(f'Expected token type [{expected_token}], got [{token}]') + self.Error(f"Expected token type [{expected_token}], got [{token}]") if value is not None and value != match.group(group): - self.Error('Expected value [%s]' % value) + self.Error("Expected value [%s]" % value) self.cur_offset = match.span()[1] self.cur_token = None return match def NextToken(self): - '''Reads the stream and returns the next token. + """Reads the stream and returns the next token. (which is not whitespace or comment) - ''' + """ while True: token, match = self.NextTokenOrWhitespace() if token is None: @@ -125,40 +136,42 @@ def NextToken(self): return token, match def NextTokenOrWhitespace(self): - '''Reads the stream and returns the next token (possibly whitespace).''' + """Reads the stream and returns the next token (possibly whitespace).""" for r, token in self.grammar: m = r.match(self.text, self.cur_offset) if m: return (token, m) - token_snippet = self.text[self.cur_offset:self.cur_offset + 10] - self.Error(f'Unparseable token [{token_snippet}...]') + token_snippet = self.text[self.cur_offset : self.cur_offset + 10] + self.Error(f"Unparseable token [{token_snippet}...]") def Error(self, text): - '''Throws an error with context in the file read.''' - line = self.text[:self.cur_offset].count('\n') + 1 - line_start = self.text.rfind('\n', 0, self.cur_offset) + 1 - line_end = self.text.find('\n', line_start) + """Throws an error with context in the file read.""" + line = self.text[: self.cur_offset].count("\n") + 1 + line_start = self.text.rfind("\n", 0, self.cur_offset) + 1 + line_end = self.text.find("\n", line_start) if line_end == -1: line_end = len(self.text) - sys.stderr.write('%s:\n' % text) - sys.stderr.write(self.text[line_start:line_end] + '\n') - sys.stderr.write(' ' * (self.cur_offset - line_start) + '^^^\n') - raise ValueError("Parse error: %s at line %d column %d." % - (text, line, (self.cur_offset - line_start))) + sys.stderr.write("%s:\n" % text) + sys.stderr.write(self.text[line_start:line_end] + "\n") + sys.stderr.write(" " * (self.cur_offset - line_start) + "^^^\n") + raise ValueError( + "Parse error: %s at line %d column %d." + % (text, line, (self.cur_offset - line_start)) + ) def ReadIdentifierPath(lexer): - '''Reads qualified identifier a.b.d into ['a', 'b', 'd'] list''' + """Reads qualified identifier a.b.d into ['a', 'b', 'd'] list""" path = [] while True: - path.append(lexer.Consume('identifier').group(0)) - if lexer.Pick()[0] != '.': + path.append(lexer.Consume("identifier").group(0)) + if lexer.Pick()[0] != ".": return path - lexer.Consume('.') + lexer.Consume(".") def LookupType(name, stack): - '''Looks up the (possibly qualified) from the innermost scope first.''' + """Looks up the (possibly qualified) from the innermost scope first.""" for y in stack: for x in y: if x.GetName() == name[0]: @@ -166,7 +179,7 @@ def LookupType(name, stack): return x else: return LookupType(name[1:], [x.GetTypes()]) - raise ValueError("Cannot find type: %s." % '.'.join(name)) + raise ValueError("Cannot find type: %s." 
% ".".join(name)) # All *Parser classes have the following semantics: @@ -175,18 +188,17 @@ def LookupType(name, stack): class ProtoTypeParser: - def __init__(self, lexer, object_stack): token, match = lexer.Pick() if token in TYPES: - self.typetype = 'basic' + self.typetype = "basic" self.name = token lexer.Consume(token) - elif token == 'identifier': + elif token == "identifier": self.name = ReadIdentifierPath(lexer) - self.typetype = 'forward' + self.typetype = "forward" else: - lexer.Error('Type expected') + lexer.Error("Type expected") def LookupForwardFieldType(self, object_stack): if self.IsForward(): @@ -195,41 +207,43 @@ def LookupForwardFieldType(self, object_stack): self.name = [typ.GetFullName()] def IsZigzag(self): - if self.typetype == 'basic': + if self.typetype == "basic": return self.name in ZIGZAG_TYPES return False def GetCppType(self): - if self.typetype == 'basic': + if self.typetype == "basic": return TYPES[self.name] else: - return '_'.join(self.name) + return "_".join(self.name) def GetVariableCppType(self): if self.IsBytesType(): - return 'std::string' + return "std::string" else: return self.GetCppType() def IsEnumType(self): - return self.typetype == 'enum' + return self.typetype == "enum" def IsVarintType(self): - return self.typetype == 'enum' or (self.typetype == 'basic' - and self.name in VARINT_TYPES) + return self.typetype == "enum" or ( + self.typetype == "basic" and self.name in VARINT_TYPES + ) def IsFixedType(self): - return self.typetype == 'basic' and (self.name in FIXED64_TYPES - or self.name in FIXED32_TYPES) + return self.typetype == "basic" and ( + self.name in FIXED64_TYPES or self.name in FIXED32_TYPES + ) def IsBytesType(self): - return self.typetype == 'basic' and self.name in BYTES_TYPES + return self.typetype == "basic" and self.name in BYTES_TYPES def IsFloatType(self): - return self.typetype == 'basic' and self.name in FLOAT_TYPES + return self.typetype == "basic" and self.name in FLOAT_TYPES def GetWireType(self): - if self.typetype == 'basic': + if self.typetype == "basic": if self.name in VARINT_TYPES: return 0 if self.name in FIXED64_TYPES: @@ -238,52 +252,84 @@ def GetWireType(self): return 2 if self.name in FIXED32_TYPES: return 5 - raise ValueError('Unknown type %s' % self.name) - elif self.typetype == 'enum': + raise ValueError("Unknown type %s" % self.name) + elif self.typetype == "enum": return 0 - elif self.typetype == 'message': + elif self.typetype == "message": return 2 else: - raise ValueError('Unknown typetype %s' % self.typetype) + raise ValueError("Unknown typetype %s" % self.typetype) def IsMessage(self): - return self.typetype == 'message' + return self.typetype == "message" def IsForward(self): - return self.typetype == 'forward' + return self.typetype == "forward" def IsIntegralType(self): - if self.typetype == 'basic': - if self.name == 'double': + if self.typetype == "basic": + if self.name == "double": return False - if self.name == 'float': + if self.name == "float": return False if self.name in BYTES_TYPES: return False if self.name in TYPES: return True - raise ValueError('Unknown type %s' % self.name) - elif self.typetype == 'enum': + raise ValueError("Unknown type %s" % self.name) + elif self.typetype == "enum": return True - elif self.typetype == 'message': + elif self.typetype == "message": return False else: - raise ValueError('Unknown typetype %s' % self.typetype) + raise ValueError("Unknown typetype %s" % self.typetype) class ProtoFieldParser: - def __init__(self, lexer, object_stack): token, match = 
lexer.Pick() - if token not in ['repeated', 'optional', 'required']: - lexer.Error('repeated, optional or required expected') + if token not in ["repeated", "optional", "required"]: + lexer.Error("repeated, optional or required expected") self.category = token lexer.Consume(token) self.type = ProtoTypeParser(lexer, object_stack) - self.name = lexer.Consume('identifier') - lexer.Consume('=') - self.number = int(lexer.Consume('number').group(0)) - lexer.Consume(';') + self.name = lexer.Consume("identifier") + lexer.Consume("=") + self.number = int(lexer.Consume("number").group(0)) + self.attributes = ProtoFieldParser.ParseAttributes(lexer) + lexer.Consume(";") + + @staticmethod + def ParseAttributes(lexer): + attributes = {} + token, match = lexer.Pick() + if token != "[": + return attributes + lexer.Consume("[") + while True: + name = lexer.Consume("identifier").group(0) + if name not in ALLOWED_ATTRIBUTES: + lexer.Error("Unknown attribute %s" % name) + lexer.Consume("=") + token, match = lexer.Pick() + value = None + if token == "string": + value = lexer.Consume("string").group(0) + elif token == "fnumber": + value = float(lexer.Consume("fnumber").group(0)) + elif token == "number": + value = int(lexer.Consume("number").group(0)) + else: + lexer.Error("Expected string or number as default value") + attributes[name] = value + token, _ = lexer.Pick() + if token == "]": + lexer.Consume("]") + return attributes + elif token == ",": + lexer.Consume(",") + else: + lexer.Error("Expected ']' or ','") def IsType(self): return False @@ -294,96 +340,96 @@ def LookupForwardFieldType(self, object_stack): def GetParser(self): name = self.name.group(0) if self.type.IsMessage(): - if self.category == 'repeated': - return 'add_%s()->MergeFromString(val)' % name + if self.category == "repeated": + return "add_%s()->MergeFromString(val)" % name else: - return 'mutable_%s()->MergeFromString(val)' % name + return "mutable_%s()->MergeFromString(val)" % name cpp_type = self.type.GetCppType() - val = 'NOT IMPLEMENTED!' + val = "NOT IMPLEMENTED!" 
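Note on the UnZigZag(val) branch handled just below: proto2's sint32/sint64 fields (the ZIGZAG_TYPES set above) are zigzag-encoded varints, mapping signed values to unsigned ones so that small negative numbers stay short on the wire. A generic reference sketch of the mapping (not the project's C++ helper, just the standard protobuf convention):

def zigzag_encode(n: int) -> int:
    # -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ... (64-bit arithmetic shift)
    return (n << 1) ^ (n >> 63)

def zigzag_decode(z: int) -> int:
    # Inverse mapping; this is the transformation the generated UnZigZag(val)
    # call is expected to perform.
    return (z >> 1) ^ -(z & 1)

assert [zigzag_encode(v) for v in (-1, 1, -2, 2)] == [1, 2, 3, 4]
assert all(zigzag_decode(zigzag_encode(v)) == v for v in range(-5, 6))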
if self.type.IsVarintType(): - val_val = 'UnZigZag(val)' if self.type.IsZigzag() else 'val' - val = 'static_cast<%s>(%s)' % (cpp_type, val_val) + val_val = "UnZigZag(val)" if self.type.IsZigzag() else "val" + val = "static_cast<%s>(%s)" % (cpp_type, val_val) elif self.type.IsFixedType(): if self.type.IsFloatType(): - val = 'bit_cast<%s>(val)' % cpp_type + val = "bit_cast<%s>(val)" % cpp_type else: - val = 'static_cast<%s>(val)' % cpp_type + val = "static_cast<%s>(val)" % cpp_type elif self.type.IsBytesType(): - val = 'val' + val = "val" - if self.category == 'repeated': - return '%s_.emplace_back(%s)' % (name, val) + if self.category == "repeated": + return "%s_.emplace_back(%s)" % (name, val) else: - return 'set_%s(%s)' % (name, val) + return "set_%s(%s)" % (name, val) def GenerateCaseClause(self, w): - w.Write('case %d: %s; break;' % (self.number, self.GetParser())) + w.Write("case %d: %s; break;" % (self.number, self.GetParser())) def GenerateClear(self, w): - name = self.name.group(0) - if self.category == 'repeated': - w.Write('%s_.clear();' % name) + name = self.name.group(0) + if self.category == "repeated": + w.Write("%s_.clear();" % name) else: - w.Write('has_%s_ = false;' % name) - w.Write('%s_ = {};' % name) + w.Write("has_%s_ = false;" % name) + if "default" in self.attributes: + w.Write("%s_ = %s;" % (name, self.attributes["default"])) + else: + w.Write("%s_ = {};" % name) def GenerateOutput(self, w): fname = { - 0: 'AppendVarInt', - 1: 'AppendInt64', - 2: 'AppendString', - 5: 'AppendInt32' + 0: "AppendVarInt", + 1: "AppendInt64", + 2: "AppendString", + 5: "AppendInt32", } tname = { - 0: 'std::uint64_t', - 1: 'std::uint64_t', - 2: 'std::string_view', - 5: 'std::uint32_t' + 0: "std::uint64_t", + 1: "std::uint64_t", + 2: "std::string_view", + 5: "std::uint32_t", } wire_id = self.type.GetWireType() - if self.category == 'repeated': - prefix = 'for (const auto& x : %s)' % (self.name.group(0) + '_') - name = 'x' + if self.category == "repeated": + prefix = "for (const auto& x : %s)" % (self.name.group(0) + "_") + name = "x" else: - name = self.name.group(0) + '_' - prefix = 'if (has_%s)' % (name) + name = self.name.group(0) + "_" + prefix = "if (has_%s)" % (name) if self.type.IsMessage(): - name += '.OutputAsString()' + name += ".OutputAsString()" elif self.type.IsFloatType(): - name = 'bit_cast<%s>(%s)' % (tname[wire_id], name) + name = "bit_cast<%s>(%s)" % (tname[wire_id], name) - w.Write('%s %s(%d, %s, &out);' % - (prefix, fname[wire_id], self.number, name)) + w.Write("%s %s(%d, %s, &out);" % (prefix, fname[wire_id], self.number, name)) def GenerateJsonOutput(self, w): name = self.name.group(0) - if self.category == 'repeated': - prefix = 'if (!%s_.empty())' % name - funcname = 'AppendJsonRepeatedField' + if self.category == "repeated": + prefix = "if (!%s_.empty())" % name + funcname = "AppendJsonRepeatedField" else: - prefix = 'if (has_%s_)' % name - funcname = 'AppendJsonField' + prefix = "if (has_%s_)" % name + funcname = "AppendJsonField" if self.type.IsEnumType(): - value = '%s_Name(%s_)' % (self.type.GetCppType(), name) + value = "%s_Name(%s_)" % (self.type.GetCppType(), name) else: value = name + "_" - w.Write('%s %s("%s", %s, &first, &out);' % - (prefix, funcname, name, value)) + w.Write('%s %s("%s", %s, &first, &out);' % (prefix, funcname, name, value)) def GenerateFunctionDeclarations(self, w): name = self.name.group(0) cpp_type = self.type.GetCppType() var_cpp_type = self.type.GetVariableCppType() - if self.category == 'repeated': + if self.category == "repeated": if 
self.type.IsMessage(): w.Write("%s* add_%s();" % (cpp_type, name)) else: w.Write("void add_%s(%s val);" % (name, cpp_type)) # Using a vector here breaks API compatibility with the standard # protobuf library, but it is more convenient. - w.Write("const std::vector<%s>& %s() const;" % - (var_cpp_type, name)) + w.Write("const std::vector<%s>& %s() const;" % (var_cpp_type, name)) w.Write("std::vector<%s>* mutable_%s();" % (var_cpp_type, name)) if self.type.IsMessage(): w.Write("const %s& %s(size_t idx) const;" % (cpp_type, name)) @@ -405,54 +451,70 @@ def GenerateFunctionDefinitions(self, w, class_name): name = self.name.group(0) cpp_type = self.type.GetCppType() var_cpp_type = self.type.GetVariableCppType() - if self.category == 'repeated': + if self.category == "repeated": if self.type.IsMessage(): w.Write( - "inline %s* %s::add_%s() { return &%s_.emplace_back(); }" % - (cpp_type, class_name, name, name)) + "inline %s* %s::add_%s() { return &%s_.emplace_back(); }" + % (cpp_type, class_name, name, name) + ) else: w.Write( "inline void %s::add_%s(%s val) { %s_.emplace_back(val); }" - % (class_name, name, cpp_type, name)) + % (class_name, name, cpp_type, name) + ) w.Write( "inline const std::vector<%s>& %s::%s() const { return %s_; }" - % (var_cpp_type, class_name, name, name)) + % (var_cpp_type, class_name, name, name) + ) w.Write( "inline std::vector<%s>* %s::mutable_%s() { return &%s_; }" - % (var_cpp_type, class_name, name, name)) + % (var_cpp_type, class_name, name, name) + ) if self.type.IsMessage(): w.Write( "inline const %s& %s::%s(size_t idx) const { return %s_[idx]; }" - % (cpp_type, class_name, name, name)) + % (cpp_type, class_name, name, name) + ) w.Write( "inline %s* %s::mutable_%s(size_t idx) { return &%s_[idx]; }" - % (cpp_type, class_name, name, name)) + % (cpp_type, class_name, name, name) + ) else: w.Write( - "inline %s %s::%s(size_t idx) const { return %s_[idx]; }" % - (cpp_type, class_name, name, name)) + "inline %s %s::%s(size_t idx) const { return %s_[idx]; }" + % (cpp_type, class_name, name, name) + ) w.Write( - "inline size_t %s::%s_size() const { return %s_.size(); }" % - (class_name, name, name)) + "inline size_t %s::%s_size() const { return %s_.size(); }" + % (class_name, name, name) + ) else: - w.Write("inline bool %s::has_%s() const { return has_%s_; }" % - (class_name, name, name)) + w.Write( + "inline bool %s::has_%s() const { return has_%s_; }" + % (class_name, name, name) + ) if self.type.IsMessage(): - w.Write("inline const %s& %s::%s() const { return %s_; }" % - (cpp_type, class_name, name, name)) + w.Write( + "inline const %s& %s::%s() const { return %s_; }" + % (cpp_type, class_name, name, name) + ) if self.type.IsMessage() or self.type.IsBytesType(): - w.Write("inline %s* %s::mutable_%s() {" % - (var_cpp_type, class_name, name)) + w.Write( + "inline %s* %s::mutable_%s() {" % (var_cpp_type, class_name, name) + ) w.Indent() - w.Write('has_%s_ = true;' % (name)) - w.Write('return &%s_;' % name) + w.Write("has_%s_ = true;" % (name)) + w.Write("return &%s_;" % name) w.Unindent() w.Write("}") if not self.type.IsMessage(): - w.Write("inline %s %s::%s() const { return %s_; }" % - (cpp_type, class_name, name, name)) - w.Write("inline void %s::set_%s(%s val) {" % - (class_name, name, cpp_type)) + w.Write( + "inline %s %s::%s() const { return %s_; }" + % (cpp_type, class_name, name, name) + ) + w.Write( + "inline void %s::set_%s(%s val) {" % (class_name, name, cpp_type) + ) w.Indent() w.Write("has_%s_ = true;" % name) w.Write("%s_ = val;" % name) @@ -462,41 +524,43 
@@ def GenerateFunctionDefinitions(self, w, class_name): def GenerateVariable(self, w): name = self.name.group(0) cpp_type = self.type.GetVariableCppType() - if self.category == 'repeated': + if self.category == "repeated": w.Write("std::vector<%s> %s_;" % (cpp_type, name)) else: w.Write("bool has_%s_{};" % (name)) - w.Write("%s %s_{};" % (cpp_type, name)) + if "default" in self.attributes: + w.Write("%s %s_{%s};" % (cpp_type, name, self.attributes["default"])) + else: + w.Write("%s %s_{};" % (cpp_type, name)) return class ProtoEnumParser: - def __init__(self, lexer, scope): - lexer.Consume('enum') - self.name = lexer.Consume('identifier').group(0) + lexer.Consume("enum") + self.name = lexer.Consume("identifier").group(0) self.values = [] self.scope = scope[:] - lexer.Consume('{') + lexer.Consume("{") while True: token, match = lexer.Pick() - if token == '}': + if token == "}": break - key = lexer.Consume('identifier').group(0) - lexer.Consume('=') - value = int(lexer.Consume('number').group(0)) - lexer.Consume(';') + key = lexer.Consume("identifier").group(0) + lexer.Consume("=") + value = int(lexer.Consume("number").group(0)) + lexer.Consume(";") self.values.append((key, value)) - lexer.Consume('}') + lexer.Consume("}") def GetName(self): return self.name def GetFullName(self): - return '_'.join([x.GetName() for x in self.scope] + [self.name]) + return "_".join([x.GetName() for x in self.scope] + [self.name]) def GetType(self): - return 'enum' + return "enum" def IsType(self): return True @@ -515,110 +579,112 @@ def GenerateFunctionDefinitions(self, w): def GenerateEnumDefinitions(self, w): # Protobuf enum is mapped directly to C++ enum. - w.Write('enum %s : int {' % self.GetFullName()) + w.Write("enum %s : int {" % self.GetFullName()) w.Indent() for key, value in self.values: - w.Write('%s_%s = %d,' % (self.GetFullName(), key, value)) + w.Write("%s_%s = %d," % (self.GetFullName(), key, value)) w.Unindent() - w.Write('};') - w.Write('inline std::string %s_Name(%s val) {' % - (self.GetFullName(), self.GetFullName())) + w.Write("};") + w.Write( + "inline std::string %s_Name(%s val) {" + % (self.GetFullName(), self.GetFullName()) + ) w.Indent() - w.Write('switch (val) {') + w.Write("switch (val) {") w.Indent() for key, _ in self.values: - w.Write('case %s_%s:' % (self.GetFullName(), key)) + w.Write("case %s_%s:" % (self.GetFullName(), key)) w.Write(' return "%s";' % key) w.Unindent() - w.Write('};') + w.Write("};") w.Write('return "%s(" + std::to_string(val) + ")";' % self.name) w.Unindent() - w.Write('}') + w.Write("}") def GenerateUsingDirectives(self, w): - w.Write('using %s = %s;' % (self.name, self.GetFullName())) + w.Write("using %s = %s;" % (self.name, self.GetFullName())) for key, _ in self.values: - w.Write('static constexpr %s %s =' % (self.name, key)) - w.Write(' %s_%s;' % (self.GetFullName(), key)) - w.Write('static constexpr std::array<%s,%d> %s_AllValues = {' % - (self.name, len(self.values), self.name)) + w.Write("static constexpr %s %s =" % (self.name, key)) + w.Write(" %s_%s;" % (self.GetFullName(), key)) + w.Write( + "static constexpr std::array<%s,%d> %s_AllValues = {" + % (self.name, len(self.values), self.name) + ) w.Indent() for key, _ in self.values: - w.Write('%s,' % key) + w.Write("%s," % key) w.Unindent() - w.Write('};') + w.Write("};") # Static function to convert an enum value to its name. 
- w.Write('static std::string %s_Name(%s val) {' % - (self.name, self.name)) + w.Write("static std::string %s_Name(%s val) {" % (self.name, self.name)) w.Indent() - w.Write('return %s_Name(val);' % (self.GetFullName())) + w.Write("return %s_Name(val);" % (self.GetFullName())) w.Unindent() - w.Write('}') + w.Write("}") def ParseReservedFields(lexer): res = set() - lexer.Consume('reserved') + lexer.Consume("reserved") while True: token, match = lexer.Pick() - if token == 'number': - num = int(lexer.Consume('number').group(0)) - if lexer.Pick()[0] == 'to': - lexer.Consume('to') - end = int(lexer.Consume('number').group(0)) + if token == "number": + num = int(lexer.Consume("number").group(0)) + if lexer.Pick()[0] == "to": + lexer.Consume("to") + end = int(lexer.Consume("number").group(0)) res.add(range(num, end + 1)) else: res.add(num) - elif token in ['identifier', 'string']: + elif token in ["identifier", "string"]: res.add(lexer.Consume(token).group(1)) else: - lexer.Error('Expected number or identifier') + lexer.Error("Expected number or identifier") token, _ = lexer.Pick() - if token == ';': - lexer.Consume(';') + if token == ";": + lexer.Consume(";") break - lexer.Consume(',') + lexer.Consume(",") return res class ProtoMessageParser: - def __init__(self, lexer, type_stack, scope): type_stack[0].append(self) self.reserved = set() self.types = [] self.fields = [] self.scope = scope[:] - lexer.Consume('message') - self.name = lexer.Consume('identifier').group(0) - lexer.Consume('{') + lexer.Consume("message") + self.name = lexer.Consume("identifier").group(0) + lexer.Consume("{") while True: token, match = lexer.Pick() - if token == '}': + if token == "}": break - elif token == 'message': - ProtoMessageParser(lexer, [self.types, *type_stack], - self.scope + [self]) - elif token == 'enum': + elif token == "message": + ProtoMessageParser( + lexer, [self.types, *type_stack], self.scope + [self] + ) + elif token == "enum": self.types.append(ProtoEnumParser(lexer, self.scope + [self])) - elif token in ['repeated', 'optional', 'required']: - self.fields.append( - ProtoFieldParser(lexer, [self.types, *type_stack])) - elif token == 'reserved': + elif token in ["repeated", "optional", "required"]: + self.fields.append(ProtoFieldParser(lexer, [self.types, *type_stack])) + elif token == "reserved": self.reserved.update(ParseReservedFields(lexer)) else: - lexer.Error('Expected field or type') - lexer.Consume('}') + lexer.Error("Expected field or type") + lexer.Consume("}") self.CheckReserved() def GetName(self): return self.name def GetFullName(self): - return '_'.join([x.GetName() for x in self.scope] + [self.name]) + return "_".join([x.GetName() for x in self.scope] + [self.name]) def GetType(self): - return 'message' + return "message" def IsType(self): return True @@ -631,19 +697,20 @@ def GetFieldsGruppedByWireType(self): for x in self.fields: type_to_fields.setdefault(x.type.GetWireType(), []).append(x) return type_to_fields - + def CheckReserved(self): for r in self.reserved: if isinstance(r, int): if any(x.number == r for x in self.fields): - raise ValueError(f'Field number [{r}] is reserved.') + raise ValueError(f"Field number [{r}] is reserved.") elif isinstance(r, range): if any(x.number in r for x in self.fields): - raise ValueError(f'Field range [{r.start} to {r.stop-1}] ' - 'is reserved.') + raise ValueError( + f"Field range [{r.start} to {r.stop - 1}] is reserved." 
+ ) else: if any(x.name.group(0) == r for x in self.fields): - raise ValueError(f'Field name [{r}] is reserved.') + raise ValueError(f"Field name [{r}] is reserved.") def ResolveForwardDeclarations(self, type_stack): type_stack.append(self.types) @@ -654,41 +721,44 @@ def ResolveForwardDeclarations(self, type_stack): type_stack.pop() def WriteFieldParserDeclaration(self, w, wire_id, fields): - fname = {0: 'SetVarInt', 1: 'SetInt64', 2: 'SetString', 5: 'SetInt32'} + fname = {0: "SetVarInt", 1: "SetInt64", 2: "SetString", 5: "SetInt32"} tname = { - 0: 'std::uint64_t', - 1: 'std::uint64_t', - 2: 'std::string_view', - 5: 'std::uint32_t' + 0: "std::uint64_t", + 1: "std::uint64_t", + 2: "std::string_view", + 5: "std::uint32_t", } - w.Write('void %s(int field_id, %s val) final;' % - (fname[wire_id], tname[wire_id])) + w.Write( + "void %s(int field_id, %s val) final;" % (fname[wire_id], tname[wire_id]) + ) def WriteFieldParserDefinition(self, w, wire_id, fields): - fname = {0: 'SetVarInt', 1: 'SetInt64', 2: 'SetString', 5: 'SetInt32'} + fname = {0: "SetVarInt", 1: "SetInt64", 2: "SetString", 5: "SetInt32"} tname = { - 0: 'std::uint64_t', - 1: 'std::uint64_t', - 2: 'std::string_view', - 5: 'std::uint32_t' + 0: "std::uint64_t", + 1: "std::uint64_t", + 2: "std::string_view", + 5: "std::uint32_t", } - w.Write('inline void %s::%s(int field_id, %s val) {' % - (self.GetFullName(), fname[wire_id], tname[wire_id])) + w.Write( + "inline void %s::%s(int field_id, %s val) {" + % (self.GetFullName(), fname[wire_id], tname[wire_id]) + ) w.Indent() - w.Write('switch (field_id) {') + w.Write("switch (field_id) {") w.Indent() for field in fields: field.GenerateCaseClause(w) w.Unindent() - w.Write('}') + w.Write("}") w.Unindent() - w.Write('}') + w.Write("}") def GenerateUsingDirectives(self, w): - w.Write('using %s = %s;' % (self.name, self.GetFullName())) + w.Write("using %s = %s;" % (self.name, self.GetFullName())) def GenerateMessageDeclarations(self, w): - w.Write(f'class %s;' % self.GetFullName()) + w.Write(f"class %s;" % self.GetFullName()) for x in self.types: x.GenerateMessageDeclarations(w) @@ -699,42 +769,41 @@ def GenerateEnumDefinitions(self, w): def GenerateMessageDefinitions(self, w): # Writing nested messages. for x in self.types: - if x.GetType() == 'message': + if x.GetType() == "message": x.GenerateMessageDefinitions(w) # Protobuf message is a C++ class. - w.Write('class %s final : public lczero::ProtoMessage {' % - self.GetFullName()) - w.Write(' public:') + w.Write("class %s final : public lczero::ProtoMessage {" % self.GetFullName()) + w.Write(" public:") w.Indent() # Writing using directives. for x in self.types: x.GenerateUsingDirectives(w) # Writing function declarations. for x in self.fields: - w.Write('') + w.Write("") x.GenerateFunctionDeclarations(w) - w.Write('') - w.Write('std::string OutputAsString() const final;') - w.Write('std::string OutputAsJson() const final;') - w.Write('void Clear() final;') + w.Write("") + w.Write("std::string OutputAsString() const final;") + w.Write("std::string OutputAsJson() const final;") + w.Write("void Clear() final;") w.Unindent() - w.Write('') - w.Write(' private:') + w.Write("") + w.Write(" private:") w.Indent() for k, v in self.GetFieldsGruppedByWireType().items(): self.WriteFieldParserDeclaration(w, k, v) - w.Write('') + w.Write("") for x in self.fields: x.GenerateVariable(w) w.Unindent() - w.Write('};') - w.Write('') + w.Write("};") + w.Write("") def GenerateFunctionDefinitions(self, w): # Writing nested messages. 
for x in self.types: - if x.GetType() == 'message': + if x.GetType() == "message": x.GenerateFunctionDefinitions(w) self.GenerateOutputAsStringFunc(w) self.GenerateOutputAsJsonFunc(w) @@ -743,37 +812,35 @@ def GenerateFunctionDefinitions(self, w): self.GenerateFieldAccessorFuncs(w) def GenerateOutputAsStringFunc(self, w): - w.Write('inline std::string %s::OutputAsString() const {' % - self.GetFullName()) + w.Write("inline std::string %s::OutputAsString() const {" % self.GetFullName()) w.Indent() - w.Write('std::string out;') + w.Write("std::string out;") for x in sorted(self.fields, key=lambda x: x.number): x.GenerateOutput(w) - w.Write('return out;') + w.Write("return out;") w.Unindent() - w.Write('}') + w.Write("}") def GenerateOutputAsJsonFunc(self, w): - w.Write('inline std::string %s::OutputAsJson() const {' % - self.GetFullName()) + w.Write("inline std::string %s::OutputAsJson() const {" % self.GetFullName()) w.Indent() if self.fields: - w.Write('bool first = true;') + w.Write("bool first = true;") w.Write('std::string out = "{";') for x in self.fields: x.GenerateJsonOutput(w) w.Write('out += "}";') - w.Write('return out;') + w.Write("return out;") w.Unindent() - w.Write('}') + w.Write("}") def GenerateClearFunc(self, w): - w.Write('inline void %s::Clear() {' % self.GetFullName()) + w.Write("inline void %s::Clear() {" % self.GetFullName()) w.Indent() for x in self.fields: x.GenerateClear(w) w.Unindent() - w.Write('}') + w.Write("}") def GenerateParserFuncs(self, w): for k, v in self.GetFieldsGruppedByWireType().items(): @@ -785,38 +852,38 @@ def GenerateFieldAccessorFuncs(self, w): class ProtoFileParser: - '''Root grammar of .proto file''' + """Root grammar of .proto file""" def __init__(self, lexer): self.package = None self.types = [] while True: token, match = lexer.Pick() - if token == 'EOF': + if token == "EOF": return - elif token == 'syntax': + elif token == "syntax": self.ParseSyntax(lexer) - elif token == 'package': + elif token == "package": self.ParsePackage(lexer) - elif token == 'message': + elif token == "message": self.ParseMessage(lexer) - elif token == 'enum': + elif token == "enum": self.ParseEnum(lexer) else: - lexer.Error('Expected message or something similar') + lexer.Error("Expected message or something similar") def ParseSyntax(self, lexer): - lexer.Consume('syntax') - lexer.Consume('=') - lexer.Consume('string', 'proto2', 1) - lexer.Consume(';') + lexer.Consume("syntax") + lexer.Consume("=") + lexer.Consume("string", "proto2", 1) + lexer.Consume(";") def ParsePackage(self, lexer): - lexer.Consume('package') + lexer.Consume("package") if self.package is not None: - lexer.Error('Package was already defined') + lexer.Error("Package was already defined") self.package = ReadIdentifierPath(lexer) - lexer.Consume(';') + lexer.Consume(";") def ParseMessage(self, lexer): ProtoMessageParser(lexer, [self.types], []) @@ -825,27 +892,27 @@ def ParseEnum(self, lexer): self.types.append(ProtoEnumParser(lexer, [])) def Generate(self, w): - w.Write('// This file is AUTOGENERATED, do not edit.') - w.Write('#pragma once') + w.Write("// This file is AUTOGENERATED, do not edit.") + w.Write("#pragma once") w.Write('#include "utils/protomessage.h"') for x in self.package: - w.Write('namespace %s {' % x) - w.Write('') - w.Write('// Forward declarations.') + w.Write("namespace %s {" % x) + w.Write("") + w.Write("// Forward declarations.") for object in self.types: object.GenerateMessageDeclarations(w) for object in self.types: object.GenerateEnumDefinitions(w) - w.Write('') - 
w.Write('// Class declarations.') + w.Write("") + w.Write("// Class declarations.") for object in self.types: object.GenerateMessageDefinitions(w) - w.Write('') - w.Write('// Function definitions.') + w.Write("") + w.Write("// Function definitions.") for object in self.types: object.GenerateFunctionDefinitions(w) for x in reversed(self.package): - w.Write('} // namespace %s' % x) + w.Write("} // namespace %s" % x) def ResolveForwardDeclarations(self): type_stack = [self.types] @@ -854,7 +921,7 @@ def ResolveForwardDeclarations(self): class Writer: - '''A helper class for writing file line by line with indent.''' + """A helper class for writing file line by line with indent.""" def __init__(self, fo): self.fo = fo @@ -868,26 +935,26 @@ def Unindent(self): def Write(self, text): if text: - self.fo.write(' ' * self.indent + text + '\n') + self.fo.write(" " * self.indent + text + "\n") else: - self.fo.write('\n') + self.fo.write("\n") if __name__ == "__main__": # Have the same flags as protoc has. parser = argparse.ArgumentParser(description="Compile protobuf files.") - parser.add_argument('input', type=str) - parser.add_argument('--proto_path', type=str) - parser.add_argument('--cpp_out', type=str) + parser.add_argument("input", type=str) + parser.add_argument("--proto_path", type=str) + parser.add_argument("--cpp_out", type=str) args = parser.parse_args() rel_path = os.path.relpath(args.input, args.proto_path) - dest_name = os.path.splitext(rel_path)[0] + '.pb.h' + dest_name = os.path.splitext(rel_path)[0] + ".pb.h" dest_path = os.path.join(args.cpp_out, dest_name) dest_dir = os.path.dirname(dest_path) os.makedirs(dest_dir, exist_ok=True) - with open(args.input, 'r') as input, open(dest_path, 'w') as output: + with open(args.input, "r") as input, open(dest_path, "w") as output: proto_file = ProtoFileParser(Lexer(input.read())) proto_file.ResolveForwardDeclarations() writer = Writer(output) diff --git a/scripts/sycl_build_hack.py b/scripts/sycl_build_hack.py index 14edff6ded..e7e3478875 100644 --- a/scripts/sycl_build_hack.py +++ b/scripts/sycl_build_hack.py @@ -12,12 +12,12 @@ link_flag = False for line in lines: - # Replace xilink with icx -fsycl as the linker. + # Replace xilink with icx as the linker. 
if not link_flag: link_flag = 'xilink.exe' in line if link_flag: line = line.replace('xilink.exe', 'icx') - line = line.replace('/MACHINE:x64', '-fsycl') + line = line.replace('/MACHINE:x64', '') line = line.replace('/OUT:', '-o ') line = line.replace('/SUBSYSTEM:CONSOLE', '') line = line.replace('/OPT:REF', '') diff --git a/src/chess/board.cc b/src/chess/board.cc index 8d171141b3..59bc0c39cd 100644 --- a/src/chess/board.cc +++ b/src/chess/board.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include "utils/exception.h" @@ -573,8 +574,36 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const { return result; } // namespace lczero +bool ChessBoard::IsValid() const { + const auto all = ours() | theirs(); + auto check = all | pawns() | bishops() | rooks() | queens() | kings(); + if (check != all || + (pawns() & bishops()).as_int() || + (pawns() & rooks()).as_int() || + (pawns() & queens()).as_int() || + (pawns() & kings()).as_int() || + (bishops() & rooks()).as_int() || + (bishops() & queens()).as_int() || + (bishops() & kings()).as_int() || + (rooks() & queens()).as_int() || + (rooks() & kings()).as_int() || + (queens() & kings()).as_int()) { + return false; + } + return true; +} + bool ChessBoard::ApplyMove(Move move) { assert(our_pieces_.intersects(BitBoard::FromSquare(move.from()))); +#ifndef NDEBUG + absl::Cleanup validate = [&] { + if (!IsValid()) { + CERR << "Move " + move.ToString(true) + + " resulted in invalid board: " + DebugString(); + assert(false); + } + }; +#endif const Square& from = move.from(); const Square& to = move.to(); const Rank from_rank = from.rank(); @@ -1113,7 +1142,9 @@ bool ChessBoard::HasMatingMaterial() const { } std::string ChessBoard::DebugString() const { - return "https://lc0.org/fen/" + BoardToFen(*this); + auto fen = BoardToFen(*this); + std::replace(fen.begin(), fen.end(), ' ', '_'); + return "https://lc0.org/fen/" + fen; } Move ChessBoard::ParseMove(std::string_view move_str) const { @@ -1160,7 +1191,7 @@ Move ChessBoard::ParseMove(std::string_view move_str) const { // Qeenside castling. return Move::WhiteCastling(from.file(), kFileA); } - if (from.file() != to.file() && pawns_.get(from) && !their_pieces_.get(to)) { + if (from.file() != to.file() && pawns().get(from) && !their_pieces_.get(to)) { // En passant. return Move::WhiteEnPassant(from, to); } diff --git a/src/chess/board.h b/src/chess/board.h index 4d2dbe17e5..d455fcb69d 100644 --- a/src/chess/board.h +++ b/src/chess/board.h @@ -231,6 +231,8 @@ class ChessBoard { private: // Sets the piece on the square. void PutPiece(Square square, PieceType piece, bool is_theirs); + // Check internal state is consistent after state transformations. + bool IsValid() const; // All white pieces. BitBoard our_pieces_; diff --git a/src/chess/board_test.cc b/src/chess/board_test.cc index 40621be621..eef6247d1d 100644 --- a/src/chess/board_test.cc +++ b/src/chess/board_test.cc @@ -2236,6 +2236,20 @@ TEST(ChessBoard, InvalidEnPassantFromKnightPromotion) { EXPECT_TRUE(board.en_passant().empty()); } +// Move from an en-passant flag square was mistakenly marked as en-passant. 
+TEST(ChessBoard, QueenMoveFromEnPassantFlagBug) { + ChessBoard board; + board.SetFromFen("1Qnkr3/1p1b4/p2P2p1/P1q5/1NP3pP/1KN5/8/3R4 b - - 0 32"); + board.ApplyMove(board.ParseMove("b7b5")); + board.Mirror(); + auto m = board.ParseMove("b8c7"); + EXPECT_FALSE(m.is_en_passant()); + board.ApplyMove(m); + board.Mirror(); + MoveList legal_moves = {board.ParseMove("c5c7")}; + EXPECT_EQ(board.GenerateLegalMoves(), legal_moves); +} + } // namespace lczero int main(int argc, char** argv) { diff --git a/src/chess/callbacks.h b/src/chess/callbacks.h index 63cd7b88b4..4205e2441a 100644 --- a/src/chess/callbacks.h +++ b/src/chess/callbacks.h @@ -66,6 +66,8 @@ struct ThinkingInfo { int64_t nodes = -1; // Nodes per second. int nps = -1; + // Evaluations per second. + int eps = -1; // Hash fullness * 1000 int hashfull = -1; // Moves to mate. diff --git a/src/chess/pgn.h b/src/chess/pgn.h index 4025398a57..dd50ab9c98 100644 --- a/src/chess/pgn.h +++ b/src/chess/pgn.h @@ -319,7 +319,7 @@ class PgnReader { std::optional enpassant = std::nullopt; if (!board.en_passant().empty()) { auto sq = *board.en_passant().begin(); - enpassant = Square(sq.file(), kRank6); + enpassant = Square(sq.file(), board.flipped() ? kRank3 : kRank6); } Square from(File::FromIdx(c1), Rank::FromIdx(r1)); Square to(File::FromIdx(c2), Rank::FromIdx(r2)); diff --git a/src/chess/uciloop.cc b/src/chess/uciloop.cc index a033e4cc1b..398a8bd7bd 100644 --- a/src/chess/uciloop.cc +++ b/src/chess/uciloop.cc @@ -54,6 +54,11 @@ const OptionId kShowWDL{{.long_flag = "show-wdl", .uci_option = "UCI_ShowWDL", .help_text = "Show win, draw and lose probability.", .visibility = OptionId::kAlwaysVisible}}; +const OptionId kShowEPS{ + {.long_flag = "show-eps", + .uci_option = "UCI_ShowEPS", + .help_text = "Show neural network evaluations per second.", + .visibility = OptionId::kAlwaysVisible}}; const OptionId kShowMovesleft{{.long_flag = "show-movesleft", .uci_option = "UCI_ShowMovesLeft", .help_text = "Show estimated moves left.", @@ -63,7 +68,7 @@ const std::unordered_map> kKnownCommands = { {{"uci"}, {}}, {{"isready"}, {}}, - {{"setoption"}, {"context", "name", "value"}}, + {{"setoption"}, {"name", "value"}}, {{"ucinewgame"}, {}}, {{"position"}, {"fen", "startpos", "moves"}}, {{"go"}, @@ -94,6 +99,26 @@ ParseCommand(const std::string& line) { throw Exception("Unknown command: " + line); } + // Special parsing for setoption to keep strings unmodified. + if (command->first == "setoption") { + iss >> token; + if (token != "name") { + throw Exception("setoption must be followed by name"); + } + int name_pos = iss.eof() ? line.length() : static_cast(iss.tellg()); + std::optional value_pos; + while (iss >> token) { + if (token == "value") { + value_pos = iss.eof() ? line.length() : static_cast(iss.tellg()); + params["value"] = Trim(line.substr(*value_pos)); + break; + } + } + params["name"] = Trim(line.substr( + name_pos, value_pos ? 
*value_pos - name_pos - 5 : std::string::npos)); + return {"setoption", params}; + } + std::string whitespace; while (iss >> token) { auto iter = command->second.find(token); @@ -139,7 +164,7 @@ int GetNumeric(const std::unordered_map& params, bool ContainsKey(const std::unordered_map& params, const std::string& key) { - return params.find(key) != params.end(); + return params.contains(key); } } // namespace @@ -164,9 +189,12 @@ bool UciLoop::DispatchCommand( engine_->EnsureReady(); uci_responder_->SendRawResponse("readyok"); } else if (command == "setoption") { - options_->SetUciOption(GetOrEmpty(params, "name"), - GetOrEmpty(params, "value"), - GetOrEmpty(params, "context")); + if (GetOrEmpty(params, "name").empty()) { + throw Exception("setoption requires name"); + } else { + options_->SetUciOption(GetOrEmpty(params, "name"), + GetOrEmpty(params, "value")); + } } else if (command == "ucinewgame") { engine_->NewGame(); } else if (command == "position") { @@ -235,7 +263,8 @@ bool UciLoop::ProcessLine(const std::string& line) { void StringUciResponder::PopulateParams(OptionsParser* options) { options->Add(kUciChess960) = false; - options->Add(kShowWDL) = true; + options->Add(kShowWDL) = false; + options->Add(kShowEPS) = false; options->Add(kShowMovesleft) = false; options_ = &options->GetOptionsDict(); } @@ -289,6 +318,9 @@ void StringUciResponder::OutputThinkingInfo(std::vector* infos) { } if (info.hashfull >= 0) res += " hashfull " + std::to_string(info.hashfull); if (info.nps >= 0) res += " nps " + std::to_string(info.nps); + if (info.eps >= 0 && options_ && options_->Get(kShowEPS)) { + res += " eps " + std::to_string(info.eps); + } if (info.tb_hits >= 0) res += " tbhits " + std::to_string(info.tb_hits); if (info.multipv >= 0) res += " multipv " + std::to_string(info.multipv); diff --git a/src/engine.cc b/src/engine.cc index d76e630086..c4c487c020 100644 --- a/src/engine.cc +++ b/src/engine.cc @@ -162,6 +162,7 @@ void Engine::EnsureSearchStopped() { } void Engine::UpdateBackendConfig() { + LOGFILE << "Update backend configuration."; const std::string backend_name = options_.Get(SharedBackendParams::kBackendId); if (!backend_ || backend_name != backend_name_ || @@ -182,6 +183,7 @@ void Engine::EnsureSyzygyTablebasesLoaded() { previous_tb_paths_ = tb_paths; if (tb_paths.empty()) { + LOGFILE << "Reset Syzygy tablebases."; syzygy_tb_.reset(); } else { syzygy_tb_ = std::make_unique(); @@ -198,6 +200,7 @@ void Engine::EnsureSyzygyTablebasesLoaded() { // Initializes the search with either the specified position for the normal // search or the position one ply trimmed for the ponder search. 
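// A rough standalone sketch (not the patch's code) of the splitting rule the
// new setoption branch in uciloop.cc above implements: everything between the
// "name" and "value" tokens is the option name, everything after "value" is
// the value, and internal whitespace is preserved. The helper name below is
// invented, and a simple substring search stands in for the token-by-token
// scan the patch uses (same result unless the name itself contains " value ").
#include <iostream>
#include <string>
#include <utility>

std::pair<std::string, std::string> SplitSetOption(const std::string& line) {
  auto trim = [](std::string s) {
    const char* ws = " \t";
    size_t b = s.find_first_not_of(ws);
    if (b == std::string::npos) return std::string();
    size_t e = s.find_last_not_of(ws);
    return s.substr(b, e - b + 1);
  };
  const std::string kName = " name ";
  const std::string kValue = " value ";
  size_t name_pos = line.find(kName);
  if (name_pos == std::string::npos) return {"", ""};
  name_pos += kName.size();
  size_t value_pos = line.find(kValue, name_pos);
  if (value_pos == std::string::npos) return {trim(line.substr(name_pos)), ""};
  return {trim(line.substr(name_pos, value_pos - name_pos)),
          trim(line.substr(value_pos + kValue.size()))};
}

int main() {
  auto [name, value] =
      SplitSetOption("setoption name SyzygyPath value /tb/wdl:/tb/dtz");
  // Prints: name=[SyzygyPath] value=[/tb/wdl:/tb/dtz]
  std::cout << "name=[" << name << "] value=[" << value << "]\n";
}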
void Engine::InitializeSearchPosition(bool for_ponder) { + LOGFILE << "Setting a new search position."; assert(last_position_); if (!for_ponder) { search_->SetPosition(*last_position_); @@ -217,7 +220,8 @@ void Engine::SetPosition(const std::string& fen, EnsureSearchStopped(); ponder_enabled_ = options_.Get(kPonderId); strict_uci_timing_ = options_.Get(kStrictUciTiming); - if (!strict_uci_timing_) search_->StartClock(); + isready_seen_ = false; + search_->StartClock(); UpdateBackendConfig(); EnsureSyzygyTablebasesLoaded(); last_position_ = MakeGameState(fen, moves); @@ -235,13 +239,18 @@ void Engine::Go(const GoParams& params) { throw Exception( "Ponder is not enabled, but the ponder search is requested."); } - if (strict_uci_timing_) search_->StartClock(); + if ((strict_uci_timing_ && isready_seen_) || + !(params.wtime || params.btime)) { + search_->StartClock(); + } if (!last_position_) NewGame(); if (ponder_enabled_) InitializeSearchPosition(params.ponder); last_go_params_ = params; search_->StartSearch(params); } +void Engine::EnsureReady() { isready_seen_ = true; } + void Engine::Wait() { search_->WaitSearch(); } void Engine::Stop() { search_->StopSearch(); } diff --git a/src/engine.h b/src/engine.h index 80b593301c..e50d661393 100644 --- a/src/engine.h +++ b/src/engine.h @@ -44,7 +44,7 @@ class Engine : public EngineControllerBase { static void PopulateOptions(OptionsParser*); - void EnsureReady() override {}; + void EnsureReady() override; void NewGame() override; void SetPosition(const std::string& fen, const std::vector& moves) override; @@ -74,8 +74,10 @@ class Engine : public EngineControllerBase { std::unique_ptr syzygy_tb_; // absl_nullable // UCI parameters cache to be consistent between `position` and `go`. + // Defaults ensure corect operation even if `go` comes first. bool ponder_enabled_ = false; - bool strict_uci_timing_ = false; + bool strict_uci_timing_ = true; + bool isready_seen_ = true; // Last position set for the search. Used to: // 1. Detect whether the position was ever set (to initialize to startpos). // 2. Remember the position for ponder go (removing the last ply). diff --git a/src/main.cc b/src/main.cc index 78415a3a33..dc83a199e8 100644 --- a/src/main.cc +++ b/src/main.cc @@ -26,6 +26,7 @@ */ #include "chess/board.h" +#include "default_search.h" #include "engine.h" #include "search/register.h" #include "selfplay/loop.h" @@ -37,6 +38,7 @@ #include "utils/commandline.h" #include "utils/esc_codes.h" #include "utils/logging.h" +#include "utils/trace.h" #include "version.h" namespace lczero { @@ -52,14 +54,9 @@ void ChooseAndRunEngine() { // Then if DEFAULT_SEARCH is defined, run the engine specified by it. 
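// Note on the change below: because the STRINGIFY helpers are removed and
// DEFAULT_SEARCH is concatenated directly into a string literal, the macro now
// has to be defined as a quoted string by the build system, whereas the old
// code stringified a bare token. A sketch of the difference, using a
// hypothetical "classic" search name purely for illustration:
//
//   Old style (bare token, stringified in code):
//     -DDEFAULT_SEARCH=classic
//   New style (the define itself carries the quotes), e.g. with meson:
//     add_project_arguments('-DDEFAULT_SEARCH="classic"', language: 'cpp')
//
// With a quoted define, the concatenation below is plain C++:
//   throw Exception("Unknown search algorithm: " DEFAULT_SEARCH);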
#ifdef DEFAULT_SEARCH -#define STRINGIFY_INTERNAL(x) #x -#define STRINGIFY(x) STRINGIFY_INTERNAL(x) SearchFactory* factory = - SearchManager::Get()->GetFactoryByName(STRINGIFY(DEFAULT_SEARCH)); - if (!factory) - throw Exception("Unknown search algorithm: " STRINGIFY(DEFAULT_SEARCH)); -#undef STRINGIFY -#undef STRINGIFY_INTERNAL + SearchManager::Get()->GetFactoryByName(DEFAULT_SEARCH); + if (!factory) throw Exception("Unknown search algorithm: " DEFAULT_SEARCH); RunEngine(factory); return; #endif @@ -80,6 +77,7 @@ void ChooseAndRunEngine() { } // namespace lczero int main(int argc, const char** argv) { + LCTRACE_INITIALIZE; using namespace lczero; EscCodes::Init(); LOGFILE << "Lc0 started."; diff --git a/src/neural/backends/blas/blas.h b/src/neural/backends/blas/blas.h index 7001be64d7..a9018c71d2 100644 --- a/src/neural/backends/blas/blas.h +++ b/src/neural/backends/blas/blas.h @@ -18,6 +18,13 @@ #pragma once +// clang-format off +// math.h include is workaround for Eigen trying to use math functions from global +// namespaces. math.h must be included before Eigen/Core. +#include +#include +// clang-format on + // Select the BLAS vendor based on defines #ifdef USE_MKL diff --git a/src/neural/backends/blas/convolution1.cc b/src/neural/backends/blas/convolution1.cc index 8674b06dcf..1da550cb5b 100644 --- a/src/neural/backends/blas/convolution1.cc +++ b/src/neural/backends/blas/convolution1.cc @@ -19,8 +19,6 @@ #include "neural/backends/blas/convolution1.h" #include "neural/backends/blas/blas.h" -#include - namespace lczero { template using EigenMatrixMap = diff --git a/src/neural/backends/blas/fully_connected_layer.cc b/src/neural/backends/blas/fully_connected_layer.cc index 84699a3ec2..d0736c1eb3 100644 --- a/src/neural/backends/blas/fully_connected_layer.cc +++ b/src/neural/backends/blas/fully_connected_layer.cc @@ -23,8 +23,6 @@ #include #include -#include - namespace lczero { namespace { void ApplyBias(size_t batch_size, const size_t output_size, const float* biases, diff --git a/src/neural/backends/blas/network_blas.cc b/src/neural/backends/blas/network_blas.cc index 71c561ca12..c91c5c44f5 100644 --- a/src/neural/backends/blas/network_blas.cc +++ b/src/neural/backends/blas/network_blas.cc @@ -16,7 +16,6 @@ along with Leela Chess. If not, see . 
*/ -#include #include #include #include @@ -70,7 +69,7 @@ class BlasComputation : public NetworkComputation { const ActivationFunction smolgen_activation, const ActivationFunction ffn_activation, const bool attn_policy, const bool attn_body, - bool is_pe_dense_embedding); + bool is_pe_dense_embedding, int threads); virtual ~BlasComputation() {} @@ -157,13 +156,14 @@ template class BlasNetwork : public Network { public: BlasNetwork(const WeightsFile& weights, const OptionsDict& options); - virtual ~BlasNetwork(){}; + virtual ~BlasNetwork() {}; std::unique_ptr NewComputation() override { return std::make_unique>( this, weights_, policy_head_, value_head_, max_batch_size_, wdl_, moves_left_, conv_policy_, default_activation_, smolgen_activation_, - ffn_activation_, attn_policy_, attn_body_, is_pe_dense_embedding_); + ffn_activation_, attn_policy_, attn_body_, is_pe_dense_embedding_, + threads_); } const NetworkCapabilities& GetCapabilities() const override { @@ -199,15 +199,16 @@ class BlasNetwork : public Network { const NetworkCapabilities capabilities_; MultiHeadWeights weights_; size_t max_batch_size_; + int threads_; bool wdl_; bool moves_left_; bool conv_policy_; bool attn_policy_; bool attn_body_; bool is_pe_dense_embedding_; - ActivationFunction default_activation_; - ActivationFunction smolgen_activation_; - ActivationFunction ffn_activation_; + ActivationFunction default_activation_ = ACTIVATION_NONE; + ActivationFunction smolgen_activation_ = ACTIVATION_NONE; + ActivationFunction ffn_activation_ = ACTIVATION_NONE; std::string policy_head_; std::string value_head_; std::mutex buffers_lock_; @@ -222,7 +223,8 @@ BlasComputation::BlasComputation( const bool conv_policy, const ActivationFunction default_activation, const ActivationFunction smolgen_activation, const ActivationFunction ffn_activation, const bool attn_policy, - const bool attn_body, bool is_pe_dense_embedding) + const bool attn_body, bool is_pe_dense_embedding, + [[maybe_unused]] int threads) : weights_(weights), max_batch_size_(max_batch_size), policies_(0), @@ -240,7 +242,7 @@ BlasComputation::BlasComputation( value_head_(value_head), network_(network) { #ifdef USE_DNNL - omp_set_num_threads(1); + omp_set_num_threads(threads); #endif } @@ -989,6 +991,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, max_batch_size_ = static_cast(options.GetOrDefault("batch_size", 256)); + threads_ = options.GetOrDefault("threads", 1); auto nf = file.format().network_format(); using NF = pblczero::NetworkFormat; @@ -1075,7 +1078,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, } else { #ifdef USE_OPENBLAS int num_procs = openblas_get_num_procs(); - openblas_set_num_threads(1); + openblas_set_num_threads(threads_); const char* core_name = openblas_get_corename(); const char* config = openblas_get_config(); CERR << "BLAS vendor: OpenBLAS."; @@ -1084,7 +1087,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, #endif #ifdef USE_MKL - mkl_set_num_threads(1); + mkl_set_num_threads(threads_); CERR << "BLAS vendor: MKL."; constexpr int len = 256; char versionbuf[len]; diff --git a/src/neural/backends/blas/winograd_convolution3.cc b/src/neural/backends/blas/winograd_convolution3.cc index 31f00b50df..c1687aebe6 100644 --- a/src/neural/backends/blas/winograd_convolution3.cc +++ b/src/neural/backends/blas/winograd_convolution3.cc @@ -29,8 +29,6 @@ #include "winograd_transform_ispc.h" #endif -#include - namespace lczero { template using EigenMatrixMap = diff --git a/src/neural/backends/cuda/common_kernels.cu 
b/src/neural/backends/cuda/common_kernels.cu index ea8801ec2f..bab99ce4cf 100644 --- a/src/neural/backends/cuda/common_kernels.cu +++ b/src/neural/backends/cuda/common_kernels.cu @@ -31,6 +31,7 @@ #include "cuda_common.h" #include "neural/tables/activation_function.h" #include "neural/tables/attention_policy_map.h" +#include "utils/exception.h" #include "winograd_helper.inc" namespace lczero { @@ -381,12 +382,13 @@ __global__ void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor, template void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor, - int Nin, int Cin, int Nout, int Cout, int H, int W) { + int Nin, int Cin, int Nout, int Cout, int H, int W, + cudaStream_t stream) { size_t numElements = Nout * Cout * H * W; const int blockSize = 256; int blocks = DivUp(numElements, blockSize); - NCHWtoNHWC_kernel<<>>(output_tensor, input_tensor, Nin, - Cin, Nout, Cout, H, W); + NCHWtoNHWC_kernel<<>>( + output_tensor, input_tensor, Nin, Cin, Nout, Cout, H, W); } template @@ -437,65 +439,20 @@ __global__ void batchNorm_kernel(T* output, const T* input, const T* skipInput, template void batchNorm(T* output, const T* input, const T* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation) { + ActivationFunction activation, cudaStream_t stream) { const int total_elements = N * C * H * W; const int kBlockSize = 256; int blocks = DivUp(total_elements, kBlockSize); - batchNorm_kernel<<>>(output, input, skipInput, N, C, H, W, - means, var_multipliers, activation); + batchNorm_kernel<<>>( + output, input, skipInput, N, C, H, W, means, var_multipliers, activation); ReportCUDAErrors(cudaGetLastError()); } -__global__ void expandPlanes_kernel_Fp32_NCHW(float* output, - const uint64_t* masks, - const float* values, int n) { - // Block size of 256, same mask/val for 64 consecutive threads. - constexpr int kNumShmemElements = 256 / 64; - - __shared__ uint64_t shMasks[kNumShmemElements]; - __shared__ float shVals[kNumShmemElements]; - - int index = threadIdx.x + blockDim.x * blockIdx.x; - - int planeIndex = index >> 6; - - if (planeIndex >= n) return; - - // Load inputs to shared memory. - if (threadIdx.x < kNumShmemElements) { - shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; - shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; - } - __syncthreads(); - - uint64_t mask = shMasks[threadIdx.x >> 6]; - - int sqIndex = index & 0x3F; - float op = 0; - - bool set = !!(mask & (1ull << sqIndex)); - if (set) { - op = shVals[threadIdx.x >> 6]; - } - output[index] = op; -} - -void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream) { - int threads = n * 8 * 8; // Each thread writes a single element. - const int blockSize = 256; - int blocks = DivUp(threads, blockSize); - expandPlanes_kernel_Fp32_NCHW<<>>(output, masks, - values, n); - ReportCUDAErrors(cudaGetLastError()); -} - -// TODO: Can optimize using shared memory if this becomes a bottleneck. 
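// A CPU reference (illustrative only, not code from the patch) for what the
// newly templated expandPlanes kernels below compute: each input plane is a
// 64-bit occupancy mask plus one scalar, and square sq of the expanded plane
// receives that scalar when bit sq is set, otherwise 0. The NCHW variant lays
// planes out as contiguous blocks of 64 values; the function name here is
// invented for the sketch.
#include <cstdint>
#include <vector>

std::vector<float> ExpandPlanesNCHWReference(const std::vector<uint64_t>& masks,
                                             const std::vector<float>& values) {
  std::vector<float> out(masks.size() * 64, 0.0f);
  for (size_t plane = 0; plane < masks.size(); ++plane) {
    for (int sq = 0; sq < 64; ++sq) {
      // Same predicate as the kernels: mask & (1ull << sqIndex).
      if (masks[plane] & (1ULL << sq)) out[plane * 64 + sq] = values[plane];
    }
  }
  return out;
}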
-__global__ void expandPlanes_kernel_Fp16_NHWC(half* output, - const uint64_t* masks, - const float* values, int n) { +template +__global__ void expandPlanes_kernel_NHWC(T* output, const uint64_t* masks, + const T* values, int n) { const int index = threadIdx.x + blockDim.x * blockIdx.x; if (index >= n * 8 * 8) return; @@ -505,66 +462,61 @@ __global__ void expandPlanes_kernel_Fp16_NHWC(half* output, uint64_t mask = masks[boardIndex * kInputPlanes + planeIndex]; - half op = 0; + T op = 0; bool set = !!(mask & (1ull << sqIndex)); if (set) { - float val = values[boardIndex * kInputPlanes + planeIndex]; - op = (half)val; + op = values[boardIndex * kInputPlanes + planeIndex]; } output[index] = op; } -void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream) { +template +void expandPlanes_NHWC(T* output, const uint64_t* masks, const T* values, int n, + cudaStream_t stream) { int threads = n * 8 * 8; // Each thread writes a single element. const int kBlockSize = 256; int blocks = DivUp(threads, kBlockSize); - expandPlanes_kernel_Fp16_NHWC<<>>( - output, masks, values, n); + expandPlanes_kernel_NHWC<<>>(output, masks, + values, n); ReportCUDAErrors(cudaGetLastError()); } -__global__ void expandPlanes_kernel_Fp16_NCHW(half* output, - const uint64_t* masks, - const float* values, int n) { - // block size of 256, same mask/val for 64 consecutive threads - constexpr int kNumShmemElements = 256 / 64; - - __shared__ uint64_t shMasks[kNumShmemElements]; - __shared__ half shVals[kNumShmemElements]; - - int index = threadIdx.x + blockDim.x * blockIdx.x; +template +__global__ void expandPlanes_kernel_NCHW(T* output, const uint64_t* masks, + const T* values, unsigned n) { + unsigned index = threadIdx.x + blockDim.x * blockIdx.x; - int planeIndex = index >> 6; + index *= 2; + unsigned planeIndex = index >> 6; if (planeIndex >= n) return; - // load inputs to shared memory - if (threadIdx.x < kNumShmemElements) { - shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; - shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; - } - __syncthreads(); - - uint64_t mask = shMasks[threadIdx.x >> 6]; + uint64_t mask = masks[planeIndex]; int sqIndex = index & 0x3F; - half op = 0; + T op[2] = {0, 0}; bool set = !!(mask & (1ull << sqIndex)); if (set) { - op = (half)shVals[threadIdx.x >> 6]; + op[0] = values[planeIndex]; } - output[index] = op; + sqIndex++; + set = !!(mask & (1ull << sqIndex)); + if (set) { + op[1] = values[planeIndex]; + } + output[index + 0] = op[0]; + output[index + 1] = op[1]; } -void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream) { - int threads = n * 8 * 8; // each thread writes a single element +template +void expandPlanes_NCHW(T* output, const uint64_t* masks, const T* values, + int n, cudaStream_t stream) { + unsigned threads = n * 8 * 8 / 2; // each thread writes two elements. 
const int blockSize = 256; - int blocks = DivUp(threads, blockSize); - expandPlanes_kernel_Fp16_NCHW<<>>(output, masks, - values, n); + unsigned blocks = DivUp(threads, blockSize); + expandPlanes_kernel_NCHW<<>>(output, masks, + values, n); ReportCUDAErrors(cudaGetLastError()); } @@ -704,14 +656,14 @@ __global__ void globalAvgPool_kernel(T* output, const T* input, template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias, bool nhwc) { + const T* prevLayerBias, bool nhwc, cudaStream_t stream) { const int kPlaneSize = 64; if (nhwc) { assert((std::is_same::value)); // For NHWC fp16, simply launch N blocks, each with C threads. - globalAvgPool_kernel_NHWC_fp16<<>>((half*)output, (half*)input, - (half*)prevLayerBias, - N * C * kPlaneSize, N * C); + globalAvgPool_kernel_NHWC_fp16<<>>( + (half*)output, (half*)input, (half*)prevLayerBias, N * C * kPlaneSize, + N * C); } else { // For NCHW layout (used with fp32), // each warp processes a full plane (64 elements), and writes a single @@ -722,8 +674,8 @@ void globalAvgPool(int N, int C, T* output, const T* input, const int kBlockSize = kWarpsPerBlock * 32; int blocks = DivUp(kTotalWarps, kWarpsPerBlock); - globalAvgPool_kernel<<>>(output, input, prevLayerBias, - N * C * kPlaneSize, N * C, C); + globalAvgPool_kernel<<>>( + output, input, prevLayerBias, N * C * kPlaneSize, N * C, C); } ReportCUDAErrors(cudaGetLastError()); } @@ -731,18 +683,18 @@ void globalAvgPool(int N, int C, T* output, const T* input, template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, const T* prevLayerBias, bool nhwc, - ActivationFunction activation) { + ActivationFunction activation, cudaStream_t stream) { // Each thread writes one output. const int kBlockSize = 256; const int kBlocks = DivUp(N * 8 * 8 * C, kBlockSize); if (nhwc) { assert((std::is_same::value)); - globalScale_kernel_fp16_nhwc<<>>( + globalScale_kernel_fp16_nhwc<<>>( (half*)output, (half*)input, (half*)scaleBias, (half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C, activation); } else { - globalScale_kernel<<>>( + globalScale_kernel<<>>( output, input, scaleBias, prevLayerBias, N * C * 8 * 8, C, activation); } ReportCUDAErrors(cudaGetLastError()); @@ -808,6 +760,15 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, ReportCUDAErrors(cudaGetLastError()); } +__device__ __forceinline__ float clamp(float val, float low, float high) { + if (__builtin_expect(isnan(val), 0)) return val; + return fminf(fmaxf(val, low), high); +} + +namespace { +constexpr float kTwiceHalfMax = 131008.0f; // Twice the max finite fp16 value. +} // namespace + // softmax along C dimension which is assumed to be 64 // each thread processes two elements. Each warp computes a sum (over 64 // elements) @@ -843,6 +804,11 @@ __global__ void softmax_opt_64_kernel(T* output, const T* input, x[0] += x[2]; x[1] += x[3]; } + if (fp16) { + // Guard against Inf from fp16 overflow. + x[0] = clamp(x[0], -kTwiceHalfMax, kTwiceHalfMax); + x[1] = clamp(x[1], -kTwiceHalfMax, kTwiceHalfMax); + } float threadMax = max(x[0], x[1]); float maxval = warpMax(threadMax); maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); @@ -884,6 +850,10 @@ __global__ void softmax_kernel(T* output, const T* input, const T* input2) { float x = (float)input[index]; if (input2 != nullptr) x += (float)input2[index]; + if (std::is_same::value) { + // Guard against Inf from fp16 overflow. 
+ x = clamp(x, -kTwiceHalfMax, kTwiceHalfMax); + } __shared__ float sum, maxval; if (c == 0) { @@ -1242,7 +1212,8 @@ __global__ void preprocess_for_attention_body_kernel( if (c >= input_size) { // concatenate from position encoding array if (is_pe_dense_embedding) { - op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + (c - input_size)]); + op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + + (c - input_size)]); } else { op = (T)(encoding[64 * hw + (c - input_size)]); } @@ -1309,6 +1280,64 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add, ReportCUDAErrors(cudaGetLastError()); } +template +__global__ void genOffsetPointers_kernel(T** offsets, int heads, int block_size, + int depth, int d_model, T* k, T* q, + T* b1, T* v, T* b2) { + const int i = (blockIdx.x * blockDim.x + threadIdx.x) * kWorkPerThread; + if (i >= block_size) return; + const int h = i % heads; + const int n = i / heads; + int w; + T* res[kWorkPerThread]; + for (w = 0; w < kWorkPerThread; w++) { + res[w] = k + h * depth + 64 * d_model * n + w * depth; + offsets[i + w] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = q + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b1 + i * 64 * 64 + w * 64 * 64; + offsets[i + w + 2 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = v + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 3 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b2 + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 4 * block_size] = res[w]; + } +} + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, T* v, T* b2, + cudaStream_t stream) { + const int block_size = heads * max_batch; + // Process two elements per thread to use 128 bit store instructions. + constexpr int kWorkPerThread = 2; + constexpr int kWorkGroupSize = 128; + if (block_size % kWorkPerThread != 0) { + // Handle odd block sizes. + int grid = DivUp(block_size, kWorkGroupSize); + genOffsetPointers_kernel<<>>( + offsets, heads, block_size, depth, d_model, k, q, b1, v, b2); + } else { + // Handle even block size + int grid = DivUp(block_size, kWorkGroupSize * kWorkPerThread); + genOffsetPointers_kernel + <<>>(offsets, heads, block_size, depth, + d_model, k, q, b1, v, b2); + } +} + // Template instantiation. 
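// Host-side illustration (not the device code above) of the clamp the patch
// adds to the softmax kernels above: NaN is deliberately passed through
// unchanged, and finite values are limited to +/-kTwiceHalfMax = 131008,
// i.e. twice 65504 (the largest finite half), which allows for the sum of two
// half-precision inputs computed just before the clamp.
#include <algorithm>
#include <cmath>

inline float ClampPreserveNan(float v, float lo, float hi) {
  if (std::isnan(v)) return v;  // Keep NaN so it stays detectable downstream.
  return std::min(std::max(v, lo), hi);
}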
template void copyTypeConverted(half* op, float* ip, int N, cudaStream_t stream); @@ -1322,11 +1351,13 @@ template void copyTypeConverted(half* op, half* ip, int N, template void batchNorm(float* output, const float* input, const float* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void batchNorm(half* output, const half* input, const half* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void addVectors(float* c, float* a, float* b, int size, int asize, int bsize, ActivationFunction act, @@ -1368,18 +1399,36 @@ template void addBias_NCHW(half* c, half* a, half* b, int N, int C, int H, template void globalAvgPool(int N, int C, float* output, const float* input, - const float* prevLayerBias, bool nhwc); + const float* prevLayerBias, bool nhwc, + cudaStream_t stream); template void globalAvgPool(int N, int C, half* output, const half* input, - const half* prevLayerBias, bool nhwc); + const half* prevLayerBias, bool nhwc, + cudaStream_t stream); + +template void expandPlanes_NHWC(float* output, const uint64_t* masks, + const float* values, int n, + cudaStream_t stream); +template void expandPlanes_NHWC(half* output, const uint64_t* masks, + const half* values, int n, + cudaStream_t stream); + +template void expandPlanes_NCHW(float* output, const uint64_t* masks, + const float* values, int n, + cudaStream_t stream); +template void expandPlanes_NCHW(half* output, const uint64_t* masks, + const half* values, int n, + cudaStream_t stream); template void globalScale(int N, int C, float* output, const float* input, const float* scaleBias, const float* prevLayerBias, bool nhwc, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void globalScale(int N, int C, half* output, const half* input, const half* scaleBias, const half* prevLayerBias, bool nhwc, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void PolicyMap(int N, float* output, const float* input, const short* indices, int inputSize, @@ -1391,7 +1440,7 @@ template void PolicyMap(int N, half* output, const half* input, int outputSize, cudaStream_t stream); template void FilterTransform(int N, int C, float* transformedFilter, - const float* filter); + const float* filter, cudaStream_t stream); template void InputTransform(int N, int C, float* transformed_input, @@ -1566,15 +1615,16 @@ template void ComputePromotionLogits(int N, int C, float* output, template void convertNCHWtoNHWC(half* output_tensor, const float* input_tensor, int Nin, int Cin, int Nout, int Cout, int H, - int W); + int W, cudaStream_t stream); template void convertNCHWtoNHWC(float* output_tensor, const float* input_tensor, int Nin, int Cin, int Nout, - int Cout, int H, int W); + int Cout, int H, int W, + cudaStream_t stream); template void convertNCHWtoNHWC(half* output_tensor, const half* input_tensor, int Nin, int Cin, int Nout, int Cout, int H, - int W); + int W, cudaStream_t stream); template void inputPreprocessForAttentionBody( half* output, const half* input, const half* encoding, int N, @@ -1595,5 +1645,14 @@ template void applyInputGating(float* output, const float* input, const float* mult, const float* add, int N, int C, int output_size, cudaStream_t stream); + +template void genOffsetPointers(float** offsets, 
int heads, + int max_batch, int depth, int d_model, + float* k, float* q, float* b1, float* v, + float* b2, cudaStream_t stream); +template void genOffsetPointers(half** offsets, int heads, int max_batch, + int depth, int d_model, half* k, half* q, + half* b1, half* v, half* b2, + cudaStream_t stream); } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/cuda_common.h b/src/neural/backends/cuda/cuda_common.h index ca91f0e91b..1babb7e003 100644 --- a/src/neural/backends/cuda/cuda_common.h +++ b/src/neural/backends/cuda/cuda_common.h @@ -30,7 +30,7 @@ #include #include -#include "utils/exception.h" +#include "utils/fp16_utils.h" #ifdef USE_CUDNN #include diff --git a/src/neural/backends/cuda/cutlass_kernels.cu b/src/neural/backends/cuda/cutlass_kernels.cu new file mode 100644 index 0000000000..619c839f90 --- /dev/null +++ b/src/neural/backends/cuda/cutlass_kernels.cu @@ -0,0 +1,124 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2018 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. 
+*/ + +#include "neural/backends/cuda/cuda_common.h" + +// Fused MHA implementation from cutlass example #41 +#include "fused_multi_head_attention/kernel_forward.h" +#include "utils/exception.h" + +namespace lczero { +namespace cudnn_backend { + +template +void fusedMHACutlass(void* output, void* q, void* k, void* v, void* skip, + int batch_size, int num_heads, int depth, + cudaStream_t stream) { + cutlass::half_t* mha_q = (cutlass::half_t*)q; + cutlass::half_t* mha_k = (cutlass::half_t*)k; + cutlass::half_t* mha_v = (cutlass::half_t*)v; + + constexpr int kQueriesPerBlock = 64; + constexpr int kKeysPerBlock = 64; + constexpr bool kSingleValueIteration = true; + + using Attention = + AttentionKernel; + static_assert( + !Attention::kNeedsOutputAccumulatorBuffer, + "Unhandled case in cutlass MHA: needs output accumulator buffer"); + + typename Attention::Params p; + { // set parameters + p.query_ptr = mha_q; + p.key_ptr = mha_k; + p.value_ptr = mha_v; + p.logsumexp_ptr = nullptr; // Only needed for bw + p.output_accum_ptr = nullptr; + p.output_ptr = (cutlass::half_t*)output; + p.attn_bias_ptr = (cutlass::half_t*)skip; + + p.scale = 1.0f / sqrt((float)depth); + + p.num_heads = num_heads; + p.num_batches = batch_size; + p.head_dim = depth; + p.head_dim_value = depth; + p.num_queries = 64; + p.num_keys = 64; + + // All tensors are in BMHK shapes + p.q_strideH = depth; + p.k_strideH = depth; + p.v_strideH = depth; + p.q_strideM = depth * num_heads; + p.k_strideM = depth * num_heads; + p.v_strideM = depth * num_heads; + p.q_strideB = p.q_strideM * 64; + p.k_strideB = p.k_strideM * 64; + p.v_strideB = p.v_strideM * 64; + p.o_strideM = p.head_dim_value * p.num_heads; + + p.bias_strideH = 64 * 64; + p.bias_strideM = 64; + p.bias_strideB = num_heads * p.bias_strideH; + } + + constexpr auto kernel_fn = attention_kernel_batched_impl; + int smem_bytes = sizeof(typename Attention::SharedStorage); + if (smem_bytes > 0xc000) { + ReportCUDAErrors(cudaFuncSetAttribute( + kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes)); + } + if (!Attention::check_supported(p)) { + throw Exception("Unhandled case in cutlass MHA: check_supported failed."); + } + + kernel_fn<<>>(p); + + ReportCUDAErrors(cudaGetLastError()); +} + +void fusedMHA(void* output, void* mha_q, void* mha_k, void* mha_v, void* skip, + int batch_size, int num_heads, int depth, cudaStream_t stream) { + if (skip == nullptr) { + fusedMHACutlass(output, mha_q, mha_k, mha_v, skip, batch_size, + num_heads, depth, stream); + } else { + fusedMHACutlass(output, mha_q, mha_k, mha_v, skip, batch_size, + num_heads, depth, stream); + } +} + +} // namespace cudnn_backend +} // namespace lczero diff --git a/src/neural/backends/cuda/fp16_kernels.cu b/src/neural/backends/cuda/fp16_kernels.cu index 0d93ca6459..37827ba0eb 100644 --- a/src/neural/backends/cuda/fp16_kernels.cu +++ b/src/neural/backends/cuda/fp16_kernels.cu @@ -27,6 +27,7 @@ #include "cuda_common.h" #include "neural/tables/activation_function.h" +#include "utils/exception.h" // Allow building on an old architecture. 
#if __CUDA_ARCH__ < 530 @@ -137,61 +138,61 @@ __global__ void SE_Layer_NHWC(half* output, const half* skip, const half* input, bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip, const half* input, const half* w1, const half* b1, const half* w2, const half* b2, const half* bPrev, - ActivationFunction activation) { + ActivationFunction activation, cudaStream_t stream) { // TODO: Think of more elegant way to avoid this hardcoding :-/ if (numFc1Out == 16) { if (C == 64) { - SE_Layer_NHWC<64, 16> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<64, 16><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else { // TODO: support other channel counts. throw Exception("channel count unsupported by SE layer"); } } else if (numFc1Out == 32) { if (C == 64) { - SE_Layer_NHWC<64, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<64, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 128) { - SE_Layer_NHWC<128, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<128, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 192) { - SE_Layer_NHWC<192, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<192, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 256) { - SE_Layer_NHWC<256, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<256, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 320) { - SE_Layer_NHWC<320, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<320, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 352) { - SE_Layer_NHWC<352, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<352, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 384) { - SE_Layer_NHWC<384, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<384, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else { // TODO: support other channel counts. return false; } } else if (numFc1Out == 64) { if (C == 64) { - SE_Layer_NHWC<64, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<64, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 128) { - SE_Layer_NHWC<128, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<128, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 192) { - SE_Layer_NHWC<192, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<192, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 256) { - SE_Layer_NHWC<256, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<256, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 320) { - SE_Layer_NHWC<320, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<320, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 384) { - SE_Layer_NHWC<384, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<384, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else { // TODO: support other channel counts. 
return false; @@ -474,7 +475,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, } template void FilterTransform(int N, int C, half* transformedFilter, - const half* filter); + const half* filter, cudaStream_t stream); template void InputTransform(int N, int C, half* transformed_input, const half* input, diff --git a/src/neural/backends/cuda/inputs_outputs.h b/src/neural/backends/cuda/inputs_outputs.h index 4c356994a8..89e728da84 100644 --- a/src/neural/backends/cuda/inputs_outputs.h +++ b/src/neural/backends/cuda/inputs_outputs.h @@ -27,75 +27,158 @@ #pragma once +#include +#include + +#include "cuda_common.h" #include "neural/network.h" +#include "utils/bit.h" namespace lczero { namespace cudnn_backend { +inline void ToType(float& dst, float src) { dst = src; } +inline void ToType(half& dst, float src) { + auto temp = FP32toFP16(src); + dst = bit_cast(temp); +} + +inline float FromType(float src) { return src; } +inline float FromType(half src) { + uint16_t temp = bit_cast(src); + return FP16toFP32(temp); +} + +template +struct CudaGraphCapture; + +template +struct CudaGraphExec { + ~CudaGraphExec() { + if (graph_exec_ != nullptr) { + ReportCUDAErrors(cudaGraphExecDestroy(graph_exec_)); + } + } + + CudaGraphExec& operator=(const CudaGraphCapture&); + explicit operator bool() const { return graph_exec_ != nullptr; } + + void Launch(cudaStream_t stream) { + ReportCUDAErrors(cudaGraphLaunch(graph_exec_, stream)); + } + cudaGraphExec_t graph_exec_ = nullptr; +}; + +template struct InputsOutputs { - InputsOutputs(int maxBatchSize, bool wdl, bool moves_left, + InputsOutputs(unsigned maxBatchSize, bool wdl, bool moves_left, size_t tensor_mem_size = 0, size_t scratch_size = 0, bool cublasDisableTensorCores = false) { ReportCUDAErrors(cudaHostAlloc( &input_masks_mem_, maxBatchSize * kInputPlanes * sizeof(uint64_t), cudaHostAllocMapped)); - ReportCUDAErrors( - cudaHostGetDevicePointer(&input_masks_mem_gpu_, input_masks_mem_, 0)); + ReportCUDAErrors(cudaMalloc( + &input_masks_mem_gpu_, maxBatchSize * kInputPlanes * sizeof(uint64_t))); - ReportCUDAErrors(cudaHostAlloc(&input_val_mem_, - maxBatchSize * kInputPlanes * sizeof(float), - cudaHostAllocMapped)); ReportCUDAErrors( - cudaHostGetDevicePointer(&input_val_mem_gpu_, input_val_mem_, 0)); + cudaHostAlloc(&input_val_mem_, + maxBatchSize * kInputPlanes * sizeof(input_val_mem_[0]), + cudaHostAllocMapped)); + ReportCUDAErrors(cudaMalloc( + &input_val_mem_gpu_, + maxBatchSize * kInputPlanes * sizeof(input_val_mem_gpu_[0]))); ReportCUDAErrors(cudaHostAlloc( - &op_policy_mem_, maxBatchSize * kNumOutputPolicy * sizeof(float), 0)); + &op_policy_mem_, + maxBatchSize * kNumOutputPolicy * sizeof(op_policy_mem_[0]), 0)); // Seperate device memory copy for policy output. // It's faster to write to device memory and then copy to host memory // than having the kernel write directly to it. ReportCUDAErrors(cudaMalloc( - &op_policy_mem_gpu_, maxBatchSize * kNumOutputPolicy * sizeof(float))); - - ReportCUDAErrors(cudaHostAlloc(&op_value_mem_, - maxBatchSize * (wdl ? 3 : 1) * sizeof(float), - cudaHostAllocMapped)); + &op_policy_mem_gpu_, + maxBatchSize * kNumOutputPolicy * sizeof(op_policy_mem_[0]))); + ReportCUDAErrors(cudaHostAlloc( + &op_value_mem_, maxBatchSize * (wdl ? 3 : 1) * sizeof(op_value_mem_[0]), + cudaHostAllocMapped)); + ReportCUDAErrors(cudaMalloc( + &op_value_mem_gpu_, + maxBatchSize * (wdl ? 
3 : 1) * sizeof(op_value_mem_gpu_[0]))); + if (wdl && sizeof(DataType) != sizeof(float)) { + wdl_cpu_softmax_ = std::make_unique(maxBatchSize * 2); + } + ReportCUDAErrors( + cudaEventCreateWithFlags(&upload_done_event_, cudaEventDisableTiming)); ReportCUDAErrors( - cudaHostGetDevicePointer(&op_value_mem_gpu_, op_value_mem_, 0)); + cudaEventCreateWithFlags(&policy_done_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreateWithFlags(&value_done_event_, cudaEventDisableTiming)); + ReportCUDAErrors(cudaEventCreateWithFlags(&wdl_download_done_event_, + cudaEventDisableTiming)); + ReportCUDAErrors(cudaEventCreateWithFlags(&download_done_event_, + cudaEventDisableTiming)); if (moves_left) { - ReportCUDAErrors(cudaHostAlloc(&op_moves_left_mem_, - maxBatchSize * sizeof(float), - cudaHostAllocMapped)); - ReportCUDAErrors(cudaHostGetDevicePointer(&op_moves_left_mem_gpu_, - op_moves_left_mem_, 0)); + ReportCUDAErrors(cudaHostAlloc( + &op_moves_left_mem_, maxBatchSize * sizeof(op_moves_left_mem_[0]), + cudaHostAllocMapped)); + ReportCUDAErrors( + cudaMalloc(&op_moves_left_mem_gpu_, + maxBatchSize * sizeof(op_moves_left_mem_gpu_[0]))); + ReportCUDAErrors(cudaEventCreateWithFlags(&moves_left_done_event_, + cudaEventDisableTiming)); } + ReportCUDAErrors( + cudaStreamCreateWithFlags(&exec_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaEventCreateWithFlags(&join_capture_event_, cudaEventDisableTiming)); + cuda_graphs_ = std::make_unique[]>(maxBatchSize); + // memory for network execution managed inside this structure if (tensor_mem_size) { multi_stream_ = true; - ReportCUDAErrors(cudaStreamCreate(&stream_)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking)); ReportCUDAErrors(cudaMalloc(&scratch_mem_, scratch_size)); for (auto& mem : tensor_mem_) { ReportCUDAErrors(cudaMalloc(&mem, tensor_mem_size)); - ReportCUDAErrors(cudaMemsetAsync(mem, 0, tensor_mem_size, stream_)); + ReportCUDAErrors( + cudaMemsetAsync(mem, 0, tensor_mem_size, compute_stream_)); } ReportCUBLASErrors(cublasCreate(&cublas_)); ReportCUBLASErrors(cublasSetMathMode( cublas_, cublasDisableTensorCores ? 
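// Illustrative sketch, standalone: the timing-disabled events created above
// order work across the upload/compute/download streams without blocking the
// host. The basic record/wait chain (all handles assumed already created):
#include <cuda_runtime.h>
#include <cstddef>

void PipelineStep(cudaStream_t upload, cudaStream_t compute,
                  cudaStream_t download, cudaEvent_t upload_done,
                  cudaEvent_t compute_done, void* dst_dev, const void* src_host,
                  size_t bytes) {
  // 1. Host-to-device copy on the upload stream.
  cudaMemcpyAsync(dst_dev, src_host, bytes, cudaMemcpyHostToDevice, upload);
  cudaEventRecord(upload_done, upload);

  // 2. Compute stream waits for the upload, then runs its kernels.
  cudaStreamWaitEvent(compute, upload_done, 0);
  // ... kernel launches on `compute` would go here ...
  cudaEventRecord(compute_done, compute);

  // 3. Download stream waits for compute before device-to-host copies.
  cudaStreamWaitEvent(download, compute_done, 0);
}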
CUBLAS_PEDANTIC_MATH : CUBLAS_TENSOR_OP_MATH)); - ReportCUBLASErrors(cublasSetStream(cublas_, stream_)); + ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_)); } else { multi_stream_ = false; } } ~InputsOutputs() { ReportCUDAErrors(cudaFreeHost(input_masks_mem_)); + ReportCUDAErrors(cudaFree(input_masks_mem_gpu_)); ReportCUDAErrors(cudaFreeHost(input_val_mem_)); + ReportCUDAErrors(cudaFree(input_val_mem_gpu_)); ReportCUDAErrors(cudaFreeHost(op_policy_mem_)); ReportCUDAErrors(cudaFree(op_policy_mem_gpu_)); ReportCUDAErrors(cudaFreeHost(op_value_mem_)); - if (op_moves_left_mem_ != nullptr) + ReportCUDAErrors(cudaFree(op_value_mem_gpu_)); + ReportCUDAErrors(cudaEventDestroy(upload_done_event_)); + ReportCUDAErrors(cudaEventDestroy(policy_done_event_)); + ReportCUDAErrors(cudaEventDestroy(value_done_event_)); + ReportCUDAErrors(cudaEventDestroy(wdl_download_done_event_)); + ReportCUDAErrors(cudaEventDestroy(download_done_event_)); + if (op_moves_left_mem_ != nullptr) { ReportCUDAErrors(cudaFreeHost(op_moves_left_mem_)); + ReportCUDAErrors(cudaFree(op_moves_left_mem_gpu_)); + ReportCUDAErrors(cudaEventDestroy(moves_left_done_event_)); + } + ReportCUDAErrors(cudaEventDestroy(join_capture_event_)); + ReportCUDAErrors(cudaStreamDestroy(exec_stream_)); if (multi_stream_) { for (auto mem : tensor_mem_) { @@ -106,24 +189,26 @@ struct InputsOutputs { if (head_offset_pointers_) { ReportCUDAErrors(cudaFree(head_offset_pointers_)); } - cudaStreamDestroy(stream_); - cublasDestroy(cublas_); + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); + ReportCUBLASErrors(cublasDestroy(cublas_)); } } uint64_t* input_masks_mem_; - float* input_val_mem_; - float* op_policy_mem_; - float* op_value_mem_; - float* op_moves_left_mem_ = nullptr; + DataType* input_val_mem_; + DataType* op_policy_mem_; + DataType* op_value_mem_; + DataType* op_moves_left_mem_ = nullptr; - // GPU pointers for the above allocations. + // Copies in VRAM. uint64_t* input_masks_mem_gpu_; - float* input_val_mem_gpu_; - float* op_value_mem_gpu_; - float* op_moves_left_mem_gpu_; + DataType* input_val_mem_gpu_; + DataType* op_policy_mem_gpu_; + DataType* op_value_mem_gpu_; + DataType* op_moves_left_mem_gpu_ = nullptr; - // This is a seperate copy. 
- float* op_policy_mem_gpu_; + std::unique_ptr wdl_cpu_softmax_; // memory needed to run the network owned by InputsOutputs when multi_stream // is enabled @@ -134,11 +219,82 @@ struct InputsOutputs { void** head_offset_pointers_ = nullptr; // cuda stream used to run the network - cudaStream_t stream_; + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; + + // cuda events to synchronize between streams + cudaEvent_t upload_done_event_ = nullptr; + cudaEvent_t policy_done_event_ = nullptr; + cudaEvent_t value_done_event_ = nullptr; + cudaEvent_t moves_left_done_event_ = nullptr; + cudaEvent_t wdl_download_done_event_ = nullptr; + cudaEvent_t download_done_event_ = nullptr; + + // cuda graph support + cudaStream_t exec_stream_ = nullptr; + std::unique_ptr[]> cuda_graphs_; + cudaEvent_t join_capture_event_ = nullptr; // cublas handle used to run the network - cublasHandle_t cublas_; + cublasHandle_t cublas_ = nullptr; +}; + +template +struct CudaGraphCapture { + static constexpr int kMinimumFreeMemory = 100 * 1024 * 1024; + + CudaGraphCapture(InputsOutputs& io, cudaStream_t upload_stream, + cudaStream_t download_stream) + : io_(io), + upload_stream_(upload_stream), + download_stream_(download_stream) { + ReportCUDAErrors(cudaStreamBeginCapture(upload_stream_, + cudaStreamCaptureModeThreadLocal)); + } + + ~CudaGraphCapture() { + if (graph_ != nullptr) { + ReportCUDAErrors(cudaGraphDestroy(graph_)); + } + } + + static bool EnsureEnoughFreeMemory() { + size_t free_mem = 0; + size_t total_mem = 0; + ReportCUDAErrors(cudaMemGetInfo(&free_mem, &total_mem)); + return free_mem > kMinimumFreeMemory; + } + + void EndCapture() { + ReportCUDAErrors( + cudaEventRecord(io_.join_capture_event_, download_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(upload_stream_, io_.join_capture_event_, 0)); + ReportCUDAErrors(cudaStreamEndCapture(upload_stream_, &graph_)); + } + + InputsOutputs& io_; + cudaStream_t upload_stream_; + cudaStream_t download_stream_; + + cudaGraph_t graph_ = nullptr; }; +template +inline CudaGraphExec& CudaGraphExec::operator=( + const CudaGraphCapture& graph) { + assert(graph_exec_ == nullptr); + if (graph.graph_ == nullptr) { + throw Exception("Trying to instantiate an nullptr cuda graph"); + } + ReportCUDAErrors( + cudaGraphInstantiate(&graph_exec_, graph.graph_, nullptr, nullptr, 0)); +#if CUDART_VERSION >= 11010 + ReportCUDAErrors(cudaGraphUpload(graph_exec_, graph.io_.exec_stream_)); +#endif + return *this; +} + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/kernels.h b/src/neural/backends/cuda/kernels.h index 06ad15c657..91ee87abe0 100644 --- a/src/neural/backends/cuda/kernels.h +++ b/src/neural/backends/cuda/kernels.h @@ -67,7 +67,8 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W, // params, also pad/un-pad elements from Batch or Channel dimensions template void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor, - int Nin, int Cin, int Nout, int Cout, int H, int W); + int Nin, int Cin, int Nout, int Cout, int H, int W, + cudaStream_t stream); // Plain data-type conversion (no layout conversion). 
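// Illustrative sketch, standalone: CudaGraphCapture/CudaGraphExec above wrap
// the usual capture -> instantiate -> launch lifecycle. A self-contained
// version with a made-up kernel (dev_buf is assumed to hold at least 32
// floats):
#include <cuda_runtime.h>

__global__ void bump_kernel(float* p) { p[threadIdx.x] += 1.0f; }

void CaptureAndReplay(cudaStream_t stream, float* dev_buf, int reps) {
  cudaGraph_t graph = nullptr;
  cudaGraphExec_t exec = nullptr;

  // 1. Record the work once while the stream is in capture mode.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  bump_kernel<<<1, 32, 0, stream>>>(dev_buf);
  cudaStreamEndCapture(stream, &graph);

  // 2. Instantiate the captured graph into an executable graph.
#if CUDART_VERSION >= 12000
  cudaGraphInstantiate(&exec, graph, 0);
#else
  cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
#endif
  cudaGraphDestroy(graph);

  // 3. Replay with one launch per iteration; this is the CPU-overhead saving
  //    that motivates caching one graph per batch size.
  for (int i = 0; i < reps; ++i) cudaGraphLaunch(exec, stream);
  cudaStreamSynchronize(stream);
  cudaGraphExecDestroy(exec);
}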
template @@ -77,35 +78,34 @@ void copyTypeConverted(DstType* op, SrcType* ip, int N, cudaStream_t stream); template void batchNorm(T* output, const T* input, const T* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation); + ActivationFunction activation, cudaStream_t stream); // Unpack planes (input to network). -void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream); - -void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream); +template +void expandPlanes_NHWC(T* output, const uint64_t* masks, const T* values, int n, + cudaStream_t stream); -void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream); +template +void expandPlanes_NCHW(T* output, const uint64_t* masks, const T* values, int n, + cudaStream_t stream); // Perform global avg pool. template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias, bool nhwc); + const T* prevLayerBias, bool nhwc, cudaStream_t steam); // Perform global scale. template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, const T* prevLayerBias, bool nhwc, - ActivationFunction activation); + ActivationFunction activation, cudaStream_t steam); // Perform Squeeze-and-Excitation (SE) in a single fused kernel. // Returns false if the fused kernel can't handle the sizes. bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip, const half* input, const half* w1, const half* b1, const half* w2, const half* b2, const half* bPrev, - ActivationFunction activation); + ActivationFunction activation, cudaStream_t stream); template void PolicyMap(int N, T* output, const T* input, const short* indices, @@ -114,7 +114,8 @@ void PolicyMap(int N, T* output, const T* input, const short* indices, // Custom winograd helper functions template -void FilterTransform(int N, int C, T* transformedFilter, const T* filter); +void FilterTransform(int N, int C, T* transformedFilter, const T* filter, + cudaStream_t stream); template void InputTransform(int N, int C, T* transformedInput, const T* input, @@ -157,5 +158,14 @@ void inputPreprocessForAttentionBody(T* output, const T* input, template void applyInputGating(T* output, const T* input, const T* mult, const T* add, int N, int HW, int C, cudaStream_t stream); + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, T* v, T* b2, + cudaStream_t stream); + +void fusedMHA(void* output, void* mha_q, void* mha_k, void* mha_v, void* skip, + int batch_size, int num_heads, int depth, cudaStream_t stream); + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/layers.cc b/src/neural/backends/cuda/layers.cc index 81a0b01b8b..5ae5b7f7dc 100644 --- a/src/neural/backends/cuda/layers.cc +++ b/src/neural/backends/cuda/layers.cc @@ -219,7 +219,7 @@ void ConvLayer::LoadWeights(float* pfilter, float* pBias, void* scratch) { if (nhwc_) { convertNCHWtoNHWC((half*)weights, (float*)scratch, C, c_input_, C, c_input_, - filter_size_, filter_size_); + filter_size_, filter_size_, 0); } else { copyTypeConverted((half*)weights, (float*)scratch, C * c_input_ * filter_size_ * filter_size_, 0); @@ -495,7 +495,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 1. 
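// Illustrative sketch, not the real kernel: the templated expandPlanes_NCHW /
// expandPlanes_NHWC declarations above unpack a 64-bit occupancy mask plus one
// value per plane into a dense 8x8 plane. A simplified float-only NCHW version
// (the names and exact layout here are assumptions):
#include <cuda_runtime.h>
#include <cstdint>

__global__ void expand_planes_sketch(float* output, const uint64_t* masks,
                                     const float* values, int n_planes) {
  int plane = blockIdx.x;
  int sq = threadIdx.x;  // square index 0..63
  if (plane >= n_planes) return;
  bool set = (masks[plane] >> sq) & 1ull;
  output[plane * 64 + sq] = set ? values[plane] : 0.0f;
}

void ExpandPlanesSketch(float* output, const uint64_t* masks,
                        const float* values, int n_planes,
                        cudaStream_t stream) {
  expand_planes_sketch<<<n_planes, 64, 0, stream>>>(output, masks, values,
                                                    n_planes);
}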
Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_, false); + globalAvgPool(N, C, op2, input, bPrev_, false, stream); // 2. First fully connected layer. float alpha = 1.0f, beta = 0.0f; @@ -514,7 +514,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_, false, act_); + globalScale(N, C, output, input, op2, bPrev_, false, act_, stream); } template <> @@ -525,7 +525,7 @@ void SELayer::Eval(int N, half* output, const half* input, bool se_done = false; if (kUseFusedSELayer && nhwc_) { se_done = Se_Fp16_NHWC(N, C, numFc1Out_, output, input2, input, w1_t_, b1_, - w2_t_, b2_, bPrev_, act_); + w2_t_, b2_, bPrev_, act_, stream); } if (!se_done) { assert(output == input2); @@ -535,7 +535,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 1. Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_, nhwc_); + globalAvgPool(N, C, op2, input, bPrev_, nhwc_, stream); // 2. First fully connected layer. __half_raw one_h{0x3C00}; @@ -557,7 +557,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_, nhwc_, act_); + globalScale(N, C, output, input, op2, bPrev_, nhwc_, act_, stream); } } @@ -593,7 +593,7 @@ void FCLayer::LoadWeights(float* cpuWeight, float* cpuBias, if (nhwc_) { convertNCHWtoNHWC((half*)weights_, (float*)scratch, (int)num_biases, input_->GetC(), (int)num_biases, input_->GetC(), - input_->GetH(), input_->GetW()); + input_->GetH(), input_->GetW(), 0); } else { copyTypeConverted((half*)weights_, (float*)scratch, (int)num_weights, 0); } @@ -851,7 +851,7 @@ void FusedWinogradConvSELayer::LoadWeights(float* pfilter, } // run winograd transform kernel for the filter - FilterTransform(C, c_input_, transformed_weights_, weights); + FilterTransform(C, c_input_, transformed_weights_, weights, 0); } // TODO: Do this on the GPU to improve network load time! @@ -1200,7 +1200,7 @@ void ResidualBlock::LoadWeights0(float* pfilter, float* pBias, } // run winograd transform kernel for the filter - FilterTransform(C, c_input_, transformed_weights0_, weights); + FilterTransform(C, c_input_, transformed_weights0_, weights, 0); } template @@ -1226,7 +1226,7 @@ void ResidualBlock::LoadWeights1(float* pfilter, float* pBias, } // run winograd transform kernel for the filter - FilterTransform(C, C, transformed_weights1_, weights); + FilterTransform(C, C, transformed_weights1_, weights, 0); } template @@ -1422,7 +1422,7 @@ template AttentionPolicyHead::AttentionPolicyHead( BaseLayer* ip, const MultiHeadWeights::PolicyHead& weights, void* scratch, bool attention_body, ActivationFunction act, - int max_batch_size) + int max_batch_size, bool use_gemm_ex) : BaseLayer(64 * 64 + 24 * 8, 1, 1, ip), attention_body_(attention_body), // Old networks without attention body (e.g. T79) use hardcoded SELU @@ -1474,8 +1474,9 @@ AttentionPolicyHead::AttentionPolicyHead( nullptr, 0, // smolgen weights not implemented in // policy encoder heads yet. max_batch_size, ACTIVATION_SWISH, act_, - 1e-6); // attentionbody nets don't have policy encoders, so using old - // epsilon for backward compatibility with T78. 
+ 1e-6, // attentionbody nets don't have policy encoders, so + use_gemm_ex, // using old epsilon for backward compatibility with T78. + false); encoder_weights_.emplace_back(pW); } } @@ -1485,7 +1486,8 @@ EncoderBlock::EncoderBlock( const MultiHeadWeights::EncoderLayer& cpu_weights, void* scratch, int heads, int size, float alpha, DataType* smolgen_global_scratch, int smolgen_global_size, int max_batch_size, ActivationFunction smolgen_act, - ActivationFunction ffn_act, float default_eps) + ActivationFunction ffn_act, float default_eps, bool use_gemm_ex, + bool fused_mha) : embedding_op_size_(size), encoder_heads_(heads), alpha_(alpha), @@ -1493,7 +1495,9 @@ EncoderBlock::EncoderBlock( has_smolgen_(cpu_weights.mha.has_smolgen), smolgen_activation_(smolgen_act), ffn_activation_(ffn_act), - max_batch_size_(max_batch_size) { + max_batch_size_(max_batch_size), + use_fused_mha_(fused_mha), + use_gemm_ex_(use_gemm_ex) { mha_q_size_ = cpu_weights.mha.q_b.size(); mha_k_size_ = cpu_weights.mha.k_b.size(); mha_v_size_ = cpu_weights.mha.v_b.size(); @@ -1605,7 +1609,8 @@ static void cublasXGemmStridedBatched( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float alpha, const void* A, int lda, long long int strideA, const void* B, int ldb, long long int strideB, - float beta, void* C, int ldc, long long int strideC, int batchCount) { + float beta, void* C, int ldc, long long int strideC, int batchCount, + bool use_gemm_ex) { const bool fp16 = std::is_same::value; if (fp16) { unsigned short alpha_h = FP32toFP16(alpha); @@ -1615,10 +1620,17 @@ static void cublasXGemmStridedBatched( B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC, batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); } else { - ReportCUBLASErrors(cublasGemmStridedBatchedEx( - handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B, - CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, - batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); + if (use_gemm_ex) { + ReportCUBLASErrors(cublasGemmStridedBatchedEx( + handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, + B, CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, + batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); + } else { + ReportCUBLASErrors(cublasSgemmStridedBatched( + handle, transa, transb, m, n, k, &alpha, (const float*)A, lda, + strideA, (const float*)B, ldb, strideB, &beta, (float*)C, ldc, + strideC, batchCount)); + } } } @@ -1736,7 +1748,8 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, cublasXGemmStridedBatched( cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, mha_qkv_w, num_inputs, num_inputs * num_outputs, in_out_tensor, - num_inputs, 0, 0.0f, mha_q, num_outputs, num_outputs * max_batch, 3); + num_inputs, 0, 0.0f, mha_q, num_outputs, num_outputs * max_batch, 3, + use_gemm_ex_); addBiasBatched(mha_q, mha_q, mha_qkv_b, 3, batch, num_outputs, max_batch, ACTIVATION_NONE, stream); } @@ -1760,31 +1773,33 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // shape(k)[-1] = depth float factor = 1.0f / sqrt((float)depth); +#ifdef USE_CUTLASS + if (use_fused_mha_) { + // TODO: check if we need skip in a different tensor than same tensor as + // output! + fusedMHA(buffer2, mha_q, mha_k, mha_v, has_smolgen_ ? 
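// Illustrative sketch, standalone: the use_gemm_ex flag above selects between
// the generic cublasGemmStridedBatchedEx entry point and the plain FP32
// cublasSgemmStridedBatched fallback. Minimal FP32-only version (computeType
// is passed as CUDA_R_32F to match the call above; on cuBLAS 11+
// CUBLAS_COMPUTE_32F is the equivalent modern enum):
#include <cublas_v2.h>

void BatchedGemmF32(cublasHandle_t handle, int m, int n, int k, const float* A,
                    const float* B, float* C, int batch, bool use_gemm_ex) {
  const float alpha = 1.0f, beta = 0.0f;
  long long strideA = 1LL * m * k;
  long long strideB = 1LL * k * n;
  long long strideC = 1LL * m * n;
  if (use_gemm_ex) {
    cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                               &alpha, A, CUDA_R_32F, m, strideA, B, CUDA_R_32F,
                               k, strideB, &beta, C, CUDA_R_32F, m, strideC,
                               batch, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
  } else {
    cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
                              A, m, strideA, B, k, strideB, &beta, C, m,
                              strideC, batch);
  }
}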
buffer2 : nullptr, N, + encoder_heads_, depth, stream); + } else +#endif // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { - std::vector offsets(encoder_heads_ * max_batch_size_ * 5); - for (int i = 0; i < encoder_heads_ * max_batch_size_; i++) { - int h = i % encoder_heads_; - int n = i / encoder_heads_; - offsets[i] = mha_k + h * depth + 64 * d_model * n; - offsets[i + encoder_heads_ * max_batch_size_] = - mha_q + h * depth + 64 * d_model * n; - offsets[i + 2 * encoder_heads_ * max_batch_size_] = - buffer1 + i * 64 * 64; - offsets[i + 3 * encoder_heads_ * max_batch_size_] = - mha_v + h * depth + 64 * d_model * n; - offsets[i + 4 * encoder_heads_ * max_batch_size_] = - buffer2 + h * depth + 64 * d_model * n; - } +#ifndef NDEBUG + cudaStreamCaptureStatus capture; + ReportCUDAErrors(cudaStreamIsCapturing(stream, &capture)); + assert(capture != + cudaStreamCaptureStatus::cudaStreamCaptureStatusActive && + "Stream capture is active, cannot allocate memory for offset " + "pointers"); +#endif ReportCUDAErrors( cudaMalloc((void**)offset_pointers, encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*))); - ReportCUDAErrors( - cudaMemcpy(*offset_pointers, offsets.data(), - encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*), - cudaMemcpyHostToDevice)); + genOffsetPointers((DataType**)*offset_pointers, encoder_heads_, + max_batch_size_, depth, d_model, mha_k, mha_q, buffer1, + mha_v, buffer2, stream); } + cublasXGemmBatched( cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/, depth /*K*/, // A/B, and M/N are swapped for row-major to col-major @@ -1805,20 +1820,18 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, 64 /*LDC*/, // 64 * 64 /*strideC*/, N * encoder_heads_); - } - // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) - // attention_weights -> buffer1 - if (has_smolgen_) { - // Add smolgen weights to the scaled matmul_qk attention logits before - // softmax. - Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, buffer2, stream); - } else { - Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, - (const DataType*)nullptr, stream); - } + // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) + // attention_weights -> buffer1 + if (has_smolgen_) { + // Add smolgen weights to the scaled matmul_qk attention logits before + // softmax. + Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, buffer2, stream); + } else { + Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, + (const DataType*)nullptr, stream); + } - { cublasXGemmBatched( cublas, CUBLAS_OP_N, CUBLAS_OP_N, depth /*M*/, 64 /*N*/, 64 /*K*/, 1.0f, *offset_pointers + encoder_heads_ * max_batch_size_ * @@ -1892,8 +1905,10 @@ void AttentionPolicyHead::Eval( DataType* buffer2 = input2_tensor + scratch_size / (2 * sizeof(DataType)); int inputC = this->input_->GetC(); - if (!attention_body_) - convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8); + bool input_nhwc = attention_body_ || this->input_->isNHWC(); + if (!input_nhwc) + convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8, + stream); // 1. Policy embedding (fully connected layer) // Input data in NHWC layout N*(64)*C, output is N*(64)*embedding_op_size_ @@ -1905,7 +1920,7 @@ void AttentionPolicyHead::Eval( cublasXgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, (const DataType*)ip_pol_w_, num_inputs, - attention_body_ ? input : (DataType*)scratch, + input_nhwc ? 
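// Illustrative sketch: the NDEBUG-guarded assert above exists because
// cudaMalloc/cudaMemcpy are not allowed while a stream is being captured into
// a graph, so the lazy offset-pointer allocation must happen outside capture.
// A small capture-aware guard (helper name is made up):
#include <cuda_runtime.h>

bool SafeToAllocateNow(cudaStream_t stream) {
  cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
  if (cudaStreamIsCapturing(stream, &status) != cudaSuccess) return false;
  return status == cudaStreamCaptureStatusNone;
}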
input : (DataType*)scratch, num_inputs, 0.0f, pol_embedding, num_outputs); addBiasBatched(pol_embedding, pol_embedding, ip_pol_b_, 1, batch, num_outputs, act_, stream); @@ -1929,7 +1944,7 @@ void AttentionPolicyHead::Eval( cublasXGemmStridedBatched( cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, wqk_w_, num_inputs, num_inputs * num_outputs, input2_tensor, num_inputs, - 0, 0.0f, wq, num_outputs, num_outputs * batch, 2); + 0, 0.0f, wq, num_outputs, num_outputs * batch, 2, use_gemm_ex_); addBiasBatched(wq, wq, wqk_b_, 2, batch, num_outputs, ACTIVATION_NONE, stream); @@ -1952,7 +1967,7 @@ void AttentionPolicyHead::Eval( wk /*A*/, policy_d_model_ /*LDA*/, 64 * policy_d_model_, /*strideA*/ wq /*B*/, policy_d_model_ /*LDB*/, 64 * policy_d_model_, /*strideB*/ 0.0f, output /*C*/, // output (policy_attn_logits) - 64 /*LDC*/, 64 * 64 + 8 * 24 /*strideC*/, N); + 64 /*LDC*/, 64 * 64 + 8 * 24 /*strideC*/, N, use_gemm_ex_); } // Compute promotion_logits in a single kernel (and put the result just after @@ -2045,8 +2060,10 @@ AttentionBody::AttentionBody(const MultiHeadWeights& weights, void* scratch, Activations activations, int num_res_blocks, int input_c, int max_batch_size, - bool is_pe_dense_embedding) - : BaseLayer(weights.ip_emb_b.size(), 8, 8, nullptr), + bool is_pe_dense_embedding, + bool use_gemm_ex, bool fused_mha) + : BaseLayer(weights.ip_emb_b.size(), 8, 8, nullptr, false, + use_gemm_ex), embedding_op_size_(weights.ip_emb_b.size()), encoder_head_count_(weights.encoder_head_count), activations_(activations), @@ -2055,7 +2072,8 @@ AttentionBody::AttentionBody(const MultiHeadWeights& weights, has_gating_(weights.ip_mult_gate.size() > 0 && weights.ip_add_gate.size() > 0), has_smolgen_(weights.has_smolgen), - is_pe_dense_embedding_(is_pe_dense_embedding) { + is_pe_dense_embedding_(is_pe_dense_embedding), + use_fused_mha_(fused_mha) { allocAndUpload(&ip_emb_w_, weights.ip_emb_w, scratch); allocAndUpload(&ip_emb_b_, weights.ip_emb_b, scratch); @@ -2110,7 +2128,7 @@ AttentionBody::AttentionBody(const MultiHeadWeights& weights, enc, scratch, encoder_head_count_, embedding_op_size_, alpha, smolgen_global_, smolgen_global_size_, max_batch_size, activations_.smolgen_activation, activations_.ffn_activation, - is_pe_dense_embedding_ ? 1e-3 : 1e-6); + is_pe_dense_embedding_ ? 
1e-3 : 1e-6, use_gemm_ex, use_fused_mha_); encoder_weights_.emplace_back(pW); } } @@ -2172,7 +2190,8 @@ void AttentionBody::Eval(int N, DataType* output, const int num_inputs = 64 * 12; const int batch = N; - convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, 12, 8, 8); + convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, 12, 8, 8, + stream); cublasXgemm( cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, (const DataType*)ip_emb_pre_w_, num_inputs, @@ -2207,7 +2226,8 @@ void AttentionBody::Eval(int N, DataType* output, // #redirect flow through encoder blocks // flow = tf.transpose(flow, perm = [ 0, 2, 3, 1 ]) // flow = tf.reshape(flow, [ -1, 64, self.RESIDUAL_FILTERS ]) - convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8); + convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8, + stream); } if (is_pe_dense_embedding_) { @@ -2439,6 +2459,7 @@ void CudnnError(cudnnStatus_t status, const char* file, const int& line) { char message[128]; sprintf(message, "CUDNN error: %s (%s:%d) ", cudnnGetErrorString(status), file, line); + CERR << message; throw Exception(message); } } @@ -2475,6 +2496,7 @@ void CublasError(cublasStatus_t status, const char* file, const int& line) { char message[128]; sprintf(message, "CUBLAS error: %s (%s:%d) ", CublasGetErrorString(status), file, line); + CERR << message; throw Exception(message); } } @@ -2484,6 +2506,7 @@ void CudaError(cudaError_t status, const char* file, const int& line) { char message[128]; sprintf(message, "CUDA error: %s (%s:%d) ", cudaGetErrorString(status), file, line); + CERR << message; throw Exception(message); } } diff --git a/src/neural/backends/cuda/layers.h b/src/neural/backends/cuda/layers.h index 9ba5bd286e..5c5ec871c1 100644 --- a/src/neural/backends/cuda/layers.h +++ b/src/neural/backends/cuda/layers.h @@ -29,6 +29,7 @@ #include #include +#include #include "cuda_common.h" #include "neural/network_legacy.h" @@ -340,7 +341,8 @@ class EncoderBlock { int heads, int size, float alpha, DataType* smolgen_global_scratch, int smolgen_global_size, int max_batch_size, ActivationFunction smolgen_act, - ActivationFunction ffn_act, float default_eps); + ActivationFunction ffn_act, float default_eps, bool use_gemm_ex, + bool fused_mha); ~EncoderBlock(); void Eval(int N, DataType* inpop, DataType* scratch0, DataType* scratch1, @@ -393,6 +395,8 @@ class EncoderBlock { int smol_global_size_; const int max_batch_size_; + const bool use_fused_mha_; + const bool use_gemm_ex_; }; // The Attention policy head implementation @@ -406,12 +410,14 @@ class AttentionPolicyHead : public BaseLayer { using BaseLayer::GetC; using BaseLayer::GetH; using BaseLayer::GetW; + using BaseLayer::use_gemm_ex_; public: AttentionPolicyHead(BaseLayer* ip, const MultiHeadWeights::PolicyHead& weights, void* scratch, bool attention_body, - ActivationFunction act, int max_batch_size); + ActivationFunction act, int max_batch_size, + bool use_gemm_ex); ~AttentionPolicyHead(); void Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, @@ -476,7 +482,8 @@ class AttentionBody : public BaseLayer { public: AttentionBody(const MultiHeadWeights& weights, void* scratch, Activations activations, int num_res_blocks, int input_c, - int max_batch_size, bool is_pe_dense_embedding); + int max_batch_size, bool is_pe_dense_embedding, + bool use_gemm_ex, bool fused_mha); ~AttentionBody(); void Eval(int N, DataType* output, const DataType* input, const DataType* 
input2, void* scratch, size_t scratch_size, @@ -507,6 +514,7 @@ class AttentionBody : public BaseLayer { const bool has_gating_; const bool has_smolgen_; bool is_pe_dense_embedding_; // flag for dense position encoding + const bool use_fused_mha_; }; // The value head implementation @@ -523,8 +531,8 @@ class ValueHead : public BaseLayer { public: ValueHead(BaseLayer* ip, const MultiHeadWeights::ValueHead& weights, - void* scratch, bool attention_body, bool wdl, ActivationFunction act, - int max_batch_size, bool use_gemm_ex); + void* scratch, bool attention_body, bool wdl, + ActivationFunction act, int max_batch_size, bool use_gemm_ex); ~ValueHead(); void Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, @@ -548,6 +556,5 @@ class ValueHead : public BaseLayer { ActivationFunction act_; }; - } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/network_cuda.cc b/src/neural/backends/cuda/network_cuda.cc index 43187a4316..85c80ce2e8 100644 --- a/src/neural/backends/cuda/network_cuda.cc +++ b/src/neural/backends/cuda/network_cuda.cc @@ -26,10 +26,10 @@ */ #include #include -#include #include #include #include +#include #include "cuda_common.h" #include "inputs_outputs.h" @@ -39,8 +39,17 @@ #include "neural/network_legacy.h" #include "neural/tables/attention_policy_map.h" #include "neural/tables/policy_map.h" -#include "utils/bititer.h" #include "utils/exception.h" +#include "utils/fp16_utils.h" +#include "utils/trace.h" + +#if CUDART_VERSION >= 11010 +#define CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS 1 +#else +#define CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS 0 +#undef cudaEventWaitExternal +#undef cudaEventRecordExternal +#endif namespace lczero { using namespace cudnn_backend; @@ -120,8 +129,8 @@ static size_t getMaxAttentionBodySize(const MultiHeadWeights& weights, int N) { template class CudaNetworkComputation : public NetworkComputation { public: - CudaNetworkComputation(CudaNetwork* network, - bool wdl, bool moves_left); + CudaNetworkComputation(CudaNetwork* network, bool wdl, + bool moves_left); ~CudaNetworkComputation(); void AddInput(InputPlanes&& input) override { @@ -130,11 +139,11 @@ class CudaNetworkComputation : public NetworkComputation { const auto iter_val = &inputs_outputs_->input_val_mem_[batch_size_ * kInputPlanes]; - int i = 0; - for (const auto& plane : input) { + assert(input.size() == kInputPlanes); + for (int i = 0; i < kInputPlanes; i++) { + const auto& plane = input[i]; iter_mask[i] = plane.mask; - iter_val[i] = plane.value; - i++; + ToType(iter_val[i], plane.value); } batch_size_++; @@ -142,38 +151,47 @@ class CudaNetworkComputation : public NetworkComputation { void ComputeBlocking() override; + void CaptureGraph(std::unique_lock&& lock = {}); + int GetBatchSize() const override { return batch_size_; } float GetQVal(int sample) const override { if (wdl_) { - auto w = inputs_outputs_->op_value_mem_[3 * sample + 0]; - auto l = inputs_outputs_->op_value_mem_[3 * sample + 2]; - return w - l; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? 
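// Illustrative sketch: CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS above gates the use
// of cudaEventRecordExternal/cudaEventWaitExternal (CUDA 11.1+). During stream
// capture an ordinary event record becomes a graph node; the "external" flag
// keeps it a real event the host can still synchronize on after graph replay.
// Helper name is made up:
#include <cuda_runtime.h>

void RecordDoneEvent(cudaEvent_t done, cudaStream_t stream, bool capturing) {
#if CUDART_VERSION >= 11010
  cudaEventRecordWithFlags(done, stream,
                           capturing ? cudaEventRecordExternal : 0);
#else
  (void)capturing;
  cudaEventRecord(done, stream);  // older toolkits: no external events
#endif
}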
(float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample]; } - return inputs_outputs_->op_value_mem_[sample]; + return FromType(inputs_outputs_->op_value_mem_[sample]); } float GetDVal(int sample) const override { if (wdl_) { - return inputs_outputs_->op_value_mem_[3 * sample + 1]; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? (float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample + 1]; } return 0.0f; } float GetPVal(int sample, int move_id) const override { - return inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]; + return FromType( + inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]); } float GetMVal(int sample) const override { if (moves_left_) { - return inputs_outputs_->op_moves_left_mem_[sample]; + return FromType(inputs_outputs_->op_moves_left_mem_[sample]); } return 0.0f; } private: // Memory holding inputs, outputs. - std::unique_ptr inputs_outputs_; + std::unique_ptr> inputs_outputs_; int batch_size_; bool wdl_; bool moves_left_; @@ -190,6 +208,7 @@ class CudaNetwork : public Network { file.format().network_format().moves_left()} { MultiHeadWeights weights(file.weights()); gpu_id_ = options.GetOrDefault("gpu", 0); + enable_graph_capture_ = options.GetOrDefault("graph_capture", true); const auto nf = file.format().network_format(); using NF = pblczero::NetworkFormat; @@ -210,6 +229,10 @@ class CudaNetwork : public Network { showInfo(); +#ifdef USE_CUTLASS + CERR << "Compiled with CUTLASS enabled"; +#endif + int total_gpus; ReportCUDAErrors(cudaGetDeviceCount(&total_gpus)); @@ -218,7 +241,7 @@ class CudaNetwork : public Network { cudaDeviceProp deviceProp = {}; cudaGetDeviceProperties(&deviceProp, gpu_id_); - showDeviceInfo(deviceProp); + showDeviceInfo(deviceProp, gpu_id_); l2_cache_size_ = deviceProp.l2CacheSize; sm_count_ = deviceProp.multiProcessorCount; @@ -255,7 +278,16 @@ class CudaNetwork : public Network { } if (!multi_stream_) { + ReportCUDAErrors( + cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors(cudaEventCreateWithFlags(&compute_ordering_event_, + cudaEventDisableTiming)); ReportCUBLASErrors(cublasCreate(&cublas_)); + ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_)); if (has_tensor_cores_) ReportCUBLASErrors(cublasSetMathMode( cublas_, @@ -310,6 +342,11 @@ class CudaNetwork : public Network { use_res_block_winograd_fuse_opt_ = options.Get("res_block_fusing"); } + bool use_fused_mha = false; + if (deviceProp.major >= 8 && fp16) { + use_fused_mha = options.GetOrDefault("fused_mha", true); + } + const bool use_gemm_ex = deviceProp.major >= 5; // 0. Check for SE. @@ -342,14 +379,14 @@ class CudaNetwork : public Network { std::string policy_head = options.GetOrDefault("policy_head", "vanilla"); // Check that selected policy head exists. - if (weights.policy_heads.count(policy_head) == 0) { + if (!weights.policy_heads.contains(policy_head)) { throw Exception("The policy head you specified '" + policy_head + "' does not exist in this net."); } std::string value_head = options.GetOrDefault("value_head", "winner"); // Check that selected value head exists. 
- if (weights.value_heads.count(value_head) == 0) { + if (!weights.value_heads.contains(value_head)) { throw Exception("The value head you specified '" + value_head + "' does not exist in this net."); } @@ -457,7 +494,8 @@ class CudaNetwork : public Network { numBlocks_ > 0 ? kNumFilters : kInputPlanes, max_batch_size_, static_cast( file.format().network_format().input_embedding()) == - InputEmbedding::INPUT_EMBEDDING_PE_DENSE); + InputEmbedding::INPUT_EMBEDDING_PE_DENSE, + use_gemm_ex, use_fused_mha); network_.emplace_back(std::move(attention_body)); encoder_last_ = getLastLayer(); @@ -469,7 +507,7 @@ class CudaNetwork : public Network { if (attn_policy_) { auto AttentionPolicy = std::make_unique>( getLastLayer(), head, scratch_mem_, attn_body_, act, - max_batch_size_); + max_batch_size_, use_gemm_ex); network_.emplace_back(std::move(AttentionPolicy)); auto policymap = std::make_unique>( @@ -529,8 +567,8 @@ class CudaNetwork : public Network { pblczero::NetworkFormat::VALUE_WDL; BaseLayer* lastlayer = attn_body_ ? encoder_last_ : resi_last_; auto value_main = std::make_unique>( - lastlayer, head, scratch_mem_, attn_body_, wdl_, act, - max_batch_size_, use_gemm_ex); + lastlayer, head, scratch_mem_, attn_body_, wdl_, act, max_batch_size_, + use_gemm_ex); network_.emplace_back(std::move(value_main)); } @@ -591,18 +629,86 @@ class CudaNetwork : public Network { tensor_mem_size_ = multi_stream_ ? maxSize : 0; - // pre-allocate one InputsOutputs object - // The first call to allocate memory, create cublas, - // strem, etc takes really long (600 ms) - std::unique_ptr io = GetInputsOutputs(); + // pre-allocate cuda graphs for search threads + auto allocateCudaGraphs = [&] { + ReportCUDAErrors(cudaSetDevice(gpu_id_)); + CudaNetworkComputation comp(this, wdl_, moves_left_); + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + // Make sure cublas is initialized in this thread. + comp.ComputeBlocking(); + for (int i = 0; i < GetMiniBatchSize(); i++) { + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + auto lock = LockEval(); + comp.CaptureGraph(std::move(lock)); + } + }; + std::thread t2(allocateCudaGraphs); + allocateCudaGraphs(); + t2.join(); + } + + std::unique_lock LockEval() { + if (multi_stream_) { + return {}; + } else { + return std::unique_lock{lock_}; + } + } + + bool GetGraphCaptureEnabled() const { return enable_graph_capture_; } + + CudaGraphCapture BeginCapture(InputsOutputs& io) { + if (!multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + return {io, upload_stream_, download_stream_}; +#else + return {io, compute_stream_, download_stream_}; +#endif + } else { + return {io, io.upload_stream_, io.download_stream_}; + } } - void forwardEval(InputsOutputs* io, int batchSize) { + void UploadInputs(InputsOutputs* io, int batchSize) { + // Multu-stream can capture uploads without external events. 
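// Illustrative sketch, plain C++: LockEval above returns an owning lock only on
// the single-stream path; with multi_stream_ each InputsOutputs has its own
// streams and no global lock is needed. The conditional-lock idiom:
#include <mutex>

std::unique_lock<std::mutex> MaybeLock(std::mutex& m, bool need_lock) {
  return need_lock ? std::unique_lock<std::mutex>(m)
                   : std::unique_lock<std::mutex>();
}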
+ if (multi_stream_) return; + ReportCUDAErrors( + cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_, + batchSize * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaMemcpyAsync( + io->input_val_mem_gpu_, io->input_val_mem_, + batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream_, io->upload_done_event_, 0)); + } + + void GraphLaunch(InputsOutputs* io, int batchSize) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + io->cuda_graphs_[batchSize - 1].Launch(io->exec_stream_); +#else + if (!multi_stream_) { + UploadInputs(io, batchSize); + + io->cuda_graphs_[batchSize - 1].Launch(compute_stream_); + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, compute_stream_)); + } else { + io->cuda_graphs_[batchSize - 1].Launch(io->exec_stream_); + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, io->exec_stream_)); + } +#endif + } + + void forwardEval(InputsOutputs* io, int batchSize, + [[maybe_unused]] bool capture = false) { // It is safe to evaluate larger than the batchSize // as all buffers are designed to handle max_batch_size // and the extra invalid results are never read. if (batchSize < min_batch_size_) batchSize = min_batch_size_; - if (!multi_stream_) lock_.lock(); #ifdef DEBUG_RAW_NPS auto t_start = std::chrono::high_resolution_clock::now(); @@ -610,13 +716,13 @@ class CudaNetwork : public Network { // Expand packed planes to full planes. uint64_t* ipDataMasks = io->input_masks_mem_gpu_; - float* ipDataValues = io->input_val_mem_gpu_; + auto* ipDataValues = io->input_val_mem_gpu_; DataType* tensor_mem[3]; void* scratch_mem; DataType*** offset_pointers; DataType*** head_offset_pointers; - cudaStream_t stream; + cudaStream_t compute_stream, upload_stream, download_stream; cublasHandle_t cublas; if (multi_stream_) { // We use tensor and scratch memory from InputOutputs (so that multiple @@ -625,29 +731,49 @@ class CudaNetwork : public Network { scratch_mem = io->scratch_mem_; offset_pointers = (DataType***)&io->offset_pointers_; head_offset_pointers = (DataType***)&io->head_offset_pointers_; - stream = io->stream_; + compute_stream = io->compute_stream_; + upload_stream = io->upload_stream_; + download_stream = io->download_stream_; cublas = io->cublas_; } else { for (int i = 0; i < 3; i++) tensor_mem[i] = tensor_mem_[i]; scratch_mem = scratch_mem_; offset_pointers = (DataType***)&offset_pointers_; head_offset_pointers = (DataType***)&head_offset_pointers_; - stream = 0; // default stream + compute_stream = compute_stream_; + upload_stream = upload_stream_; + download_stream = download_stream_; cublas = cublas_; } - bool fp16 = std::is_same::value; - if (fp16) { - expandPlanes_Fp16_NCHW((half*)(tensor_mem[0]), ipDataMasks, ipDataValues, - batchSize * kInputPlanes, stream); - } else { - expandPlanes_Fp32_NCHW((float*)(tensor_mem[0]), ipDataMasks, ipDataValues, - batchSize * kInputPlanes, stream); + if (multi_stream_ || CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS) { + ReportCUDAErrors( + cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_, + batchSize * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, upload_stream)); + ReportCUDAErrors(cudaMemcpyAsync( + io->input_val_mem_gpu_, io->input_val_mem_, + batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]), + cudaMemcpyHostToDevice, upload_stream)); + 
ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream, io->upload_done_event_, 0)); } - float* opPol = io->op_policy_mem_gpu_; - float* opVal = io->op_value_mem_gpu_; - float* opMov = io->op_moves_left_mem_gpu_; + if (!multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream, compute_ordering_event_, + capture ? cudaEventWaitExternal : 0)); +#endif + } + + expandPlanes_NCHW(tensor_mem[0], ipDataMasks, ipDataValues, + batchSize * kInputPlanes, compute_stream); + + auto* opPol = io->op_policy_mem_gpu_; + auto* opVal = io->op_value_mem_gpu_; + auto* opMov = io->op_moves_left_mem_gpu_; // Figure out if the memory requirment for running the res block would fit // in the L2 cache. @@ -675,7 +801,8 @@ class CudaNetwork : public Network { // we can use a single alloc to hold all the required tensors, and enable // persistent L2 caching on it ReportCUDAErrors(cudaStreamSetAttribute( - stream, cudaStreamAttributeAccessPolicyWindow, &stream_attribute)); + compute_stream, cudaStreamAttributeAccessPolicyWindow, + &stream_attribute)); enableCacheOpt = true; skip_connection = @@ -693,7 +820,7 @@ class CudaNetwork : public Network { // Input. network_[l++]->Eval(batchSize, skip_connection, tensor_mem[0], nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // input conv + compute_stream); // input conv // Residual block. for (int block = 0; block < numBlocks_; block++) { @@ -701,15 +828,15 @@ class CudaNetwork : public Network { network_[l++]->Eval(batchSize, tensor_mem[2], skip_connection, nullptr, enableCacheOpt ? nullptr : scratch_mem, scratch_size_, nullptr, cublas, - stream); // block + compute_stream); // block } else { network_[l++]->Eval(batchSize, tensor_mem[0], tensor_mem[2], nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // conv1 + compute_stream); // conv1 network_[l++]->Eval(batchSize, tensor_mem[2], tensor_mem[0], tensor_mem[2], scratch_mem, scratch_size_, - nullptr, cublas, stream); // conv2 + nullptr, cublas, compute_stream); // conv2 } } @@ -723,7 +850,7 @@ class CudaNetwork : public Network { batchSize, tensor_mem[1], (numBlocks_ > 0) ? tensor_mem[2] : tensor_mem[0], (numBlocks_ > 0) ? 
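// Illustrative sketch, standalone: the access-policy-window calls above try to
// keep the residual-block tensors resident in persisting L2. Basic enable and
// reset sequence (the window size must respect the device's
// accessPolicyMaxWindowSize; function names are made up):
#include <cuda_runtime.h>
#include <cstddef>

void EnableL2Persistence(cudaStream_t stream, void* ptr, size_t bytes) {
  cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, bytes);
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.base_ptr = ptr;
  attr.accessPolicyWindow.num_bytes = bytes;
  attr.accessPolicyWindow.hitRatio = 1.0f;
  attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  attr.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
}

void ResetL2Persistence(cudaStream_t stream) {
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.num_bytes = 0;  // clear the window
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
  cudaCtxResetPersistingL2Cache();        // evict persisting cache lines
}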
tensor_mem[0] : tensor_mem[2], scratch_mem, - scratch_size_, nullptr, cublas, stream, + scratch_size_, nullptr, cublas, compute_stream, offset_pointers); // Entire attention body of the network flow = tensor_mem[1]; @@ -735,7 +862,8 @@ class CudaNetwork : public Network { if (enableCacheOpt) { // reset the cache settings stream_attribute.accessPolicyWindow.num_bytes = 0; - cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, + cudaStreamSetAttribute(compute_stream, + cudaStreamAttributeAccessPolicyWindow, &stream_attribute); cudaCtxResetPersistingL2Cache(); } @@ -745,116 +873,131 @@ class CudaNetwork : public Network { if (attn_policy_) { network_[l++]->Eval( batchSize, spare1, flow, spare2, scratch_mem, scratch_size_, nullptr, - cublas, stream, + cublas, compute_stream, head_offset_pointers); // Entire Attention policy head except for the // policy map - if (fp16) { - network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // policy map layer - copyTypeConverted(opPol, (half*)spare2, batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy map layer // POLICY output - } + network_[l++]->Eval( + batchSize, (DataType*)opPol, spare1, nullptr, scratch_mem, + scratch_size_, nullptr, cublas, + compute_stream); // policy map layer // POLICY output } else if (conv_policy_) { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy conv1 + compute_stream); // policy conv1 network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy conv2 - - if (fp16) { - network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // policy map layer - copyTypeConverted(opPol, (half*)(spare1), batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, spare2, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy map layer // POLICY output - } + compute_stream); // policy conv2 + + network_[l++]->Eval( + batchSize, (DataType*)opPol, spare2, nullptr, scratch_mem, + scratch_size_, nullptr, cublas, + compute_stream); // policy map layer // POLICY output } else { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // pol conv - - if (fp16) { - network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // pol FC + compute_stream); // pol conv - copyTypeConverted(opPol, (half*)(spare2), batchSize * kNumOutputPolicy, - stream); // POLICY - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // pol FC // POLICY - } + network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, + scratch_mem, scratch_size_, nullptr, cublas, + compute_stream); // pol FC // POLICY } + ReportCUDAErrors(cudaEventRecord(io->policy_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->policy_done_event_, 0)); // Copy policy output from device memory to host memory. 
- ReportCUDAErrors( - cudaMemcpyAsync(io->op_policy_mem_, io->op_policy_mem_gpu_, - sizeof(float) * kNumOutputPolicy * batchSize, - cudaMemcpyDeviceToHost, stream)); + ReportCUDAErrors(cudaMemcpyAsync( + io->op_policy_mem_, io->op_policy_mem_gpu_, + sizeof(io->op_policy_mem_[0]) * kNumOutputPolicy * batchSize, + cudaMemcpyDeviceToHost, download_stream)); // value head - if (fp16) { - network_[l++]->Eval(batchSize, spare1, flow, spare2, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // value head - copyTypeConverted(opVal, (half*)spare1, wdl_ ? 3 * batchSize : batchSize, - stream); - } else { - network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // value head + network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2, scratch_mem, + scratch_size_, nullptr, cublas, + compute_stream); // value head + if (!moves_left_ && !multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaEventRecordWithFlags(compute_ordering_event_, compute_stream, + capture ? cudaEventRecordExternal : 0)); +#endif + } + ReportCUDAErrors(cudaEventRecord(io->value_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->value_done_event_, 0)); + ReportCUDAErrors(cudaMemcpyAsync( + io->op_value_mem_, io->op_value_mem_gpu_, + sizeof(io->op_value_mem_[0]) * (wdl_ ? 3 : 1) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); + + if (wdl_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventRecordWithFlags( + io->wdl_download_done_event_, download_stream, + capture ? cudaEventRecordExternal : 0)); +#endif } if (moves_left_) { // Moves left head network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // moves conv or embedding + compute_stream); // moves conv or embedding network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // moves FC1 + compute_stream); // moves FC1 // Moves left FC2 - if (fp16) { - // TODO: consider fusing the bias-add of FC2 with format conversion. - network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, stream); - copyTypeConverted(opMov, (half*)(spare1), batchSize, stream); - } else { - network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); + network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr, + scratch_mem, scratch_size_, nullptr, cublas, + compute_stream); + if (!multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaEventRecordWithFlags(compute_ordering_event_, compute_stream, + capture ? cudaEventRecordExternal : 0)); +#endif } + ReportCUDAErrors( + cudaEventRecord(io->moves_left_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->moves_left_done_event_, 0)); + ReportCUDAErrors( + cudaMemcpyAsync(io->op_moves_left_mem_, io->op_moves_left_mem_gpu_, + sizeof(io->op_moves_left_mem_[0]) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); } - - if (multi_stream_) { - ReportCUDAErrors(cudaStreamSynchronize(stream)); - } else { - ReportCUDAErrors(cudaDeviceSynchronize()); - // The next thread can start using the GPU now. - lock_.unlock(); +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaEventRecordWithFlags(io->download_done_event_, download_stream, + capture ? 
cudaEventRecordExternal : 0)); +#else + if (!capture) { + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, download_stream)); } +#endif + } + void finishEval(InputsOutputs* io, int batchSize) { +#if !CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_)); +#endif if (wdl_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventSynchronize(io->wdl_download_done_event_)); +#endif // Value softmax done cpu side. for (int i = 0; i < batchSize; i++) { - float w = io->op_value_mem_[3 * i + 0]; - float d = io->op_value_mem_[3 * i + 1]; - float l = io->op_value_mem_[3 * i + 2]; + float* wdl = sizeof(io->op_value_mem_[0]) == sizeof(float) + ? (float*)io->op_value_mem_ + : io->wdl_cpu_softmax_.get(); + float w = FromType(io->op_value_mem_[3 * i + 0]); + float d = FromType(io->op_value_mem_[3 * i + 1]); + float l = FromType(io->op_value_mem_[3 * i + 2]); float m = std::max({w, d, l}); w = std::exp(w - m); d = std::exp(d - m); @@ -862,12 +1005,14 @@ class CudaNetwork : public Network { float sum = w + d + l; w /= sum; l /= sum; - d = 1.0f - w - l; - io->op_value_mem_[3 * i + 0] = w; - io->op_value_mem_[3 * i + 1] = d; - io->op_value_mem_[3 * i + 2] = l; + d /= sum; + wdl[2 * i + 0] = w - l; + wdl[2 * i + 1] = d; } } +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_)); +#endif } ~CudaNetwork() { @@ -879,7 +1024,11 @@ class CudaNetwork : public Network { if (offset_pointers_) ReportCUDAErrors(cudaFree(offset_pointers_)); if (head_offset_pointers_) ReportCUDAErrors(cudaFree(head_offset_pointers_)); - cublasDestroy(cublas_); + ReportCUBLASErrors(cublasDestroy(cublas_)); + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); + ReportCUDAErrors(cudaEventDestroy(compute_ordering_event_)); } } @@ -892,31 +1041,41 @@ class CudaNetwork : public Network { return 2 * sm_count_; } + int GetPreferredBatchStep() const override { + int preferred_split = 7; + while (sm_count_ % preferred_split != 0) preferred_split++; + return preferred_split; + } + int GetThreads() const override { return 1 + multi_stream_; } std::unique_ptr NewComputation() override { // Set correct gpu id for this computation (as it might have been called // from a different thread). - ReportCUDAErrors(cudaSetDevice(gpu_id_)); + int device = -1; + ReportCUDAErrors(cudaGetDevice(&device)); + if (device != gpu_id_) { + ReportCUDAErrors(cudaSetDevice(gpu_id_)); + } return std::make_unique>(this, wdl_, moves_left_); } - std::unique_ptr GetInputsOutputs() { + std::unique_ptr> GetInputsOutputs() { std::lock_guard lock(inputs_outputs_lock_); if (free_inputs_outputs_.empty()) { - return std::make_unique( + return std::make_unique>( max_batch_size_, wdl_, moves_left_, tensor_mem_size_, scratch_size_, !has_tensor_cores_ && std::is_same::value); } else { - std::unique_ptr resource = + std::unique_ptr> resource = std::move(free_inputs_outputs_.front()); free_inputs_outputs_.pop_front(); return resource; } } - void ReleaseInputsOutputs(std::unique_ptr resource) { + void ReleaseInputsOutputs(std::unique_ptr> resource) { std::lock_guard lock(inputs_outputs_lock_); free_inputs_outputs_.push_back(std::move(resource)); } @@ -925,7 +1084,7 @@ class CudaNetwork : public Network { // This function invokes constructor just to please complier and silence // warning. Is never called (but compiler thinks that it could). 
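// Illustrative sketch, plain C++: finishEval above runs the value-head softmax
// on the CPU and stores only (w - l, d) per position, which is exactly what
// GetQVal/GetDVal read back. Standalone version of that math:
#include <algorithm>
#include <cmath>

void WdlSoftmax(const float raw[3], float* q_out, float* d_out) {
  float m = std::max({raw[0], raw[1], raw[2]});
  float w = std::exp(raw[0] - m);
  float d = std::exp(raw[1] - m);
  float l = std::exp(raw[2] - m);
  float sum = w + d + l;
  *q_out = (w - l) / sum;  // Q = W - L
  *d_out = d / sum;        // draw probability
}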
void UglyFunctionToSilenceNvccWarning() { - InputsOutputs io(0, false, false, false); + InputsOutputs io(0, false, false, false); } private: @@ -935,6 +1094,7 @@ class CudaNetwork : public Network { int sm_count_; int max_batch_size_; int min_batch_size_; + bool enable_graph_capture_; bool wdl_; bool moves_left_; bool use_res_block_winograd_fuse_opt_; // fuse operations inside the residual @@ -971,11 +1131,15 @@ class CudaNetwork : public Network { bool has_tensor_cores_; // not used when multi-steam is enabled + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; + cudaEvent_t compute_ordering_event_ = nullptr; cublasHandle_t cublas_; DataType* tensor_mem_[3]; mutable std::mutex inputs_outputs_lock_; - std::list> free_inputs_outputs_; + std::list>> free_inputs_outputs_; void showInfo() const { int version; @@ -996,9 +1160,12 @@ class CudaNetwork : public Network { major = CUDART_VERSION / 1000; minor = (CUDART_VERSION - major * 1000) / 10; pl = CUDART_VERSION - major * 1000 - minor * 10; - CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " - "version " - << major << "." << minor << "." << pl; + // After cuda 11, newer version with same major is OK. + if (major < 11 || (major != version / 1000) || version < CUDART_VERSION) { + CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " + "version " + << major << "." << minor << "." << pl; + } } cudaDriverGetVersion(&version); major = version / 1000; @@ -1011,11 +1178,27 @@ class CudaNetwork : public Network { } } - void showDeviceInfo(const cudaDeviceProp& deviceProp) const { + void showDeviceInfo(const cudaDeviceProp& deviceProp, + [[maybe_unused]] int deviceId) const { CERR << "GPU: " << deviceProp.name; CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30) << " Gb"; - CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz"; + // Get clock rate + float clockRateMHz; +#if CUDART_VERSION >= 13000 + int clockRatekHz; + cudaError_t err = + cudaDeviceGetAttribute(&clockRatekHz, cudaDevAttrClockRate, deviceId); + if (err != cudaSuccess) { + CERR << "Error getting clock rate: " << cudaGetErrorString(err); + clockRateMHz = 0.0f; // Fallback value + } else { + clockRateMHz = clockRatekHz / 1e3f; + } +#else + clockRateMHz = deviceProp.clockRate / 1e3f; +#endif + CERR << "GPU clock frequency: " << clockRateMHz << " MHz"; CERR << "GPU compute capability: " << deviceProp.major << "." 
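// Illustrative sketch, standalone: the CaptureGraph/ComputeBlocking pair below
// keeps one instantiated graph per batch size and falls back to a normal eval
// (which also performs the capture) the first time a batch size is seen.
// Simplified cache (struct name is made up):
#include <cuda_runtime.h>
#include <vector>

struct GraphCache {
  std::vector<cudaGraphExec_t> per_batch;  // index = batch_size - 1
  explicit GraphCache(int max_batch) : per_batch(max_batch, nullptr) {}

  // Returns false when no graph has been captured for this batch size yet.
  bool Launch(int batch_size, cudaStream_t stream) {
    cudaGraphExec_t exec = per_batch[batch_size - 1];
    if (exec == nullptr) return false;
    cudaGraphLaunch(exec, stream);
    return true;
  }
};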
<< deviceProp.minor; CERR << "L2 cache capacity: " << deviceProp.l2CacheSize; @@ -1039,9 +1222,40 @@ CudaNetworkComputation::~CudaNetworkComputation() { network_->ReleaseInputsOutputs(std::move(inputs_outputs_)); } +template +void CudaNetworkComputation::CaptureGraph( + std::unique_lock&& lock) { + if (!network_->GetGraphCaptureEnabled()) return; + if (!CudaGraphCapture::EnsureEnoughFreeMemory()) { + static std::once_flag flag; + std::call_once(flag, []() { + CERR << "WARNING: Not enough GPU memory to capture CUDA graphs."; + }); + return; + } + auto capture = network_->BeginCapture(*inputs_outputs_); + network_->forwardEval(inputs_outputs_.get(), GetBatchSize(), true); + capture.EndCapture(); + if (lock.owns_lock()) lock.unlock(); + inputs_outputs_->cuda_graphs_[GetBatchSize() - 1] = capture; +} + template void CudaNetworkComputation::ComputeBlocking() { - network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + LCTRACE_FUNCTION_SCOPE; + assert(GetBatchSize() >= 1); + if (inputs_outputs_->cuda_graphs_[GetBatchSize() - 1]) { + std::unique_lock lock = network_->LockEval(); + network_->GraphLaunch(inputs_outputs_.get(), GetBatchSize()); + } else { + std::unique_lock lock = network_->LockEval(); +#if !CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + network_->UploadInputs(inputs_outputs_.get(), GetBatchSize()); +#endif + network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + CaptureGraph(std::move(lock)); + } + network_->finishEval(inputs_outputs_.get(), GetBatchSize()); } template diff --git a/src/neural/backends/cuda/network_cudnn.cc b/src/neural/backends/cuda/network_cudnn.cc index d7b15147a2..edf7b592e6 100644 --- a/src/neural/backends/cuda/network_cudnn.cc +++ b/src/neural/backends/cuda/network_cudnn.cc @@ -26,7 +26,6 @@ */ #include #include -#include #include #include #include @@ -39,8 +38,8 @@ #include "neural/network_legacy.h" #include "neural/tables/attention_policy_map.h" #include "neural/tables/policy_map.h" -#include "utils/bititer.h" #include "utils/exception.h" +#include "utils/fp16_utils.h" // #define DEBUG_RAW_NPS @@ -99,11 +98,10 @@ class CudnnNetworkComputation : public NetworkComputation { const auto iter_val = &inputs_outputs_->input_val_mem_[batch_size_ * kInputPlanes]; - int i = 0; - for (const auto& plane : input) { + for (int i = 0; i < kInputPlanes; i++) { + const auto& plane = input[i]; iter_mask[i] = plane.mask; - iter_val[i] = plane.value; - i++; + ToType(iter_val[i], plane.value); } batch_size_++; @@ -111,41 +109,47 @@ class CudnnNetworkComputation : public NetworkComputation { void ComputeBlocking() override; + void CaptureGraph(std::unique_lock&& lock = {}); + int GetBatchSize() const override { return batch_size_; } float GetQVal(int sample) const override { if (wdl_) { - auto w = inputs_outputs_->op_value_mem_[3 * sample + 0]; - auto l = inputs_outputs_->op_value_mem_[3 * sample + 2]; - return w - l; - } else { - return inputs_outputs_->op_value_mem_[sample]; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? (float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample]; } + return FromType(inputs_outputs_->op_value_mem_[sample]); } float GetDVal(int sample) const override { if (wdl_) { - auto d = inputs_outputs_->op_value_mem_[3 * sample + 1]; - return d; - } else { - return 0.0f; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? 
(float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample + 1]; } + return 0.0f; } float GetPVal(int sample, int move_id) const override { - return inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]; + return FromType( + inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]); } float GetMVal(int sample) const override { if (moves_left_) { - return inputs_outputs_->op_moves_left_mem_[sample]; + return FromType(inputs_outputs_->op_moves_left_mem_[sample]); } return 0.0f; } private: // Memory holding inputs, outputs. - std::unique_ptr inputs_outputs_; + std::unique_ptr> inputs_outputs_; int batch_size_; bool wdl_; bool moves_left_; @@ -162,6 +166,7 @@ class CudnnNetwork : public Network { file.format().network_format().moves_left()} { MultiHeadWeights weights(file.weights()); gpu_id_ = options.GetOrDefault("gpu", 0); + enable_graph_capture_ = options.GetOrDefault("graph_capture", true); conv_policy_ = file.format().network_format().policy() == pblczero::NetworkFormat::POLICY_CONVOLUTION; @@ -189,7 +194,7 @@ class CudnnNetwork : public Network { cudaDeviceProp deviceProp = {}; cudaGetDeviceProperties(&deviceProp, gpu_id_); - showDeviceInfo(deviceProp); + showDeviceInfo(deviceProp, gpu_id_); // Select GPU to run on (for *the current* thread). ReportCUDAErrors(cudaSetDevice(gpu_id_)); @@ -229,6 +234,17 @@ class CudnnNetwork : public Network { // Override if forced from backend option if (options.Exists("nhwc")) nhwc_ = options.Get("nhwc"); } + ReportCUDAErrors( + cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors(cudaEventCreateWithFlags(&compute_ordering_event_, + cudaEventDisableTiming)); + + ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_)); + ReportCUDNNErrors(cudnnSetStream(cudnn_, compute_stream_)); if (hasTensorCores) ReportCUBLASErrors(cublasSetMathMode( @@ -527,7 +543,7 @@ class CudnnNetwork : public Network { if (attn_policy_) { auto AttentionPolicy = std::make_unique>( getLastLayer(), head, scratch_mem_, false, ACTIVATION_SELU, - max_batch_size_); + max_batch_size_, use_gemm_ex); network_.emplace_back(std::move(AttentionPolicy)); auto policymap = std::make_unique>( @@ -586,8 +602,7 @@ class CudnnNetwork : public Network { auto FCVal1 = std::make_unique>( getLastLayer(), head.ip1_val_b.size(), 1, 1, true, mish_net ? ACTIVATION_MISH : ACTIVATION_RELU); - FCVal1->LoadWeights(&head.ip1_val_w[0], &head.ip1_val_b[0], - scratch_mem_); + FCVal1->LoadWeights(&head.ip1_val_w[0], &head.ip1_val_b[0], scratch_mem_); network_.emplace_back(std::move(FCVal1)); wdl_ = file.format().network_format().value() == @@ -597,8 +612,7 @@ class CudnnNetwork : public Network { auto FCVal2 = std::make_unique>( getLastLayer(), head.ip2_val_b.size(), 1, 1, true, fc2_tanh ? 
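GetQVal()/GetDVal() above read a float (Q, D) pair that finishEval() prepares: when DataType is already float the raw value output buffer is reused in place, otherwise the pair comes from a separate CPU-side array. A small sketch of that selection (WdlBuffer is an illustrative name, not part of the backend):

    // Pick the buffer holding the post-softmax (Q, D) pairs.
    template <typename DataType>
    const float* WdlBuffer(const DataType* op_value_mem,
                           const float* cpu_softmax) {
      return sizeof(DataType) == sizeof(float)
                 ? reinterpret_cast<const float*>(op_value_mem)
                 : cpu_softmax;
    }
    // GetQVal(sample) then reads WdlBuffer(...)[2 * sample] and
    // GetDVal(sample) reads WdlBuffer(...)[2 * sample + 1].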
ACTIVATION_TANH : ACTIVATION_NONE); - FCVal2->LoadWeights(&head.ip2_val_w[0], &head.ip2_val_b[0], - scratch_mem_); + FCVal2->LoadWeights(&head.ip2_val_w[0], &head.ip2_val_b[0], scratch_mem_); network_.emplace_back(std::move(FCVal2)); } value_out_ = getLastLayer(); @@ -664,45 +678,94 @@ class CudnnNetwork : public Network { CERR << "allocated " << 3 * maxSize << " bytes of GPU memory to run the network"; #endif + + // pre-allocate cuda graphs for search threads + auto allocateCudaGraphs = [&] { + CudnnNetworkComputation comp(this, wdl_, moves_left_); + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + // Make sure cublas is initialized in this thread. + comp.ComputeBlocking(); + for (int i = 0; i < GetMiniBatchSize(); i++) { + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + auto lock = LockEval(); + comp.CaptureGraph(std::move(lock)); + } + }; + std::thread t2(allocateCudaGraphs); + allocateCudaGraphs(); + t2.join(); } - void forwardEval(InputsOutputs* io, int batchSize) { + std::unique_lock LockEval() { + return std::unique_lock{lock_}; + } + + bool GetGraphCaptureEnabled() const { return enable_graph_capture_; } + + CudaGraphCapture BeginCapture(InputsOutputs& io) { + return {io, compute_stream_, download_stream_}; + } + + void UploadInputs(InputsOutputs* io, int batchSize) { + ReportCUDAErrors( + cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_, + batchSize * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaMemcpyAsync( + io->input_val_mem_gpu_, io->input_val_mem_, + batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream_, io->upload_done_event_, 0)); + } + + void GraphLaunch(InputsOutputs* io, int batchSize) { + UploadInputs(io, batchSize); + + // cudaGraphUpload was added in CUDA 11.1 +#if CUDART_VERSION >= 11010 + // Make sure graph has completed upload before launching it. + ReportCUDAErrors(cudaStreamSynchronize(io->exec_stream_)); +#endif + + io->cuda_graphs_[batchSize - 1].Launch(compute_stream_); + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, compute_stream_)); + } + + void forwardEval(InputsOutputs* io, int batchSize, + bool capture = false) { // It is safe to evaluate larger than the batchSize // as all buffers are designed to handle max_batch_size // and the extra invalid results are never read. if (batchSize < min_batch_size_) batchSize = min_batch_size_; - std::unique_lock lock(lock_); #ifdef DEBUG_RAW_NPS auto t_start = std::chrono::high_resolution_clock::now(); #endif // TODO: consider supporting multi-stream path for cudnn backend too. - cudaStream_t stream = 0; // default stream + cudaStream_t compute_stream = compute_stream_; + cudaStream_t download_stream = download_stream_; // Expand packed planes to full planes. 
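UploadInputs() above keeps host-to-device copies off the compute stream: the copies run on a dedicated upload stream and an event makes the compute stream wait for them without blocking the host. The pattern in isolation (names are illustrative):

    #include <cuda_runtime.h>

    void AsyncUpload(const void* host, void* dev, size_t bytes,
                     cudaStream_t upload, cudaStream_t compute,
                     cudaEvent_t done) {
      cudaMemcpyAsync(dev, host, bytes, cudaMemcpyHostToDevice, upload);
      cudaEventRecord(done, upload);          // marks completion of the copies
      cudaStreamWaitEvent(compute, done, 0);  // compute waits; the host does not
    }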
- uint64_t* ipDataMasks = io->input_masks_mem_gpu_; - float* ipDataValues = io->input_val_mem_gpu_; + const uint64_t* ipDataMasks = io->input_masks_mem_gpu_; + const auto* ipDataValues = io->input_val_mem_gpu_; - bool fp16 = std::is_same::value; - if (fp16) { - if (nhwc_) - expandPlanes_Fp16_NHWC((half*)(tensor_mem_[0]), ipDataMasks, - ipDataValues, batchSize * kInputPlanes, stream); - else - expandPlanes_Fp16_NCHW((half*)(tensor_mem_[0]), ipDataMasks, - ipDataValues, batchSize * kInputPlanes, stream); - } else { - expandPlanes_Fp32_NCHW((float*)(tensor_mem_[0]), ipDataMasks, - ipDataValues, batchSize * kInputPlanes, stream); - } + if (nhwc_) + expandPlanes_NHWC(tensor_mem_[0], ipDataMasks, ipDataValues, + batchSize * kInputPlanes, compute_stream); + else + expandPlanes_NCHW(tensor_mem_[0], ipDataMasks, ipDataValues, + batchSize * kInputPlanes, compute_stream); // debug code example // dumpTensor(tensor_mem_[0], 1024, "After expand Planes", fp16); - float* opPol = io->op_policy_mem_gpu_; - float* opVal = io->op_value_mem_gpu_; - float* opMov = io->op_moves_left_mem_gpu_; + auto* opPol = io->op_policy_mem_gpu_; + auto* opVal = io->op_value_mem_gpu_; + auto* opMov = io->op_moves_left_mem_gpu_; int l = 0; // Input. @@ -710,40 +773,40 @@ class CudnnNetwork : public Network { batchSize, use_res_block_winograd_fuse_opt_ ? tensor_mem_[1] : tensor_mem_[2], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // input conv + compute_stream); // input conv // Residual block. for (int block = 0; block < numBlocks_; block++) { if (use_res_block_winograd_fuse_opt_) { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // block + compute_stream); // block } else { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // conv1 + compute_stream); // conv1 if (use_custom_winograd_) { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0], tensor_mem_[2], scratch_mem_, scratch_size_, - cudnn_, cublas_, stream); // conv2 + cudnn_, cublas_, compute_stream); // conv2 } else { // For SE Resnet, skip connection is added after SE (and bias is added // as part of SE). 
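The expandPlanes_* kernels now used for both fp16 and fp32 perform the same expansion the Metal backend previously did on the CPU: each (mask, value) pair becomes 64 floats, one per square, with the value written only where the bitboard has a bit set. A CPU reference, for orientation only:

    #include <cstdint>
    #include <vector>

    // Expand packed (mask, value) planes into dense 64-float planes.
    std::vector<float> ExpandPlanes(const std::vector<uint64_t>& masks,
                                    const std::vector<float>& values) {
      std::vector<float> out(masks.size() * 64, 0.0f);
      for (size_t p = 0; p < masks.size(); ++p) {
        for (int sq = 0; sq < 64; ++sq) {
          if (masks[p] & (1ULL << sq)) out[p * 64 + sq] = values[p];
        }
      }
      return out;
    }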
if (has_se_) { network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, stream); // conv2 + cublas_, compute_stream); // conv2 } else { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0], tensor_mem_[2], scratch_mem_, scratch_size_, - cudnn_, cublas_, stream); // conv2 + cudnn_, cublas_, compute_stream); // conv2 } if (has_se_) { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], tensor_mem_[2], scratch_mem_, scratch_size_, - cudnn_, cublas_, stream); // SE layer + cudnn_, cublas_, compute_stream); // SE layer } } } @@ -753,125 +816,110 @@ class CudnnNetwork : public Network { if (attn_policy_) { network_[l++]->Eval( batchSize, tensor_mem_[0], tensor_mem_[2], tensor_mem_[1], - scratch_mem_, scratch_size_, nullptr, cublas_, stream, + scratch_mem_, scratch_size_, nullptr, cublas_, compute_stream, &head_offset_pointers_); // Entire Attention policy head except for // the policy map - if (fp16) { - network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, - scratch_mem_, scratch_size_, nullptr, cublas_, - stream); // policy map layer - copyTypeConverted(opPol, (half*)(tensor_mem_[1]), - batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], - nullptr, scratch_mem_, scratch_size_, nullptr, - cublas_, stream); // policy map layer - // POLICY output - } + network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], nullptr, + scratch_mem_, scratch_size_, nullptr, cublas_, + compute_stream); // policy map layer + // POLICY output } else if (conv_policy_) { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // policy conv1 + compute_stream); // policy conv1 network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // policy conv2 + compute_stream); // policy conv2 - if (fp16) { - network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // policy map layer - copyTypeConverted(opPol, (half*)(tensor_mem_[0]), - batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[1], - nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, - stream); // policy map layer // POLICY output - } + network_[l++]->Eval( + batchSize, (DataType*)opPol, tensor_mem_[1], nullptr, scratch_mem_, + scratch_size_, cudnn_, cublas_, + compute_stream); // policy map layer // POLICY output } else { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // pol conv - - if (fp16) { - network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // pol FC + compute_stream); // pol conv - copyTypeConverted(opPol, (half*)(tensor_mem_[1]), - batchSize * kNumOutputPolicy, stream); // POLICY - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], - nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, stream); // pol FC // POLICY - } + network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], nullptr, + scratch_mem_, scratch_size_, cudnn_, cublas_, + compute_stream); // pol FC // POLICY } + ReportCUDAErrors(cudaEventRecord(io->policy_done_event_, compute_stream)); + ReportCUDAErrors( + 
cudaStreamWaitEvent(download_stream, io->policy_done_event_, 0)); + // Copy policy output from device memory to host memory. ReportCUDAErrors(cudaMemcpyAsync( io->op_policy_mem_, io->op_policy_mem_gpu_, - sizeof(float) * kNumOutputPolicy * batchSize, cudaMemcpyDeviceToHost)); + sizeof(io->op_policy_mem_[0]) * kNumOutputPolicy * batchSize, + cudaMemcpyDeviceToHost, download_stream)); // value head network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value conv + compute_stream); // value conv network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value FC1 + compute_stream); // value FC1 - if (fp16) { - // TODO: consider fusing the bias-add of FC2 with format conversion. - network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value FC2 - copyTypeConverted(opVal, (half*)(tensor_mem_[0]), - wdl_ ? 3 * batchSize : batchSize, stream); // VALUE - } else { - network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value FC2 // VALUE - } + network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[1], nullptr, + scratch_mem_, scratch_size_, cudnn_, cublas_, + compute_stream); // value FC2 // VALUE + + ReportCUDAErrors(cudaEventRecord(io->value_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->value_done_event_, 0)); + ReportCUDAErrors(cudaMemcpyAsync( + io->op_value_mem_, io->op_value_mem_gpu_, + sizeof(io->op_value_mem_[0]) * (wdl_ ? 3 : 1) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); if (moves_left_) { // Moves left head network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // moves conv + compute_stream); // moves conv network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // moves FC1 + compute_stream); // moves FC1 // Moves left FC2 - if (fp16) { - // TODO: consider fusing the bias-add of FC2 with format conversion. - network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); - copyTypeConverted(opMov, (half*)(tensor_mem_[0]), batchSize, stream); - } else { - network_[l++]->Eval(batchSize, (DataType*)opMov, tensor_mem_[1], - nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, stream); - } + network_[l++]->Eval(batchSize, (DataType*)opMov, tensor_mem_[1], nullptr, + scratch_mem_, scratch_size_, cudnn_, cublas_, + compute_stream); + + ReportCUDAErrors( + cudaEventRecord(io->moves_left_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->moves_left_done_event_, 0)); + ReportCUDAErrors( + cudaMemcpyAsync(io->op_moves_left_mem_, io->op_moves_left_mem_gpu_, + sizeof(io->op_moves_left_mem_[0]) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); } - ReportCUDAErrors(cudaDeviceSynchronize()); - // The next thread can start using the GPU now. - lock.unlock(); + if (!capture) { + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, download_stream)); + } + } + void finishEval(InputsOutputs* io, int batchSize) { + ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_)); if (wdl_) { // Value softmax done cpu side. 
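Each output head now records its own event on the compute stream so the download stream can start copying that head's results while later heads are still being evaluated; the host only blocks once, in finishEval(), on the final download event. The recurring step looks roughly like this (names are illustrative):

    #include <cuda_runtime.h>

    void DownloadWhenReady(cudaStream_t compute, cudaStream_t download,
                           cudaEvent_t head_done, void* host, const void* dev,
                           size_t bytes) {
      cudaEventRecord(head_done, compute);          // head finished on compute
      cudaStreamWaitEvent(download, head_done, 0);  // download waits for it
      cudaMemcpyAsync(host, dev, bytes, cudaMemcpyDeviceToHost, download);
    }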
for (int i = 0; i < batchSize; i++) { - float w = io->op_value_mem_[3 * i + 0]; - float d = io->op_value_mem_[3 * i + 1]; - float l = io->op_value_mem_[3 * i + 2]; + float* wdl = sizeof(io->op_value_mem_[0]) == sizeof(float) + ? (float*)io->op_value_mem_ + : io->wdl_cpu_softmax_.get(); + float w = FromType(io->op_value_mem_[3 * i + 0]); + float d = FromType(io->op_value_mem_[3 * i + 1]); + float l = FromType(io->op_value_mem_[3 * i + 2]); float m = std::max({w, d, l}); w = std::exp(w - m); d = std::exp(d - m); @@ -879,10 +927,9 @@ class CudnnNetwork : public Network { float sum = w + d + l; w /= sum; l /= sum; - d = 1.0f - w - l; - io->op_value_mem_[3 * i + 0] = w; - io->op_value_mem_[3 * i + 1] = d; - io->op_value_mem_[3 * i + 2] = l; + d /= sum; + wdl[2 * i + 0] = w - l; + wdl[2 * i + 1] = d; } } @@ -921,6 +968,9 @@ class CudnnNetwork : public Network { ReportCUDAErrors(cudaFree(head_offset_pointers_)); cudnnDestroy(cudnn_); cublasDestroy(cublas_); + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); } const NetworkCapabilities& GetCapabilities() const override { @@ -930,25 +980,29 @@ class CudnnNetwork : public Network { std::unique_ptr NewComputation() override { // Set correct gpu id for this computation (as it might have been called // from a different thread). - ReportCUDAErrors(cudaSetDevice(gpu_id_)); + int device = -1; + ReportCUDAErrors(cudaGetDevice(&device)); + if (device != gpu_id_) { + ReportCUDAErrors(cudaSetDevice(gpu_id_)); + } return std::make_unique>(this, wdl_, moves_left_); } - std::unique_ptr GetInputsOutputs() { + std::unique_ptr> GetInputsOutputs() { std::lock_guard lock(inputs_outputs_lock_); if (free_inputs_outputs_.empty()) { - return std::make_unique(max_batch_size_, wdl_, - moves_left_); + return std::make_unique>(max_batch_size_, wdl_, + moves_left_); } else { - std::unique_ptr resource = + std::unique_ptr> resource = std::move(free_inputs_outputs_.front()); free_inputs_outputs_.pop_front(); return resource; } } - void ReleaseInputsOutputs(std::unique_ptr resource) { + void ReleaseInputsOutputs(std::unique_ptr> resource) { std::lock_guard lock(inputs_outputs_lock_); free_inputs_outputs_.push_back(std::move(resource)); } @@ -956,7 +1010,9 @@ class CudnnNetwork : public Network { // Apparently nvcc doesn't see constructor invocations through make_unique. // This function invokes constructor just to please complier and silence // warning. Is never called (but compiler thinks that it could). 
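The CPU-side value softmax in finishEval() now stores the (W-L, D) pair the search consumes instead of rewriting the three probabilities. Pulled out as a standalone function, it is a numerically stable three-way softmax:

    #include <algorithm>
    #include <cmath>

    // Convert raw (w, d, l) logits into expected outcome Q = W-L and draw
    // probability D.
    void WdlSoftmax(float w, float d, float l, float* q_out, float* d_out) {
      const float m = std::max({w, d, l});
      w = std::exp(w - m);
      d = std::exp(d - m);
      l = std::exp(l - m);
      const float sum = w + d + l;
      *q_out = (w - l) / sum;
      *d_out = d / sum;
    }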
- void UglyFunctionToSilenceNvccWarning() { InputsOutputs io(0, false, false, false); } + void UglyFunctionToSilenceNvccWarning() { + InputsOutputs io(0, false, false, false); + } private: const NetworkCapabilities capabilities_; @@ -965,6 +1021,7 @@ class CudnnNetwork : public Network { int gpu_id_; int max_batch_size_; int min_batch_size_; + bool enable_graph_capture_; bool wdl_; bool moves_left_; @@ -999,7 +1056,12 @@ class CudnnNetwork : public Network { size_t scratch_size_; mutable std::mutex inputs_outputs_lock_; - std::list> free_inputs_outputs_; + std::list>> free_inputs_outputs_; + + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; + cudaEvent_t compute_ordering_event_ = nullptr; void showInfo() const { int version; @@ -1020,16 +1082,20 @@ class CudnnNetwork : public Network { major = CUDART_VERSION / 1000; minor = (CUDART_VERSION - major * 1000) / 10; pl = CUDART_VERSION - major * 1000 - minor * 10; - CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " - "version " - << major << "." << minor << "." << pl; + // After cuda 11, newer version with same major is OK. + if (major < 11 || (major != version / 1000) || version < CUDART_VERSION) { + CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " + "version " + << major << "." << minor << "." << pl; + } } version = (int)cudnnGetVersion(); major = version / 1000; minor = (version - major * 1000) / 100; pl = version - major * 1000 - minor * 100; CERR << "Cudnn version: " << major << "." << minor << "." << pl; - if (version != CUDNN_VERSION) { + // Assuming CUDNN > 7. + if (major != CUDNN_MAJOR || minor < CUDNN_MINOR) { CERR << "WARNING: CUDNN Runtime version mismatch, was compiled with " "version " << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL; @@ -1045,11 +1111,27 @@ class CudnnNetwork : public Network { } } - void showDeviceInfo(const cudaDeviceProp& deviceProp) const { + void showDeviceInfo(const cudaDeviceProp& deviceProp, + [[maybe_unused]] int deviceId) const { CERR << "GPU: " << deviceProp.name; CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30) << " GiB"; - CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz"; + // Get clock rate + float clockRateMHz; +#if CUDART_VERSION >= 13000 + int clockRatekHz; + cudaError_t err = + cudaDeviceGetAttribute(&clockRatekHz, cudaDevAttrClockRate, deviceId); + if (err != cudaSuccess) { + CERR << "Error getting clock rate: " << cudaGetErrorString(err); + clockRateMHz = 0.0f; // Fallback value + } else { + clockRateMHz = clockRatekHz / 1e3f; + } +#else + clockRateMHz = deviceProp.clockRate / 1e3f; +#endif + CERR << "GPU clock frequency: " << clockRateMHz << " MHz"; CERR << "GPU compute capability: " << deviceProp.major << "." 
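The clock-frequency report is handled the same way in both backends: with CUDA 13 toolkits the code no longer reads deviceProp.clockRate and instead queries the attribute API, which reports kHz. As a self-contained helper (assuming the attribute is available on the device):

    #include <cuda_runtime.h>

    float ClockRateMHz(int device_id) {
      int khz = 0;
      if (cudaDeviceGetAttribute(&khz, cudaDevAttrClockRate, device_id) !=
          cudaSuccess) {
        return 0.0f;  // fall back to "unknown", as the warning path above does
      }
      return khz / 1e3f;
    }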
<< deviceProp.minor; @@ -1078,9 +1160,37 @@ CudnnNetworkComputation::~CudnnNetworkComputation() { network_->ReleaseInputsOutputs(std::move(inputs_outputs_)); } +template +void CudnnNetworkComputation::CaptureGraph( + std::unique_lock&& lock) { + if (!network_->GetGraphCaptureEnabled()) return; + if (!CudaGraphCapture::EnsureEnoughFreeMemory()) { + static std::once_flag flag; + std::call_once(flag, []() { + CERR << "WARNING: Not enough GPU memory to capture CUDA graphs."; + }); + return; + } + CudaGraphCapture capture = network_->BeginCapture(*inputs_outputs_); + network_->forwardEval(inputs_outputs_.get(), GetBatchSize(), true); + capture.EndCapture(); + if (lock.owns_lock()) lock.unlock(); + inputs_outputs_->cuda_graphs_[GetBatchSize() - 1] = capture; +} + template void CudnnNetworkComputation::ComputeBlocking() { - network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + assert(GetBatchSize() >= 1); + if (inputs_outputs_->cuda_graphs_[GetBatchSize() - 1]) { + std::unique_lock lock = network_->LockEval(); + network_->GraphLaunch(inputs_outputs_.get(), GetBatchSize()); + } else { + std::unique_lock lock = network_->LockEval(); + network_->UploadInputs(inputs_outputs_.get(), GetBatchSize()); + network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + CaptureGraph(std::move(lock)); + } + network_->finishEval(inputs_outputs_.get(), GetBatchSize()); } template diff --git a/src/neural/backends/cuda/winograd_helper.inc b/src/neural/backends/cuda/winograd_helper.inc index 72e9828bb9..749181eee4 100644 --- a/src/neural/backends/cuda/winograd_helper.inc +++ b/src/neural/backends/cuda/winograd_helper.inc @@ -843,14 +843,15 @@ __global__ __launch_bounds__( } template -void FilterTransform(int N, int C, T* transformedFilter, const T* filter) { +void FilterTransform(int N, int C, T* transformedFilter, const T* filter, + cudaStream_t stream) { // Each thread processes entire filter block (input 3x3 elements -> output 6x6 // elements) const int kBlockSize = 64; const int kBlocks = DivUp(N * C, kBlockSize); - filterTransform_kernel<<>>(N, C, N * C, - transformedFilter, filter); + filterTransform_kernel<<>>( + N, C, N * C, transformedFilter, filter); ReportCUDAErrors(cudaGetLastError()); } diff --git a/src/neural/backends/metal/metal_common.h b/src/neural/backends/metal/metal_common.h index a42c00dcac..0c76d7395b 100644 --- a/src/neural/backends/metal/metal_common.h +++ b/src/neural/backends/metal/metal_common.h @@ -36,14 +36,13 @@ static int kInputPlanes = 112; struct InputsOutputs { InputsOutputs(int maxBatchSize, bool wdl, bool moves_left, bool conv_policy, bool attn_policy) { - input_masks_mem_.reserve(maxBatchSize * kInputPlanes); - input_val_mem_.reserve(maxBatchSize * kInputPlanes); - input_val_mem_expanded_.reserve(maxBatchSize * kInputPlanes * 64); - op_policy_mem_.reserve(maxBatchSize * kNumOutputPolicy); - op_value_mem_.reserve(maxBatchSize * (wdl ? 3 : 1)); + input_masks_mem_.resize(maxBatchSize * kInputPlanes); + input_val_mem_.resize(maxBatchSize * kInputPlanes); + op_policy_mem_.resize(maxBatchSize * kNumOutputPolicy); + op_value_mem_.resize(maxBatchSize * (wdl ? 3 : 1)); if (moves_left) { - op_moves_left_mem_.reserve(maxBatchSize); + op_moves_left_mem_.resize(maxBatchSize); }; /** @@ -53,16 +52,15 @@ struct InputsOutputs { * Remove this op_policy_raw_mem_ memory allocation when bug is fixed. 
*/ if (attn_policy) { - op_policy_raw_mem_.reserve(maxBatchSize * (64 * 64 + 8 * 24)); + op_policy_raw_mem_.resize(maxBatchSize * (64 * 64 + 8 * 24)); } else if (conv_policy) { - op_policy_raw_mem_.reserve(maxBatchSize * 73 * 64); + op_policy_raw_mem_.resize(maxBatchSize * 73 * 64); } } ~InputsOutputs() {} std::vector input_masks_mem_; std::vector input_val_mem_; - std::vector input_val_mem_expanded_; std::vector op_policy_mem_; std::vector op_value_mem_; std::vector op_moves_left_mem_; diff --git a/src/neural/backends/metal/mps/MetalNetworkBuilder.h b/src/neural/backends/metal/mps/MetalNetworkBuilder.h index 74ddd6bcaa..869e014005 100644 --- a/src/neural/backends/metal/mps/MetalNetworkBuilder.h +++ b/src/neural/backends/metal/mps/MetalNetworkBuilder.h @@ -51,7 +51,7 @@ class MetalNetworkBuilder { Activations& activations, std::string& policy_head, std::string& value_head); - void forwardEval(float* inputs, int batchSize, + void forwardEval(float* values, uint64_t* masks, int batchSize, std::vector output_mems); private: diff --git a/src/neural/backends/metal/mps/MetalNetworkBuilder.mm b/src/neural/backends/metal/mps/MetalNetworkBuilder.mm index 2be155975a..7791d13d85 100644 --- a/src/neural/backends/metal/mps/MetalNetworkBuilder.mm +++ b/src/neural/backends/metal/mps/MetalNetworkBuilder.mm @@ -36,13 +36,12 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a MetalNetworkBuilder::MetalNetworkBuilder(void){} MetalNetworkBuilder::~MetalNetworkBuilder(void){} -//void MetalNetworkBuilder::init(void* weights, void* options) std::string MetalNetworkBuilder::init(int gpu_id) { // All metal devices. NSArray> * devices = MTLCopyAllDevices(); - if ([devices count] <= gpu_id) { + if ((NSUInteger)gpu_id >= [devices count]) { // No GPU device matching ID. [NSException raise:@"Could not find device" format:@"Could not find a GPU or CPU compute device with specified id"]; return ""; @@ -68,13 +67,17 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a NSString * policyHead = [NSString stringWithUTF8String:policy_head.c_str()]; NSString * valueHead = [NSString stringWithUTF8String:value_head.c_str()]; - // 0. Input placeholder. - // @todo - placeholder can be made directly as NHWC to avoid transposes. + // 0. Input value and mask placeholders. 
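The metal_common.h switch from reserve() to resize() fixes a genuine bug rather than a style issue: reserve() only allocates capacity and leaves size() at zero, so taking &v[0] and indexing into these buffers was undefined behaviour. In miniature:

    #include <vector>

    void ReserveVsResize(size_t n) {
      std::vector<float> buf;
      buf.reserve(n);  // capacity n, size still 0: buf[0] is undefined behaviour
      buf.resize(n);   // size n, elements value-initialized: buf[0] is valid
    }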
MPSGraphTensor * layer = [graph inputPlaceholderWithInputChannels:kInputPlanes - height:8 - width:8 label:@"inputs"]; + MPSGraphTensor * maskTensor = [graph maskPlaceholderWithInputChannels:kInputPlanes + label:@"inputs/mask"]; + + layer = [graph expandInputTensorWithMask:maskTensor + input:layer + label:@"inputs/expand"]; + const NSUInteger kernelSize = 3; const bool isPeDenseEmbedding = embedding == InputEmbedding::INPUT_EMBEDDING_PE_DENSE; @@ -302,11 +305,11 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a } } -void MetalNetworkBuilder::forwardEval(float * inputs, int batchSize, std::vector output_mems) +void MetalNetworkBuilder::forwardEval(float * inputs, uint64_t * masks, int batchSize, std::vector output_mems) { @autoreleasepool { Lc0NetworkGraph * graph = [Lc0NetworkGraph getGraphAt:[NSNumber numberWithInt:this->gpu_id]]; - [graph runInferenceWithBatchSize:batchSize inputs:inputs outputs:&output_mems[0]]; + [graph runInferenceWithBatchSize:batchSize inputs:inputs masks:masks outputs:&output_mems[0]]; } } diff --git a/src/neural/backends/metal/mps/NetworkGraph.h b/src/neural/backends/metal/mps/NetworkGraph.h index 2664b68c7d..dfc163cc48 100644 --- a/src/neural/backends/metal/mps/NetworkGraph.h +++ b/src/neural/backends/metal/mps/NetworkGraph.h @@ -50,12 +50,13 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat // Input tensor and tensor data placeholders. MPSGraphTensor * _inputTensor; + MPSGraphTensor * _maskTensor; // Variables to track results of graph inference. NSArray * _resultTensors; NSArray * _targetTensors; NSMutableDictionary * _resultDataDicts; - NSMutableDictionary * _readVariables; + NSMutableDictionary * _readVariables; // Variables for triple buffering dispatch_semaphore_t _doubleBufferingSemaphore; @@ -72,10 +73,20 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat -(nonnull instancetype) initWithDevice:(id __nonnull)device; -(nonnull MPSGraphTensor *) inputPlaceholderWithInputChannels:(NSUInteger)channels - height:(NSUInteger)height - width:(NSUInteger)width label:(NSString * __nullable)label; +-(nonnull MPSGraphTensor *) maskPlaceholderWithInputChannels:(NSUInteger)channels + label:(NSString * __nullable)label; + +-(nonnull MPSGraphTensor *) expandInputTensorWithMask:(MPSGraphTensor * __nonnull)maskTensor + input:(MPSGraphTensor * __nonnull)inputTensor + label:(NSString * __nonnull)label; + +- (nonnull MPSGraphTensor *) broadcastByStackingTensor:(MPSGraphTensor * __nonnull)input + axis:(NSInteger)axis + times:(NSUInteger)times + name:(NSString * __nonnull)name; + -(nonnull MPSGraphTensor *) addConvolutionBlockWithParent:(MPSGraphTensor * __nonnull)parent outputChannels:(NSUInteger)outputChannels kernelSize:(NSUInteger)kernelSize @@ -199,9 +210,11 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat -(nonnull NSArray *) runInferenceWithBatchSize:(NSUInteger)batchSize inputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks outputs:(float * __nonnull * __nonnull)outputBuffers; -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks subBatch:(NSUInteger)subBatch subBatchSize:(NSUInteger)subBatchSize; diff --git a/src/neural/backends/metal/mps/NetworkGraph.mm b/src/neural/backends/metal/mps/NetworkGraph.mm index 0befa256e6..322308e67b 100644 --- a/src/neural/backends/metal/mps/NetworkGraph.mm +++ b/src/neural/backends/metal/mps/NetworkGraph.mm @@ -25,9 +25,11 @@ Toolkit and the 
NVIDIA CUDA Deep Neural Network library (or a Program grant you additional permission to convey the resulting work. */ +#import #import "neural/network_legacy.h" +#import "neural/tables/attention_policy_map.h" +#import "neural/tables/policy_map.h" #import "NetworkGraph.h" -#import static MPSGraphConvolution2DOpDescriptor * __nonnull convolution2DDescriptor = [MPSGraphConvolution2DOpDescriptor descriptorWithStrideInX:1 strideInY:1 @@ -66,13 +68,12 @@ -(NSUInteger) size { -(NSUInteger) sizeOfDimensions:(NSArray *)dimensions { NSUInteger size = 1; for (NSNumber * dim in dimensions) { - if ([dim intValue] < [self.shape count]) - size *= [self.shape[[dim intValue]] intValue]; + if ((NSUInteger)[dim intValue] < [self.shape count]) + size *= [self.shape[(NSUInteger)[dim intValue]] intValue]; } return size; } - -(NSUInteger) sizeOfDimensionsFrom:(NSNumber *)dimension { NSUInteger size = 1; for (NSUInteger dim = [dimension intValue]; dim < [self.shape count]; dim++) { @@ -137,6 +138,7 @@ -(nonnull instancetype) initWithDevice:(id __nonnull)device -(nonnull NSArray *) runInferenceWithBatchSize:(NSUInteger)batchSize inputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks outputs:(float * __nonnull * __nonnull)outputBuffers { // Calculate number of sub-batches to split across GPU command buffers for parallel execution. @@ -144,18 +146,20 @@ -(nonnull instancetype) initWithDevice:(id __nonnull)device NSUInteger splits = (batchSize + kMinSubBatchSize + 1) / kMinSubBatchSize; if (splits > kMaxInflightBuffers) splits = kMaxInflightBuffers; NSUInteger subBatchSize = batchSize / splits; - NSUInteger inputDataLength = subBatchSize * [_inputTensor sizeOfDimensions:@[@1, @2, @3]]; + NSUInteger inputDataLength = subBatchSize * [_inputTensor sizeOfDimensionsFrom:@1]; // Split batchSize into smaller sub-batches and run using double-buffering. NSUInteger subBatch = 0; MPSCommandBuffer * commandBuffer; for (subBatch = 0; subBatch < splits - 1; subBatch++) { commandBuffer = [self runCommandSubBatchWithInputs:inputs + subBatch * inputDataLength + masks:masks + subBatch * inputDataLength subBatch:subBatch subBatchSize:subBatchSize]; } // Last sub-batch may be smaller or larger than others. MPSCommandBuffer * latestCommandBuffer = [self runCommandSubBatchWithInputs:inputs + subBatch * inputDataLength + masks:masks + subBatch * inputDataLength subBatch:subBatch subBatchSize:batchSize - subBatch * subBatchSize]; @@ -169,6 +173,7 @@ -(nonnull instancetype) initWithDevice:(id __nonnull)device } -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks subBatch:(NSUInteger)subBatch subBatchSize:(NSUInteger)subBatchSize { @@ -178,7 +183,7 @@ -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)in // Create command buffer for this sub-batch. 
MPSCommandBuffer * commandBuffer = [MPSCommandBuffer commandBufferFromCommandQueue:_queue]; - MPSShape * shape = @[@(subBatchSize), _inputTensor.shape[1], _inputTensor.shape[2], _inputTensor.shape[3]]; + MPSShape * shape = @[@(subBatchSize), _inputTensor.shape[1], _inputTensor.shape[2]]; NSData * inputData = [NSData dataWithBytesNoCopy:inputs length:subBatchSize * sizeof(float) @@ -189,17 +194,32 @@ -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)in shape:shape dataType:_inputTensor.dataType]; + NSData * maskData = [NSData dataWithBytesNoCopy:masks + length:subBatchSize * sizeof(uint64_t) + freeWhenDone:NO]; + + MPSGraphTensorData * inputMaskData = [[MPSGraphTensorData alloc] initWithDevice:_device + data:maskData + shape:shape + dataType:MPSDataTypeUInt64]; + + NSDictionary * feeds = @{_inputTensor : inputTensorData, _maskTensor : inputMaskData}; + // Create execution descriptor with block to update results for each iteration. MPSGraphExecutionDescriptor * executionDescriptor = [[MPSGraphExecutionDescriptor alloc] init]; - executionDescriptor.completionHandler = ^(MPSGraphTensorDataDictionary * resultDictionary, NSError * error) { - _resultDataDicts[@(subBatch)] = resultDictionary; + executionDescriptor.completionHandler = ^(MPSGraphTensorDataDictionary * resultDictionary, NSError * _Nullable error) { + if (error) { + NSLog(@"Error occurred during execution: %@", error); + } else { + _resultDataDicts[@(subBatch)] = resultDictionary; + } // Release double buffering semaphore for the next training iteration to be encoded. dispatch_semaphore_signal(_doubleBufferingSemaphore); }; [self encodeToCommandBuffer:commandBuffer - feeds:@{_inputTensor : inputTensorData} + feeds:feeds targetTensors:_targetTensors targetOperations:nil executionDescriptor:executionDescriptor]; @@ -226,9 +246,6 @@ -(void) copyResultsToBuffers:(float * __nonnull * __nonnull)outputBuffers -(void) setResultTensors:(NSArray * __nonnull)results { - // Okay to remove nulls from the read variables. - [_readVariables removeObjectsForKeys:[_readVariables allKeysForObject:[NSNull null]]]; - // Set the results we're interested in. _resultTensors = results; @@ -238,16 +255,110 @@ -(void) setResultTensors:(NSArray * __nonnull)results } -(nonnull MPSGraphTensor *) inputPlaceholderWithInputChannels:(NSUInteger)channels - height:(NSUInteger)height - width:(NSUInteger)width label:(NSString * __nullable)label { - // Create a placeholder tensor that can hold the specified number of sub-batches. - _inputTensor = [self placeholderWithShape:@[@(-1), @(channels), @(height), @(width)] name:label]; - + _inputTensor = [self placeholderWithShape:@[@(-1), @(channels), @1] + dataType:MPSDataTypeFloat32 + name:label]; return _inputTensor; } +-(nonnull MPSGraphTensor *) maskPlaceholderWithInputChannels:(NSUInteger)channels + label:(NSString * __nullable)label +{ + _maskTensor = [self placeholderWithShape:@[@(-1), @(channels), @1] + dataType:MPSDataTypeUInt64 + name:label]; + return _maskTensor; +} + +-(nonnull MPSGraphTensor *) expandInputTensorWithMask:(MPSGraphTensor * __nonnull)maskTensor + input:(MPSGraphTensor * __nonnull)valueTensor + label:(NSString * __nonnull)label +{ + // 64 values to form the bitboard indices. 
+ uint64_t bitIndices[64]; + for (int i = 0; i < 64; i++) { + bitIndices[i] = 1ULL << i; + } + NSData * bitIndicesData = [NSData dataWithBytesNoCopy:bitIndices + length:64 * sizeof(uint64_t) + freeWhenDone:NO]; + + MPSGraphTensor * bitIndicesTensor = [self constantWithData:bitIndicesData + shape:@[@1, @1, @64] + dataType:MPSDataTypeUInt64]; + + // Broadcast mask and bit index tensors to [N,C,64] + maskTensor = [self broadcastByStackingTensor:maskTensor + axis:3 + times:64 + name:[NSString stringWithFormat:@"%@/mask/broadcast", label]]; + + MPSGraphTensor * expandedMaskTensor; + if (@available(macOS 13.0, *)) { + // Expand the bitmap using the masks and values. + expandedMaskTensor = [self bitwiseANDWithPrimaryTensor:maskTensor + secondaryTensor:bitIndicesTensor + name:[NSString stringWithFormat:@"%@/mask/bitwise_and", label]]; + + MPSGraphTensor * zeroTensor = [self constantWithScalar:0.0 + shape:@[@1] + dataType:MPSDataTypeUInt64]; + + expandedMaskTensor = [self notEqualWithPrimaryTensor:expandedMaskTensor + secondaryTensor:zeroTensor + name:[NSString stringWithFormat:@"%@/zero_equals", label]]; + } else { + // Alternative method: bitwise ops not available in earlier macos versions, so using integer division and modulo. + // Divide by the bit index, which is also a power of 2, to shift the desired bit to position 0. + expandedMaskTensor = [self divisionWithPrimaryTensor:maskTensor + secondaryTensor:bitIndicesTensor + name:[NSString stringWithFormat:@"%@/mask/divide", label]]; + + // Take modulo 2 to extract the least significant bit + MPSGraphTensor * twoTensor = [self constantWithScalar:2.0 + shape:@[@1] + dataType:MPSDataTypeUInt64]; + + expandedMaskTensor = [self moduloWithPrimaryTensor:expandedMaskTensor + secondaryTensor:twoTensor + name:[NSString stringWithFormat:@"%@/mask/modulo", label]]; + } + + // Broadcast input tensor values to match the expanded dimensions. 
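The pre-macOS 13 fallback above extracts bit k without bitwise operators by dividing by 2^k and taking the result modulo 2, which is exactly what the graph's division and modulo nodes compute per square. The scalar identity, for reference:

    #include <cstdint>

    // Equals (mask >> k) & 1, using only division and modulo.
    inline uint64_t BitViaDivMod(uint64_t mask, int k) {
      const uint64_t pow2 = uint64_t(1) << k;  // same constants as bitIndices
      return (mask / pow2) % 2;
    }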
+ valueTensor = [self broadcastByStackingTensor:valueTensor + axis:3 + times:64 + name:[NSString stringWithFormat:@"%@/input/broadcast", label]]; + + expandedMaskTensor = [self castTensor:expandedMaskTensor + toType:MPSDataTypeFloat32 + name:[NSString stringWithFormat:@"%@/input/cast", label]]; + + // Final multiplication: value * mask + expandedMaskTensor = [self multiplicationWithPrimaryTensor:expandedMaskTensor + secondaryTensor:valueTensor + name:[NSString stringWithFormat:@"%@/input/multiply", label]]; + + // Reshape to final output format [batch_size, kInputPlanes, 8, 8] + return [self reshapeTensor:expandedMaskTensor + withShape:@[@(-1), valueTensor.shape[1], @8, @8] + name:[NSString stringWithFormat:@"%@/input/reshape", label]]; +} + +- (nonnull MPSGraphTensor *) broadcastByStackingTensor:(MPSGraphTensor * __nonnull)input + axis:(NSInteger)axis + times:(NSUInteger)times + name:(NSString * __nonnull)name +{ + NSMutableArray * stackedTensors = [NSMutableArray array]; + for (NSUInteger i = 0; i < times; i++) { + [stackedTensors addObject:input]; + } + return [self stackTensors:stackedTensors axis:axis name:name]; +} + -(nonnull MPSGraphTensor *) addConvolutionBlockWithParent:(MPSGraphTensor * __nonnull)parent outputChannels:(NSUInteger)outputChannels kernelSize:(NSUInteger)kernelSize @@ -471,23 +582,37 @@ -(nonnull MPSGraphTensor *) addSEUnitWithParent:(MPSGraphTensor * __nonnull)pare } -(nonnull MPSGraphTensor *) addPolicyMapLayerWithParent:(MPSGraphTensor * __nonnull)parent - policyMap:(uint32_t * __nonnull)policyMap + policyMap:(const short * __nonnull)policyMap + mapSize:(NSUInteger)mapSize label:(NSString * __nonnull)label { - NSData * policyMapData = [NSData dataWithBytesNoCopy:policyMap - length:kNumPolicyOutputs * sizeof(uint32_t) - freeWhenDone:NO]; + if ([parent sizeOfDimensionsFrom:@1] < mapSize) { + [NSException raise:@"Invalid parent tensor shape" + format:@"Parent tensor non-batch dimensions (%zu) is less than mapping tensor size of (%zu) for policy mapping.", + [parent sizeOfDimensionsFrom:@1], mapSize]; + } - MPSGraphTensor * mappingTensor = [self constantWithData:policyMapData + // The mapping is an array of 64x?? squares, where each square contains a number from -1 to 1857. + // The mapping is flattened to a 1D array of size 1858, where each index corresponds to a square + // that had a value != -1. 
+ uint32_t mappingIndices[kNumPolicyOutputs]; + for (NSUInteger i = 0; i < mapSize; i++) { + if (policyMap[i] == -1) continue; + mappingIndices[policyMap[i]] = i; + } + + NSData * policyMapIndexData = [NSData dataWithBytesNoCopy:mappingIndices + length:kNumPolicyOutputs * sizeof(uint32_t) + freeWhenDone:NO]; + + MPSGraphTensor * indicesTensor = [self constantWithData:policyMapIndexData shape:@[@(kNumPolicyOutputs)] dataType:MPSDataTypeUInt32]; - MPSGraphTensor * flatConvTensor = [self flatten2DTensor:parent - axis:1 - name:[NSString stringWithFormat:@"%@/flatten", label]]; + parent = [self flatten2DTensor:parent axis:1 name:[NSString stringWithFormat:@"%@/flatten", label]]; - MPSGraphTensor * policyTensor = [self gatherWithUpdatesTensor:flatConvTensor - indicesTensor:mappingTensor + MPSGraphTensor * policyTensor = [self gatherWithUpdatesTensor:parent + indicesTensor:indicesTensor axis:1 batchDimensions:0 name:[NSString stringWithFormat:@"%@/gather", label]]; @@ -506,7 +631,6 @@ -(nonnull MPSGraphTensor *) addEncoderLayerWithParent:(MPSGraphTensor * __nonnul normtype:(NSString * __nonnull)normtype label:(NSString * __nonnull)label { - NSUInteger dModel = encoder.mha.q_b.size(); MPSGraphTensor * mhaQ = [self addFullyConnectedLayerWithParent:parent outputChannels:encoder.mha.q_b.size() weights:&encoder.mha.q_w[0] @@ -605,15 +729,16 @@ -(nonnull MPSGraphTensor *) addEncoderLayerWithParent:(MPSGraphTensor * __nonnul label:[NSString stringWithFormat:@"%@/ln2", label]]; } else if ([normtype isEqual:@"rmsnorm"] || [normtype isEqual:@"skipfirst"]) { - enc = [self addRmsNormalizationWithParent:enc - scaledSecondaryTensor:ffn - gammas:&encoder.ln2_gammas[0] - alpha:alpha - label:[NSString stringWithFormat:@"%@/ln1", label]]; + return [self addRmsNormalizationWithParent:enc + scaledSecondaryTensor:ffn + gammas:&encoder.ln2_gammas[0] + alpha:alpha + label:[NSString stringWithFormat:@"%@/ln1", label]]; } else { [NSException raise:@"Invalid normalization type." 
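addPolicyMapLayerWithParent: now builds its gather indices on the fly: the lc0 policy-map tables map each raw head output to a policy index (or -1 for unused slots), so inverting the table gives, for each of the kNumPolicyOutputs entries, which raw output to read. A CPU sketch of the same inversion and gather (names are illustrative):

    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> InvertPolicyMap(const short* map, size_t map_size,
                                          size_t num_policy_outputs) {
      std::vector<uint32_t> indices(num_policy_outputs, 0);
      for (size_t i = 0; i < map_size; ++i) {
        if (map[i] >= 0) indices[map[i]] = static_cast<uint32_t>(i);
      }
      return indices;
    }
    // The gather then reads policy[j] = raw_output[indices[j]] for every j.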
format:@"Invalid normalization type specified: %@", normtype]; + return nil; } } @@ -882,7 +1007,8 @@ -(nonnull MPSGraphTensor *) scaledQKMatmulWithQueries:(MPSGraphTensor * __nonnul qkMatmul = [self multiplicationWithPrimaryTensor:qkMatmul secondaryTensor:[self constantWithScalar:scale - shape:@[@1] dataType:qkMatmul.dataType] + shape:@[@1] + dataType:qkMatmul.dataType] name:[NSString stringWithFormat:@"%@/scale", label]]; return qkMatmul; } @@ -944,6 +1070,14 @@ -(nonnull MPSGraphTensor *) attentionPolicyPromoMatmulConcatWithParent:(MPSGraph parent = [self reshapeTensor:parent withShape:@[@(-1), @64, @64] name:[NSString stringWithFormat:@"%@/parent_reshape", label]]; + MPSGraphTensor * slice = [self sliceTensor:parent dimension:1 start:48 length:8 name:[NSString stringWithFormat:@"%@/slice_policy_1", label]]; + slice = [self sliceTensor:slice dimension:2 start:56 length:8 name:[NSString stringWithFormat:@"%@/slice_policy_2", label]]; + slice = [self reshapeTensor:slice withShape:@[@(-1), @64] name:[NSString stringWithFormat:@"%@/slice_reshape", label]]; + slice = [self broadcastByStackingTensor:slice axis:2 times:3 name:[NSString stringWithFormat:@"%@/slice_broadcast", label]]; + slice = [self transposeTensor:slice dimension:1 withDimension:2 name:[NSString stringWithFormat:@"%@/slice_transpose", label]]; + + promo = [self additionWithPrimaryTensor:promo secondaryTensor:slice name:[NSString stringWithFormat:@"%@/offset_add", label]]; + return [self concatTensor:parent withTensor:promo dimension:1 name:[NSString stringWithFormat:@"%@/concat", label]]; } @@ -1263,7 +1397,8 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull scale:1.0f / sqrt(policyDModel) label:[NSString stringWithFormat:@"%@/self_attention/kq", label]]; - // 6. Slice last 8 keys (k[:, 56:, :]) and matmul with policy promotion weights, then concat to matmul_qk. + // 6. Slice last 8 keys (k[:, 48:56, 56:64]) and matmul with policy promotion weights, + // add to promotion logits then concat to matmul_qk. policy = [self attentionPolicyPromoMatmulConcatWithParent:policy withKeys:keys weights:&head.ip4_pol_w[0] @@ -1272,6 +1407,12 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull sliceFrom:56 channelSize:policyDModel label:[NSString stringWithFormat:@"%@/promo_logits", label]]; + + policy = [self addPolicyMapLayerWithParent:policy + policyMap:&lczero::kAttnPolicyMap[0] + mapSize:(64 * 64 + 8 * 24) + label:[NSString stringWithFormat:@"%@/policy_mapping", label]]; + } else if (convolutionPolicy) { if (attentionBody) { @@ -1296,30 +1437,10 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull label:[NSString stringWithFormat:@"%@/conv2", label]]; - /** - * @todo policy map implementation has bug in MPSGraph (GatherND not working in graph). - * Implementation of policy map to be done in CPU for now. - * - * Reinstate this section when bug is fixed. See comments below. 
- * - // [1858 -> HWC or CHW] - const bool HWC = false; - std::vector policy_map(1858); - for (const auto& mapping : kConvPolicyMap) { - if (mapping == -1) continue; - const auto index = &mapping - kConvPolicyMap; - const auto displacement = index / 64; - const auto square = index % 64; - const auto row = square / 8; - const auto col = square % 8; - if (HWC) { - policy_map[mapping] = ((row * 8) + col) * 80 + displacement; - } else { - policy_map[mapping] = ((displacement * 8) + row) * 8 + col; - } - } - policy = builder_->makePolicyMapLayer(policy, &policy_map[0], "policy_map"); - */ + policy = [self addPolicyMapLayerWithParent:policy + policyMap:&lczero::kConvPolicyMap[0] + mapSize:(73 * 64) + label:[NSString stringWithFormat:@"%@/policy_mapping", label]]; } else { if (attentionBody) { @@ -1391,10 +1512,10 @@ -(nonnull MPSGraphTensor *) makeValueHeadWithTensor:(MPSGraphTensor * __nonnull) value = [self addFullyConnectedLayerWithParent:value outputChannels:head.ip2_val_b.size() - weights:&head.ip2_val_w[0] + weights:&head.ip2_val_w[0] biases:&head.ip2_val_b[0] activation:wdl ? @"softmax" : @"tanh" - label:[NSString stringWithFormat:@"%@/fc2", label]]; + label:[NSString stringWithFormat:@"%@/fc2", label]]; return value; } diff --git a/src/neural/backends/metal/network_metal.cc b/src/neural/backends/metal/network_metal.cc index 0a45eb74da..46f29459b5 100644 --- a/src/neural/backends/metal/network_metal.cc +++ b/src/neural/backends/metal/network_metal.cc @@ -160,99 +160,30 @@ MetalNetwork::MetalNetwork(const WeightsFile& file, const OptionsDict& options) "' does not exist in this net."); } - auto embedding = static_cast(file.format().network_format().input_embedding()); - builder_->build(kInputPlanes, weights, embedding, attn_body, attn_policy_, conv_policy_, - wdl_, moves_left_, activations, policy_head, value_head); + auto embedding = static_cast( + file.format().network_format().input_embedding()); + builder_->build(kInputPlanes, weights, embedding, attn_body, attn_policy_, + conv_policy_, wdl_, moves_left_, activations, policy_head, + value_head); } void MetalNetwork::forwardEval(InputsOutputs* io, int batchSize) { - // Expand encoded input into N x 112 x 8 x 8. - float* dptr = &io->input_val_mem_expanded_[0]; - for (size_t i = 0; i < batchSize; i++) { - for (size_t j = 0; j < kInputPlanes; j++) { - const float value = io->input_val_mem_[j + i * kInputPlanes]; - const uint64_t mask = io->input_masks_mem_[j + i * kInputPlanes]; - for (auto k = 0; k < 64; k++) { - *(dptr++) = (mask & (((uint64_t)1) << k)) != 0 ? value : 0; - } - } - } - // Metal is not thread-safe, so lock is needed. lock_.lock(); - if (attn_policy_ || conv_policy_) { - /** - * @todo policy map implementation has bug in MPSGraph (GatherND not working - * in graph). Implementation of policy map to be done in CPU for now. - * - * Remove this if-branch when bug is fixed. See comments above. - */ - - if (moves_left_) { - builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_raw_mem_[0], &io->op_value_mem_[0], - &io->op_moves_left_mem_[0]}); - } else { - builder_->forwardEval( - &io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_raw_mem_[0], &io->op_value_mem_[0]}); - } - // The next thread can start using the GPU now. - lock_.unlock(); - - if (attn_policy_) { - // Promotion offset calculation. 
- for (size_t batch = 0; batch < batchSize; batch++) { - for (int k = 0; k < 8; k++) { // y in cuda - for (int j = 0; j < 8; j++) { // w in cuda - for (int i = 0; i < 3; i++) { // c in cuda - // Promotion offsets already precalculated and stored in GPU. - // Just the main policy offsets need to be added here. - io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + 64 * 64 + - 24 * k + 3 * j + i] += - io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + - (48 + k) * 64 + 56 + j]; - } - } - } - } - // Mapping from attention policy to lc0 policy - for (size_t batch = 0; batch < batchSize; batch++) { - for (size_t i = 0; i < 64 * 64 + 8 * 24; i++) { - size_t j = kAttnPolicyMap[i]; - if (j >= 0) { - io->op_policy_mem_[batch * 1858 + j] = - io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + i]; - } - } - } - } else if (conv_policy_) { - // Mapping from convolutional policy to lc0 policy - for (size_t batch = 0; batch < batchSize; batch++) { - for (size_t i = 0; i < 73 * 64; i++) { - short j = kConvPolicyMap[i]; - if (j >= 0) { - io->op_policy_mem_[batch * 1858 + j] = - io->op_policy_raw_mem_[batch * 80 * 64 + i]; - } - } - } - } - + if (moves_left_) { + builder_->forwardEval(&io->input_val_mem_[0], &io->input_masks_mem_[0], + batchSize, + {&io->op_policy_mem_[0], &io->op_value_mem_[0], + &io->op_moves_left_mem_[0]}); } else { - if (moves_left_) { - builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_mem_[0], &io->op_value_mem_[0], - &io->op_moves_left_mem_[0]}); - } else { - builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_mem_[0], &io->op_value_mem_[0]}); - } - - // The next thread can start using the GPU now. - lock_.unlock(); + builder_->forwardEval(&io->input_val_mem_[0], &io->input_masks_mem_[0], + batchSize, + {&io->op_policy_mem_[0], &io->op_value_mem_[0]}); } + + // The next thread can start using the GPU now. + lock_.unlock(); } std::unique_ptr MakeMetalNetwork(const std::optional& w, diff --git a/src/neural/backends/network_demux.cc b/src/neural/backends/network_demux.cc index a1a28f779f..accf9bd12f 100644 --- a/src/neural/backends/network_demux.cc +++ b/src/neural/backends/network_demux.cc @@ -25,125 +25,221 @@ Program grant you additional permission to convey the resulting work. */ +#include +#include #include +#include +#include #include #include #include "neural/factory.h" -#include "utils/exception.h" namespace lczero { namespace { +class DemuxingComputation; + +struct DemuxingWork { + DemuxingComputation* source_ = nullptr; + std::unique_ptr computation_; + int start_ = 0; + int end_ = 0; + + DemuxingWork(int sample) : end_(sample) {} + DemuxingWork(DemuxingComputation* source, int start, int end) + : source_(source), start_(start), end_(end) { + assert(start_ != end_); + } + + auto operator<=>(const DemuxingWork& b) const { return end_ <=> b.end_; } +}; + class DemuxingNetwork; -class DemuxingComputation : public NetworkComputation { +class DemuxingBackend; +class DemuxingComputation final : public NetworkComputation { + std::tuple&, int> GetParent( + int sample) const { + auto iter = std::lower_bound(parents_.begin(), parents_.end(), sample + 1); + assert(iter != parents_.end()); + assert(sample >= iter->start_); + assert(sample < iter->end_); + return {iter->computation_, sample - iter->start_}; + } + public: DemuxingComputation(DemuxingNetwork* network) : network_(network) {} + ~DemuxingComputation() { + // Wait for other threads to stop using this object. 
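GetParent() above relies on the work items being kept in ascending order of their end offset (DemuxingWork's operator<=> compares end_), so a lower_bound on sample + 1 lands on the item whose [start_, end_) range contains the sample. The same lookup with an explicit comparator (Range and FindRange are illustrative stand-ins; the sample is assumed to fall inside one of the ranges, as the asserts above require):

    #include <algorithm>
    #include <utility>
    #include <vector>

    struct Range { int start; int end; };

    std::pair<int, int> FindRange(const std::vector<Range>& ranges, int sample) {
      auto it = std::lower_bound(
          ranges.begin(), ranges.end(), sample + 1,
          [](const Range& r, int value) { return r.end < value; });
      // Returns the owning range's index and the offset within it.
      return {static_cast<int>(it - ranges.begin()), sample - it->start};
    }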
It must be spinloop for + // correct synchronization between notify_one and destructor. + while (dataready_.load(std::memory_order_acquire) != -1) { + SpinloopPause(); + } + } - void AddInput(InputPlanes&& input) override { planes_.emplace_back(input); } + void AddInput(InputPlanes&& input) override { + planes_.emplace_back(std::move(input)); + } void ComputeBlocking() override; int GetBatchSize() const override { return planes_.size(); } float GetQVal(int sample) const override { - const int idx = sample / partial_size_; - const int offset = sample % partial_size_; - return parents_[idx]->GetQVal(offset); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetQVal(offset); } float GetDVal(int sample) const override { - int idx = sample / partial_size_; - int offset = sample % partial_size_; - return parents_[idx]->GetDVal(offset); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetDVal(offset); } float GetMVal(int sample) const override { - int idx = sample / partial_size_; - int offset = sample % partial_size_; - return parents_[idx]->GetMVal(offset); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetMVal(offset); } float GetPVal(int sample, int move_id) const override { - const int idx = sample / partial_size_; - const int offset = sample % partial_size_; - return parents_[idx]->GetPVal(offset, move_id); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetPVal(offset, move_id); } void NotifyComplete() { - std::unique_lock lock(mutex_); - dataready_--; - if (dataready_ == 0) { + if (1 == dataready_.fetch_sub(1, std::memory_order_release)) { + { + std::lock_guard lock(mutex_); + } dataready_cv_.notify_one(); + dataready_.store(-1, std::memory_order_release); } } - NetworkComputation* AddParentFromNetwork(Network* network) { - std::unique_lock lock(mutex_); - parents_.emplace_back(network->NewComputation()); - const int cur_idx = (parents_.size() - 1) * partial_size_; - for (int i = cur_idx; i < std::min(GetBatchSize(), cur_idx + partial_size_); - i++) { - parents_.back()->AddInput(std::move(planes_[i])); - } - return parents_.back().get(); - } - private: std::vector planes_; DemuxingNetwork* network_; - std::vector> parents_; + std::vector parents_; + + std::mutex mutex_; + std::condition_variable dataready_cv_; + std::atomic dataready_ = -1; + + friend class DemuxingBackend; +}; + +class DemuxingBackend { + public: + ~DemuxingBackend() { + while (!threads_.empty()) { + threads_.back().join(); + threads_.pop_back(); + } + while (!queue_.empty()) { + queue_.front()->source_->NotifyComplete(); + queue_.pop(); + } + } + + void Assign(std::unique_ptr&& network, const OptionsDict& opts, + std::atomic& abort) { + network_ = std::move(network); + int nn_threads = opts.GetOrDefault("threads", 0); + if (nn_threads == 0) { + nn_threads = network_->GetThreads(); + } + for (int i = 0; i < nn_threads; i++) { + threads_.emplace_back([&] { Worker(abort); }); + } + } + + void Enqueue(DemuxingWork* work) { + { + std::unique_lock lock(mutex_); + queue_.push(work); + } + dataready_cv_.notify_one(); + } + + void Abort() { + { + std::unique_lock lock(mutex_); + } + dataready_cv_.notify_all(); + } + void Worker(std::atomic& abort) { + while (!abort.load(std::memory_order_relaxed)) { + DemuxingWork* work = nullptr; + { + std::unique_lock lock(mutex_); + dataready_cv_.wait(lock, [&] { + return abort.load(std::memory_order_relaxed) || !queue_.empty(); + }); + if 
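NotifyComplete() is careful about the classic notify-versus-destruction race: the last worker briefly takes the waiter's mutex before notifying and only publishes -1 afterwards, and -1 is the value the destructor spin-waits on, so the object cannot be torn down while notify_one is still executing. Condensed into a standalone helper (Completion is an illustrative name):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    struct Completion {
      std::mutex m;
      std::condition_variable cv;
      std::atomic<int> pending{-1};  // -1 means "idle / safe to destroy"

      void WorkerDone() {
        if (pending.fetch_sub(1, std::memory_order_release) == 1) {
          { std::lock_guard<std::mutex> lock(m); }  // order with the waiter
          cv.notify_one();
          pending.store(-1, std::memory_order_release);
        }
      }

      // Caller stores the worker count into `pending` before handing out work.
      void Wait() {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock,
                [&] { return pending.load(std::memory_order_acquire) <= 0; });
      }
    };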
(abort.load(std::memory_order_relaxed)) return; + if (!queue_.empty()) { + work = queue_.front(); + queue_.pop(); + } + } + if (work) { + work->computation_ = network_->NewComputation(); + auto& planes = work->source_->planes_; + for (int i = work->start_; i < work->end_; i++) { + work->computation_->AddInput(std::move(planes[i])); + } + work->computation_->ComputeBlocking(); + work->source_->NotifyComplete(); + } + } + } + + private: std::mutex mutex_; std::condition_variable dataready_cv_; - int dataready_ = 0; - int partial_size_ = 0; + std::vector threads_; + std::unique_ptr network_; + std::queue queue_; }; -class DemuxingNetwork : public Network { +class DemuxingNetwork final : public Network { public: DemuxingNetwork(const std::optional& weights, - const OptionsDict& options) { - minimum_split_size_ = options.GetOrDefault("minimum-split-size", 0); + const OptionsDict& options) + : backends_(std::max(size_t(1), options.ListSubdicts().size())) { const auto parents = options.ListSubdicts(); if (parents.empty()) { // If options are empty, or multiplexer configured in root object, // initialize on root object and default backend. auto backends = NetworkFactory::Get()->GetBackendsList(); - AddBackend(backends[0], weights, options); + AddBackend(0, backends[0], weights, options); } + int i = 0; for (const auto& name : parents) { - AddBackend(name, weights, options.GetSubdict(name)); + AddBackend(i++, name, weights, options.GetSubdict(name)); } } - void AddBackend(const std::string& name, + void AddBackend(int index, const std::string& name, const std::optional& weights, const OptionsDict& opts) { const std::string backend = opts.GetOrDefault("backend", name); - networks_.emplace_back( - NetworkFactory::Get()->Create(backend, weights, opts)); + auto network = NetworkFactory::Get()->Create(backend, weights, opts); - int nn_threads = opts.GetOrDefault("threads", 0); - if (nn_threads == 0) { - nn_threads = networks_.back()->GetThreads(); - } - - min_batch_size_ = - std::min(min_batch_size_, networks_.back()->GetMiniBatchSize()); - is_cpu_ &= networks_.back()->IsCpu(); - - if (networks_.size() == 1) { - capabilities_ = networks_.back()->GetCapabilities(); + min_batch_size_ = std::min(min_batch_size_, network->GetMiniBatchSize()); + batch_step_ = std::max(batch_step_, network->GetPreferredBatchStep()); + is_cpu_ &= network->IsCpu(); + if (index == 0) { + capabilities_ = network->GetCapabilities(); } else { - capabilities_.Merge(networks_.back()->GetCapabilities()); - } - - for (int i = 0; i < nn_threads; ++i) { - threads_.emplace_back([this]() { Worker(); }); + capabilities_.Merge(network->GetCapabilities()); } + backends_[index].Assign(std::move(network), opts, abort_); } std::unique_ptr NewComputation() override { @@ -155,102 +251,86 @@ class DemuxingNetwork : public Network { } int GetMiniBatchSize() const override { - return min_batch_size_ * threads_.size(); + return min_batch_size_ * backends_.size(); } - bool IsCpu() const override { return is_cpu_; } - - void Enqueue(DemuxingComputation* computation) { - std::lock_guard lock(mutex_); - queue_.push(computation); - cv_.notify_one(); - } - - ~DemuxingNetwork() { - Abort(); - Wait(); - // Unstuck waiting computations. - while (!queue_.empty()) { - queue_.front()->NotifyComplete(); - queue_.pop(); - } - } + int GetPreferredBatchStep() const override { return batch_step_; } - void Worker() { - // While Abort() is not called (and it can only be called from destructor). 
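Each DemuxingBackend worker is a standard queue-plus-condition-variable consumer: it sleeps until work arrives or the abort flag is set, and it always rechecks both under the lock. The dequeue step in isolation (NextWork is an illustrative name):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>
    #include <queue>

    template <typename Work>
    Work* NextWork(std::mutex& m, std::condition_variable& cv,
                   std::queue<Work*>& q, std::atomic<bool>& abort) {
      std::unique_lock<std::mutex> lock(m);
      cv.wait(lock, [&] {
        return abort.load(std::memory_order_relaxed) || !q.empty();
      });
      if (abort.load(std::memory_order_relaxed)) return nullptr;
      Work* w = q.front();
      q.pop();
      return w;
    }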
- while (!abort_) { - { - { - std::unique_lock lock(mutex_); - // Wait until there's come work to compute. - cv_.wait(lock, [&] { return abort_ || !queue_.empty(); }); - if (abort_) break; - } + bool IsCpu() const override { return is_cpu_; } - // While there is a work in queue, process it. - while (true) { - DemuxingComputation* to_notify; - { - std::unique_lock lock(mutex_); - if (queue_.empty()) break; - to_notify = queue_.front(); - queue_.pop(); - } - long long net_idx = ++(counter_) % networks_.size(); - NetworkComputation* to_compute = - to_notify->AddParentFromNetwork(networks_[net_idx].get()); - to_compute->ComputeBlocking(); - to_notify->NotifyComplete(); - } - } - } - } + ~DemuxingNetwork() { Abort(); } void Abort() { - { - std::lock_guard lock(mutex_); - abort_ = true; + abort_.store(true, std::memory_order_relaxed); + for (auto& b : backends_) { + b.Abort(); } - cv_.notify_all(); } - void Wait() { - while (!threads_.empty()) { - threads_.back().join(); - threads_.pop_back(); - } - } - - std::vector> networks_; + std::vector backends_; NetworkCapabilities capabilities_; int min_batch_size_ = std::numeric_limits::max(); + int batch_step_ = 1; bool is_cpu_ = true; - std::queue queue_; - int minimum_split_size_ = 0; - std::atomic counter_; - bool abort_ = false; - - std::mutex mutex_; - std::condition_variable cv_; - - std::vector threads_; + std::atomic start_index_; + std::atomic abort_ = false; }; void DemuxingComputation::ComputeBlocking() { if (GetBatchSize() == 0) return; - partial_size_ = (GetBatchSize() + network_->threads_.size() - 1) / - network_->threads_.size(); - if (partial_size_ < network_->minimum_split_size_) { - partial_size_ = std::min(GetBatchSize(), network_->minimum_split_size_); + // Calculate batch_step_ size split count. + int splits = 1 + (GetBatchSize() - 1) / network_->batch_step_; + // Calculate the minimum number of splits per backend. + int split_size_per_backend = splits / network_->backends_.size(); + // Calculate how many backends get extra work. + int extra_split_backends = + splits - split_size_per_backend * network_->backends_.size(); + + // Find the first backend which got less work from the previous batch. + int start_index = + network_->start_index_.fetch_add(std::max(1, extra_split_backends), + std::memory_order_relaxed) % + network_->backends_.size(); + + int end_index = + (start_index + extra_split_backends) % network_->backends_.size(); + int work_start = 0; + int work_items = split_size_per_backend > 0 ? network_->backends_.size() + : extra_split_backends; + // First store the work item count and reserve memory from them. + dataready_.store(work_items, std::memory_order_relaxed); + parents_.reserve(work_items); + int i = start_index; + // First send work to backends which get extra work. + int split_size = split_size_per_backend + 1; + for (; i != end_index; i = (i + 1) % network_->backends_.size()) { + assert(work_start != GetBatchSize()); + int work_end = work_start + split_size * network_->batch_step_; + work_end = std::min(work_end, GetBatchSize()); + parents_.emplace_back(this, work_start, work_end); + network_->backends_[i].Enqueue(&parents_.back()); + work_start = work_end; } - const int splits = (GetBatchSize() + partial_size_ - 1) / partial_size_; - - std::unique_lock lock(mutex_); - dataready_ = splits; - for (int j = 0; j < splits; j++) { - network_->Enqueue(this); + // Queue remaining work items which don't get extra work. 
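+  // Worked example with illustrative numbers (not taken from any real
+  // configuration): GetBatchSize() == 60, batch_step_ == 8 and three backends
+  // give splits == 8, split_size_per_backend == 2, extra_split_backends == 2.
+  // The loop above then hands three chunks (24 positions) to each of two
+  // backends, and the do-while below hands the leftover work to the third
+  // backend, clamped by std::min to the 12 remaining positions. start_index_
+  // advances by two, so the extra work rotates to other backends next batch.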
+ split_size--; + if (split_size > 0) { + do { + assert(work_start != GetBatchSize()); + int work_end = work_start + split_size * network_->batch_step_; + work_end = std::min(work_end, GetBatchSize()); + parents_.emplace_back(this, work_start, work_end); + network_->backends_[i].Enqueue(&parents_.back()); + work_start = work_end; + i = (i + 1) % network_->backends_.size(); + } while (i != start_index); } - dataready_cv_.wait(lock, [this]() { return dataready_ == 0; }); + assert(work_start == GetBatchSize()); + assert(work_items == (int)parents_.size()); + // Wait until all backends complete their work. + std::unique_lock lock(mutex_); + dataready_cv_.wait(lock, [this]() { + return dataready_.load(std::memory_order_acquire) <= 0; + }); } std::unique_ptr MakeDemuxingNetwork( diff --git a/src/neural/backends/network_onnx.cc b/src/neural/backends/network_onnx.cc deleted file mode 100644 index c44331af72..0000000000 --- a/src/neural/backends/network_onnx.cc +++ /dev/null @@ -1,514 +0,0 @@ -/* - This file is part of Leela Chess Zero. - Copyright (C) 2021-2023 The LCZero Authors - - Leela Chess is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Leela Chess is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Leela Chess. If not, see . - - Additional permission under GNU GPL version 3 section 7 - - If you modify this Program, or any covered work, by linking or - combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA - Toolkit and the NVIDIA CUDA Deep Neural Network library (or a - modified version of those libraries), containing parts covered by the - terms of the respective license agreement, the licensors of this - Program grant you additional permission to convey the resulting work. 
-*/ - -#include -#include -#include -#include -#include -#include -#include - -#if __has_include("dml_provider_factory.h") -#include "dml_provider_factory.h" -#define USE_DML -#endif - -#include "cpu_provider_factory.h" -#include "neural/factory.h" -#include "neural/loader.h" -#include "neural/network.h" -#include "neural/onnx/converter.h" -#include "onnxruntime_cxx_api.h" -#include "utils/bf16_utils.h" -#include "utils/bititer.h" -#include "utils/commandline.h" -#include "utils/exception.h" -#include "utils/fp16_utils.h" -#include "utils/logging.h" - -namespace lczero { -namespace { - -enum class OnnxProvider { CPU, CUDA, DML, ROCM, TRT }; - -class OnnxNetwork; - -template -class OnnxComputation : public NetworkComputation { - public: - OnnxComputation(OnnxNetwork* network); - void AddInput(InputPlanes&& input) override; - int GetBatchSize() const override { return raw_input_.size(); } - void ComputeBlocking() override; - float GetQVal(int sample) const override; - float GetDVal(int sample) const override; - float GetPVal(int sample, int move_id) const override; - float GetMVal(int sample) const override; - - private: - Ort::Value PrepareInputs(int start, int batch_size); - - OnnxNetwork* network_; - std::vector raw_input_; - std::vector input_tensor_data_; - std::vector output_tensors_; - std::vector> output_tensors_data_; - std::vector output_tensors_step_; -}; - -class OnnxNetwork : public Network { - public: - OnnxNetwork(const WeightsFile& file, const OptionsDict& options, - OnnxProvider provider); - std::unique_ptr NewComputation() override { - if (fp16_) { - return std::make_unique>(this); - } else if (bf16_) { - return std::make_unique>(this); - } else { - return std::make_unique>(this); - } - } - const NetworkCapabilities& GetCapabilities() const override { - return capabilities_; - } - int GetMiniBatchSize() const override { - return batch_size_ == -1 ? Network::GetMiniBatchSize() - : batch_size_ * steps_; - } - bool IsCpu() const override { return provider_ == OnnxProvider::CPU; } - - Ort::SessionOptions GetOptions(int gpu, int threads, int batch_size); - - Ort::Env onnx_env_; - // Prepare sessions for this many multiples of the batch size; - int steps_; - std::vector session_; - std::vector inputs_; - // Points to strings in inputs_. - std::vector inputs_cstr_; - std::vector outputs_; - // Points to strings in outputs_. - std::vector outputs_cstr_; - // Indices in output_cstr_ vector. - int policy_head_ = -1; - int wdl_head_ = -1; - int value_head_ = -1; - int mlh_head_ = -1; - NetworkCapabilities capabilities_; - bool fp16_; - bool bf16_; - // The batch size to use, or -1 for variable. - int batch_size_; - // The lower limit for variable batch size. - int min_batch_size_; - static constexpr int max_batch_size_ = 1024; - // For conditional locking if running the DML/ROCM/TRT provider. 
- OnnxProvider provider_; - std::mutex lock_; -}; - -template -OnnxComputation::OnnxComputation(OnnxNetwork* network) - : network_(network) { - output_tensors_data_.resize(network_->outputs_.size()); - output_tensors_step_.resize(network_->outputs_.size()); - output_tensors_step_[network_->policy_head_] = 1858; - output_tensors_data_[network_->policy_head_] = - std::vector(1858 * network_->max_batch_size_); - if (network_->wdl_head_ != -1) { - output_tensors_step_[network_->wdl_head_] = 3; - output_tensors_data_[network_->wdl_head_] = - std::vector(3 * network_->max_batch_size_); - } - if (network_->value_head_ != -1) { - output_tensors_step_[network_->value_head_] = 1; - output_tensors_data_[network_->value_head_] = - std::vector(network_->max_batch_size_); - } - if (network_->mlh_head_ != -1) { - output_tensors_step_[network_->mlh_head_] = 1; - output_tensors_data_[network_->mlh_head_] = - std::vector(network_->max_batch_size_); - } -} - -template -void OnnxComputation::AddInput(InputPlanes&& input) { - raw_input_.emplace_back(input); - if (raw_input_.size() > network_->max_batch_size_) { - throw Exception("NN input exceeds max batch size of " + - std::to_string(network_->max_batch_size_) + "."); - } -} - -float AsFloat(float x) { return x; } -float AsFloat(Ort::Float16_t x) { - uint16_t tmp; - std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); - return FP16toFP32(tmp); -} -float AsFloat(Ort::BFloat16_t x) { - uint16_t tmp; - std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); - return BF16toFP32(tmp); -} - -template -float OnnxComputation::GetQVal(int sample) const { - if (network_->wdl_head_ != -1) { - const auto& data = output_tensors_data_[network_->wdl_head_]; - return AsFloat(data[sample * 3 + 0]) - AsFloat(data[sample * 3 + 2]); - } else { - const auto& data = output_tensors_data_[network_->value_head_]; - return AsFloat(data[sample]); - } -} - -template -float OnnxComputation::GetDVal(int sample) const { - if (network_->wdl_head_ == -1) return 0.0f; - const auto& data = output_tensors_data_[network_->wdl_head_]; - return AsFloat(data[sample * 3 + 1]); -} - -template -float OnnxComputation::GetPVal(int sample, int move_id) const { - const auto& data = output_tensors_data_[network_->policy_head_]; - return AsFloat(data[sample * 1858 + move_id]); -} - -template -float OnnxComputation::GetMVal(int sample) const { - if (network_->mlh_head_ == -1) return 0.0f; - const auto& data = output_tensors_data_[network_->mlh_head_]; - return AsFloat(data[sample]); -} - -void AsDataType(float x, float* y) { *y = x; } -void AsDataType(float x, Ort::Float16_t* y) { - uint16_t tmp = FP32toFP16(x); - std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); -} -void AsDataType(float x, Ort::BFloat16_t* y) { - uint16_t tmp = FP32toBF16(x); - std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); -} - -template -Ort::Value OnnxComputation::PrepareInputs(int start, int batch_size) { - input_tensor_data_.clear(); - input_tensor_data_.resize(batch_size * kInputPlanes * 8 * 8); - auto iter = input_tensor_data_.data(); - int end = std::min(start + batch_size, static_cast(raw_input_.size())); - for (int i = start; i < end; i++) { - for (const auto& plane : raw_input_[i]) { - DataType value; - AsDataType(plane.value, &value); - for (auto bit : IterateBits(plane.mask)) { - *(iter + bit) = value; - } - iter += 64; - } - } - for (int i = end; i < start + batch_size; i++) { - for (int j = 0; j < kInputPlanes * 64; j++) { - *iter++ = DataType(); - } - } - - auto memory_info = - 
Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - - output_tensors_.clear(); - for (size_t i = 0; i < output_tensors_step_.size(); i++) { - int size = output_tensors_step_[i]; - int64_t dims[] = {batch_size, size}; - output_tensors_.emplace_back(Ort::Value::CreateTensor( - memory_info, output_tensors_data_[i].data() + start * size, - size * batch_size, dims, 2)); - } - - int64_t dims[] = {batch_size, kInputPlanes, 8, 8}; - return Ort::Value::CreateTensor(memory_info, - input_tensor_data_.data(), - input_tensor_data_.size(), dims, 4); -} - -template -void OnnxComputation::ComputeBlocking() { - int batch_size = network_->batch_size_; - if (batch_size < 0) { - batch_size = std::max(static_cast(raw_input_.size()), - network_->min_batch_size_); - } - for (size_t i = 0; i < raw_input_.size();) { - int step = (raw_input_.size() - i + batch_size - 1) / batch_size; - if (step > network_->steps_) step = network_->steps_; - int batch = batch_size * step; - - auto input_tensor = PrepareInputs(i, batch); - // The DML onnxruntime execution provider is documented as not supporting - // multi-threaded calls to Run on the same inference session. We found the - // same to be true for the ROCm execution provider (at least for CNNs). - // TODO: This may be a onnxruntime/ROCm bug, check onnxruntime 1.16 release. - if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM || - network_->provider_ == OnnxProvider::TRT) { - network_->lock_.lock(); - } - network_->session_[step - 1].Run( - {}, network_->inputs_cstr_.data(), &input_tensor, 1, - network_->outputs_cstr_.data(), output_tensors_.data(), - output_tensors_.size()); - if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM || - network_->provider_ == OnnxProvider::TRT) { - network_->lock_.unlock(); - } - i += batch; - } -} - -Ort::SessionOptions OnnxNetwork::GetOptions(int gpu, int threads, - int batch_size) { - Ort::SessionOptions options; - options.SetIntraOpNumThreads(threads); - options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - - if (batch_size > 0) { - // Override the default (variable) batch size. - Ort::ThrowOnError( - OrtGetApiBase() - ->GetApi(ORT_API_VERSION) - ->AddFreeDimensionOverrideByName(options, "batch", batch_size)); - } - - switch (provider_) { - case OnnxProvider::DML: - options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - options.DisableMemPattern(); -#ifdef USE_DML - Ort::ThrowOnError( - OrtSessionOptionsAppendExecutionProvider_DML(options, gpu)); -#else - throw Exception("ONNX backend internal error."); -#endif - break; - case OnnxProvider::TRT: { - options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - - std::string cache_dir = CommandLine::BinaryDirectory() + "/trt_cache"; - std::map trt_options; - trt_options["device_id"] = std::to_string(gpu); - trt_options["trt_fp16_enable"] = fp16_ ? "1" : "0"; - trt_options["trt_int8_enable"] = "0"; - trt_options["trt_max_partition_iterations"] = "1000"; - trt_options["trt_min_subgraph_size"] = "1"; - trt_options["trt_engine_cache_enable"] = "1"; - trt_options["trt_engine_cache_prefix"] = - "Lc0_ONNX_TRT_batch_" + std::to_string(batch_size) + "_"; - trt_options["trt_engine_cache_path"] = cache_dir; - trt_options["trt_timing_cache_enable"] = "1"; - trt_options["trt_timing_cache_path"] = cache_dir; - trt_options["trt_layer_norm_fp32_fallback"] = "1"; - trt_options["trt_force_sequential_engine_build"] = "1"; - // Looks like we need I/O binding to enable this. 
- // trt_options["trt_cuda_graph_enable"] = "1"; - if (batch_size < 0) { - trt_options["trt_profile_min_shapes"] = - inputs_[0] + ":" + std::to_string(min_batch_size_) + "x112x8x8"; - trt_options["trt_profile_max_shapes"] = - inputs_[0] + ":" + std::to_string(max_batch_size_) + "x112x8x8"; - trt_options["trt_profile_opt_shapes"] = - inputs_[0] + ":" + std::to_string(max_batch_size_ / 4) + "x112x8x8"; - } else { - trt_options["trt_profile_min_shapes"] = - inputs_[0] + ":" + std::to_string(batch_size_) + "x112x8x8"; - trt_options["trt_profile_max_shapes"] = - inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + - "x112x8x8"; - trt_options["trt_profile_opt_shapes"] = - inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + - "x112x8x8"; - } - std::vector keys; - std::vector values; - for (const auto& [key, value] : trt_options) { - keys.push_back(key.c_str()); - values.push_back(value.c_str()); - } - - const auto& api = Ort::GetApi(); - OrtTensorRTProviderOptionsV2* trt_options_v2; - Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options_v2)); - Ort::ThrowOnError(api.UpdateTensorRTProviderOptions( - trt_options_v2, keys.data(), values.data(), keys.size())); - options.AppendExecutionProvider_TensorRT_V2(*trt_options_v2); - api.ReleaseTensorRTProviderOptions(trt_options_v2); - break; - } - case OnnxProvider::ROCM: { - OrtROCMProviderOptions rocm_options; - rocm_options.device_id = gpu; - options.AppendExecutionProvider_ROCM(rocm_options); - break; - } - case OnnxProvider::CUDA: { - OrtCUDAProviderOptions cuda_options; - cuda_options.device_id = gpu; - options.AppendExecutionProvider_CUDA(cuda_options); - break; - } - case OnnxProvider::CPU: - auto status = OrtSessionOptionsAppendExecutionProvider_CPU(options, 0); - if (status) { - std::string error_message = Ort::GetApi().GetErrorMessage(status); - OrtErrorCode error_code = Ort::GetApi().GetErrorCode(status); - Ort::GetApi().ReleaseStatus(status); - throw Exception("ONNX CPU error " + std::to_string(error_code) + ": " + - error_message); - } - break; - } - return options; -} - -OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict& opts, - OnnxProvider provider) - : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"), - capabilities_{file.format().network_format().input(), - file.format().network_format().output(), - file.format().network_format().moves_left()}, - fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16), - bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16), - provider_(provider) { - onnx_env_.DisableTelemetryEvents(); - batch_size_ = - opts.GetOrDefault("batch", provider == OnnxProvider::DML ? 16 : -1); - steps_ = - opts.GetOrDefault("steps", provider == OnnxProvider::DML ? 4 : 1); - min_batch_size_ = opts.GetOrDefault( - "min_batch", provider == OnnxProvider::TRT ? 4 : 1); - int gpu = opts.GetOrDefault("gpu", 0); - int threads = - opts.GetOrDefault("threads", provider == OnnxProvider::CPU ? 1 : 0); - - // Sanity checks. - if (batch_size_ <= 0) { - batch_size_ = -1; // Variable batch size. 
- steps_ = 1; - } - if (batch_size_ * steps_ > max_batch_size_) { - batch_size_ = max_batch_size_ / steps_; - } - - const auto& md = file.onnx_model(); - if (!md.has_input_planes()) { - throw Exception("NN doesn't have input planes defined."); - } - inputs_.emplace_back(md.input_planes()); - if (!md.has_output_policy()) { - throw Exception("NN doesn't have policy head defined."); - } - policy_head_ = outputs_.size(); - outputs_.emplace_back(md.output_policy()); - if (md.has_output_wdl()) { - wdl_head_ = outputs_.size(); - outputs_.emplace_back(md.output_wdl()); - } else if (md.has_output_value()) { - value_head_ = outputs_.size(); - outputs_.emplace_back(md.output_value()); - } else { - throw Exception("NN doesn't have value head."); - } - if (md.has_output_mlh()) { - mlh_head_ = outputs_.size(); - outputs_.emplace_back(md.output_mlh()); - } - std::transform(inputs_.begin(), inputs_.end(), - std::back_inserter(inputs_cstr_), - [](const auto& x) { return x.c_str(); }); - std::transform(outputs_.begin(), outputs_.end(), - std::back_inserter(outputs_cstr_), - [](const auto& x) { return x.c_str(); }); - - for (int step = 1; step <= steps_; step++) - session_.emplace_back(onnx_env_, file.onnx_model().model().data(), - file.onnx_model().model().size(), - GetOptions(gpu, threads, batch_size_ * step)); -} - -template -std::unique_ptr MakeOnnxNetwork(const std::optional& w, - const OptionsDict& opts) { - if (!w) throw Exception("The ONNX backend requires a network file."); - - if (w->has_onnx_model()) { - return std::make_unique(*w, opts, kProvider); - } else { - WeightsToOnnxConverterOptions converter_options; - converter_options.opset = opts.GetOrDefault("opset", 17); - converter_options.alt_mish = opts.GetOrDefault( - "alt_mish", kProvider == OnnxProvider::CPU ? true : false); - converter_options.alt_layernorm = opts.GetOrDefault( - "alt_layernorm", kProvider == OnnxProvider::DML ? true : false); - converter_options.no_shape = opts.GetOrDefault("no_shape", false); - converter_options.policy_head = - opts.GetOrDefault("policy_head", "vanilla"); - converter_options.value_head = - opts.GetOrDefault("value_head", "winner"); - - std::string datatype; - if (opts.Exists("datatype")) { - datatype = opts.Get("datatype"); - } else { - bool fp16 = opts.GetOrDefault( - "fp16", kProvider == OnnxProvider::CPU ? false : true); - datatype = fp16 ? "f16" : "f32"; - } - converter_options.data_type = - WeightsToOnnxConverterOptions::StringToDataType(datatype); - - auto converted = ConvertWeightsToOnnx(*w, converter_options); - return std::make_unique(converted, opts, kProvider); - } -} - -#ifdef USE_ROCM -REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork, 64) -#endif -#ifdef USE_DML -REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork, 63) -#endif -REGISTER_NETWORK("onnx-trt", MakeOnnxNetwork, 60) -REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork, 61) -REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork, 62) - -} // namespace -} // namespace lczero diff --git a/src/neural/backends/onnx/network_onnx.cc b/src/neural/backends/onnx/network_onnx.cc new file mode 100644 index 0000000000..f5ac887cda --- /dev/null +++ b/src/neural/backends/onnx/network_onnx.cc @@ -0,0 +1,966 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2021-2023 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "onnx_conf.h" + +#ifdef USE_ONNX_CUDART +#include "cuda_runtime.h" +#include "neural/backends/onnx/onnx_kernels.h" +#endif + +#include "neural/factory.h" +#include "neural/loader.h" +#include "neural/network.h" +#include "neural/onnx/converter.h" +#include "onnxruntime_cxx_api.h" +#include "utils/bf16_utils.h" +#include "utils/bititer.h" +#include "utils/commandline.h" +#include "utils/exception.h" +#include "utils/fp16_utils.h" +#include "utils/logging.h" +#include "utils/trace.h" + +namespace lczero { +namespace onnx { + +enum class OnnxProvider { CPU, CUDA, DML, ROCM, TRT, MIGRAPHX }; + +class OnnxNetwork; + +static constexpr int kNumOutputPolicy = 1858; + +struct InputsOutputs { + InputsOutputs(OnnxNetwork* network); + ~InputsOutputs() { + switch (provider_) { + case OnnxProvider::CUDA: + case OnnxProvider::TRT: +#ifdef USE_ONNX_CUDART + ReportCUDAErrors(cudaEventDestroy(inputs_uploaded_event_)); + ReportCUDAErrors(cudaEventDestroy(inputs_processed_event_)); + ReportCUDAErrors(cudaEventDestroy(evaluation_done_event_)); + ReportCUDAErrors(cudaEventDestroy(outputs_download_event_)); + ReportCUDAErrors(cudaFree(input_tensor_upload_device_)); + ReportCUDAErrors(cudaFree(input_tensor_data_device_)); + for (void* ptr : output_tensors_data_device_) { + ReportCUDAErrors(cudaFree(ptr)); + } + ReportCUDAErrors(cudaFreeHost(input_tensor_data_)); + for (void* ptr : output_tensors_data_) { + ReportCUDAErrors(cudaFreeHost(ptr)); + } + break; +#endif + default: + free(input_tensor_data_); + for (void* ptr : output_tensors_data_) { + free(ptr); + } + } + } + OnnxProvider provider_; + void* input_tensor_data_; + void* input_tensor_upload_device_; + void* input_tensor_data_device_; + std::vector output_tensors_data_; + std::vector output_tensors_data_device_; + std::vector output_tensors_step_; + // To be removed when converting to new backend interface. 
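+  // One (W, D, L) float triple per sample; filled at the end of
+  // ComputeBlocking(), with the softmax applied CPU-side when cpu_wdl_ is set,
+  // and read back by GetQVal()/GetDVal() whenever the net has a WDL head.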
+ std::vector wdl_output_data_; + Ort::MemoryInfo memory_info_{nullptr}; +#ifdef USE_ONNX_CUDART + cudaEvent_t inputs_uploaded_event_ = nullptr; + cudaEvent_t inputs_processed_event_ = nullptr; + cudaEvent_t evaluation_done_event_ = nullptr; + cudaEvent_t outputs_download_event_ = nullptr; +#endif +}; + +template +class OnnxComputation final : public NetworkComputation { + public: + OnnxComputation(OnnxNetwork* network); + ~OnnxComputation(); + void AddInput(InputPlanes&& input) override; + int GetBatchSize() const override; + void ComputeBlocking() override; + float GetQVal(int sample) const override; + float GetDVal(int sample) const override; + float GetPVal(int sample, int move_id) const override; + float GetMVal(int sample) const override; + + private: + Ort::IoBinding PrepareInputs(int start, int batch_size, int step); + + OnnxNetwork* network_; + size_t input_size_ = 0; + std::vector raw_input_; + std::unique_ptr inputs_outputs_; +}; + +class OnnxNetwork final : public Network { + public: + OnnxNetwork(const WeightsFile& file, const OptionsDict& options, + OnnxProvider provider, bool cpu_wdl); + ~OnnxNetwork(); + std::unique_ptr NewComputation() override { +#ifdef USE_ONNX_CUDART + if (provider_ == OnnxProvider::CUDA || provider_ == OnnxProvider::TRT) { + int device = -1; + ReportCUDAErrors(cudaGetDevice(&device)); + if (device != gpu_) { + ReportCUDAErrors(cudaSetDevice(gpu_)); + } + } +#endif + if (fp16_) { + return std::make_unique>(this); + } else if (bf16_) { + return std::make_unique>(this); + } else { + return std::make_unique>(this); + } + } + const NetworkCapabilities& GetCapabilities() const override { + return capabilities_; + } + int GetMiniBatchSize() const override { + return batch_size_ == -1 ? Network::GetMiniBatchSize() + : batch_size_ * steps_; + } + int GetPreferredBatchStep() const override { + return batch_size_ == -1 ? min_batch_size_ : batch_size_; + } + bool IsCpu() const override { return provider_ == OnnxProvider::CPU; } + + Ort::SessionOptions GetOptions(int threads, int batch_size, uint64_t hash, int optimize); + + std::unique_ptr GetInputsOutputs() { + std::lock_guard lock(inputs_outputs_lock_); + if (free_inputs_outputs_.empty()) { + return std::make_unique(this); + } else { + std::unique_ptr resource = + std::move(free_inputs_outputs_.front()); + free_inputs_outputs_.pop_front(); + return resource; + } + } + + void ReleaseInputsOutputs(std::unique_ptr resource) { + std::lock_guard lock(inputs_outputs_lock_); + free_inputs_outputs_.push_back(std::move(resource)); + } + + Ort::Env onnx_env_; + // Prepare sessions for this many multiples of the batch size; + int steps_; + std::vector session_; + std::vector inputs_; + std::vector outputs_; + // Indices in output_ vector. + int policy_head_ = -1; + int wdl_head_ = -1; + int value_head_ = -1; + int mlh_head_ = -1; + NetworkCapabilities capabilities_; + bool fp16_; + bool bf16_; + bool cpu_wdl_; + // The batch size to use, or -1 for variable. + int batch_size_; + // The lower limit for variable batch size. + int min_batch_size_; + int gpu_; + static constexpr int max_batch_size_ = 1024; + // For conditional locking if running the DML/ROCM/TRT provider. + OnnxProvider provider_; + std::mutex lock_; + // For shared device addresses. 
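+  // Rough flow, as wired up in ComputeBlocking(): upload_stream_ copies masks
+  // and values to the device, compute_stream_ waits on inputs_uploaded_event_,
+  // expands the planes and runs the session, and download_stream_ waits on
+  // evaluation_done_event_ before copying outputs back to pinned host memory;
+  // ComputeBlocking() finally synchronizes on outputs_download_event_.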
+#ifdef USE_ONNX_CUDART + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; +#endif + + private: + std::mutex inputs_outputs_lock_; + std::list> free_inputs_outputs_; +}; + +InputsOutputs::InputsOutputs(OnnxNetwork* network) + : provider_(network->provider_) { + int max_batch_size = network->max_batch_size_; + int value_head = network->value_head_; + int wdl_head = network->wdl_head_; + int policy_head = network->policy_head_; + int mlh_head = network->mlh_head_; + int data_size = (network->fp16_ | network->bf16_) ? 2 : 4; + int outputs_size = + std::max({value_head, wdl_head, policy_head, mlh_head}) + 1; + output_tensors_data_.resize(outputs_size); + output_tensors_data_device_.resize(outputs_size); + output_tensors_step_.resize(outputs_size); + if (wdl_head != -1) { + wdl_output_data_.resize(3 * max_batch_size); + } + output_tensors_step_[policy_head] = kNumOutputPolicy; + if (wdl_head != -1) { + output_tensors_step_[wdl_head] = 3; + } + if (value_head != -1) { + output_tensors_step_[value_head] = 1; + } + if (mlh_head != -1) { + output_tensors_step_[mlh_head] = 1; + } + + switch (provider_) { + case OnnxProvider::CUDA: + case OnnxProvider::TRT: +#ifdef USE_ONNX_CUDART + ReportCUDAErrors( + cudaEventCreate(&inputs_processed_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreate(&inputs_uploaded_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreate(&evaluation_done_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreate(&outputs_download_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaHostAlloc(&input_tensor_data_, + max_batch_size * kInputPlanes * sizeof(InputPlane), 0)); + for (int i = 0; i < outputs_size; i++) { + ReportCUDAErrors(cudaHostAlloc( + &output_tensors_data_[i], + max_batch_size * output_tensors_step_[i] * data_size, 0)); + } + + output_tensors_data_device_.resize(outputs_size); + ReportCUDAErrors( + cudaMalloc(&input_tensor_upload_device_, + max_batch_size * kInputPlanes * sizeof(InputPlane))); + ReportCUDAErrors( + cudaMalloc(&input_tensor_data_device_, + max_batch_size * kInputPlanes * 8 * 8 * data_size)); + for (int i = 0; i < outputs_size; i++) { + ReportCUDAErrors( + cudaMalloc(&output_tensors_data_device_[i], + max_batch_size * output_tensors_step_[i] * data_size)); + } + memory_info_ = Ort::MemoryInfo{"Cuda", OrtDeviceAllocator, network->gpu_, + OrtMemTypeDefault}; + break; +#endif + default: + input_tensor_data_ = + malloc(max_batch_size * kInputPlanes * 8 * 8 * data_size); + for (int i = 0; i < outputs_size; i++) { + output_tensors_data_[i] = + malloc(max_batch_size * output_tensors_step_[i] * data_size); + } + input_tensor_data_device_ = input_tensor_data_; + for (int i = 0; i < outputs_size; i++) { + output_tensors_data_device_[i] = output_tensors_data_[i]; + } + memory_info_ = + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); + } +} + +OnnxNetwork::~OnnxNetwork() { +#ifdef USE_ONNX_CUDART + if (provider_ == OnnxProvider::TRT || provider_ == OnnxProvider::CUDA) { + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); + } +#endif +} + +template +OnnxComputation::OnnxComputation(OnnxNetwork* network) + : network_(network) { + inputs_outputs_ = network_->GetInputsOutputs(); +} + +template +OnnxComputation::~OnnxComputation() { + network_->ReleaseInputsOutputs(std::move(inputs_outputs_)); +} + +void 
AsDataType(float x, float* y) { *y = x; } +void AsDataType(float x, Ort::Float16_t* y) { + uint16_t tmp = FP32toFP16(x); + std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); +} +void AsDataType(float x, Ort::BFloat16_t* y) { + uint16_t tmp = FP32toBF16(x); + std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); +} + +template +void OnnxComputation::AddInput(InputPlanes&& input) { + if (input_size_ >= network_->max_batch_size_) { + throw Exception("NN input exceeds max batch size of " + + std::to_string(network_->max_batch_size_) + "."); + } +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::CUDA || + network_->provider_ == OnnxProvider::TRT) { + assert(input.size() == kInputPlanes); + uint64_t* masks = + static_cast(inputs_outputs_->input_tensor_data_) + + input_size_ * kInputPlanes; + uint64_t* mask_end = + static_cast(inputs_outputs_->input_tensor_data_) + + network_->max_batch_size_ * kInputPlanes; + DataType* values = + reinterpret_cast(mask_end) + input_size_ * kInputPlanes; + for (size_t i = 0; i < kInputPlanes; i++) { + masks[i] = input[i].mask; + DataType value; + AsDataType(input[i].value, &value); + values[i] = value; + } + input_size_++; + if (input_size_ > network_->max_batch_size_) { + throw Exception("NN input exceeds max batch size of " + + std::to_string(network_->max_batch_size_) + "."); + } + return; + } +#endif + raw_input_.emplace_back(std::move(input)); + input_size_++; +} +template +int OnnxComputation::GetBatchSize() const { + return input_size_; +} + +float AsFloat(float x) { return x; } +float AsFloat(Ort::Float16_t x) { + uint16_t tmp; + std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); + return FP16toFP32(tmp); +} +float AsFloat(Ort::BFloat16_t x) { + uint16_t tmp; + std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); + return BF16toFP32(tmp); +} + +template +float OnnxComputation::GetQVal(int sample) const { + if (network_->wdl_head_ != -1) { + return inputs_outputs_->wdl_output_data_[sample * 3 + 0] - + inputs_outputs_->wdl_output_data_[sample * 3 + 2]; + } else { + DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->value_head_]); + return AsFloat(data[sample]); + } +} + +template +float OnnxComputation::GetDVal(int sample) const { + if (network_->wdl_head_ == -1) return 0.0f; + return inputs_outputs_->wdl_output_data_[sample * 3 + 1]; +} + +template +float OnnxComputation::GetPVal(int sample, int move_id) const { + DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->policy_head_]); + return AsFloat(data[sample * kNumOutputPolicy + move_id]); +} + +template +float OnnxComputation::GetMVal(int sample) const { + if (network_->mlh_head_ == -1) return 0.0f; + DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->mlh_head_]); + return AsFloat(data[sample]); +} + +template +Ort::IoBinding OnnxComputation::PrepareInputs(int start, + int batch_size, + int step) { +#ifdef USE_ONNX_CUDART + if (network_->provider_ != OnnxProvider::CUDA && + network_->provider_ != OnnxProvider::TRT) +#endif + { + DataType* iter = + static_cast(inputs_outputs_->input_tensor_data_); + iter += start * kInputPlanes * 8 * 8; + std::memset(static_cast(iter), 0, + batch_size * kInputPlanes * 8 * 8 * sizeof(DataType)); + int end = std::min(start + batch_size, static_cast(input_size_)); + for (int i = start; i < end; i++) { + for (const auto& plane : raw_input_[i]) { + DataType value; + AsDataType(plane.value, &value); + for (auto bit : IterateBits(plane.mask)) { + *(iter + 
bit) = value; + } + iter += 64; + } + } + } + + Ort::IoBinding binding{network_->session_[step - 1]}; + for (size_t i = 0; i < inputs_outputs_->output_tensors_step_.size(); i++) { + int size = inputs_outputs_->output_tensors_step_[i]; + int64_t dims[] = {batch_size, size}; + binding.BindOutput( + network_->outputs_[i].c_str(), + Ort::Value::CreateTensor( + inputs_outputs_->memory_info_, + static_cast( + inputs_outputs_->output_tensors_data_device_[i]) + + start * size, + size * batch_size, dims, 2)); + } + + int64_t dims[] = {batch_size, kInputPlanes, 8, 8}; + binding.BindInput( + network_->inputs_[0].c_str(), + Ort::Value::CreateTensor( + inputs_outputs_->memory_info_, + static_cast(inputs_outputs_->input_tensor_data_device_) + + start * kInputPlanes * 8 * 8, + batch_size * kInputPlanes * 8 * 8, dims, 4)); + return binding; +} + +template +void OnnxComputation::ComputeBlocking() { + LCTRACE_FUNCTION_SCOPE; + int batch_size = network_->batch_size_; + if (batch_size < 0) { + batch_size = + std::max(static_cast(input_size_), network_->min_batch_size_); + } + // Only the DML onnxruntime execution provider is documented as needing + // locking, but it seems all GPU backends need it. + if (network_->provider_ != OnnxProvider::CPU) { + network_->lock_.lock(); + } + for (size_t i = 0; i < (size_t)input_size_;) { + int step = (input_size_ - i + batch_size - 1) / batch_size; + if (step > network_->steps_) step = network_->steps_; + int batch = batch_size * step; + if (network_->provider_ == OnnxProvider::TRT && network_->batch_size_ > 0) { + batch = std::min((int)input_size_ - (int)i, batch); + } + + auto binding = PrepareInputs(i, batch, step); + + Ort::RunOptions options = {}; +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::TRT || + network_->provider_ == OnnxProvider::CUDA) { + if (i == 0) { + ReportCUDAErrors( + cudaStreamWaitEvent(network_->upload_stream_, + inputs_outputs_->inputs_processed_event_)); + } + const char* src_masks = + static_cast(inputs_outputs_->input_tensor_data_); + char* dst_masks = + static_cast(inputs_outputs_->input_tensor_upload_device_); + src_masks += i * kInputPlanes * sizeof(uint64_t); + dst_masks += i * kInputPlanes * (sizeof(uint64_t) + sizeof(DataType)); + ReportCUDAErrors(cudaMemcpyAsync( + dst_masks, src_masks, batch * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, network_->upload_stream_)); + char* src_values = + static_cast(inputs_outputs_->input_tensor_data_); + src_values += network_->max_batch_size_ * kInputPlanes * sizeof(uint64_t); + src_values += i * kInputPlanes * sizeof(DataType); + char* dst_values = dst_masks + batch * kInputPlanes * sizeof(uint64_t); + ReportCUDAErrors(cudaMemcpyAsync( + dst_values, src_values, batch * kInputPlanes * sizeof(DataType), + cudaMemcpyHostToDevice, network_->upload_stream_)); + ReportCUDAErrors(cudaEventRecord(inputs_outputs_->inputs_uploaded_event_, + network_->upload_stream_)); + ReportCUDAErrors(cudaStreamWaitEvent( + network_->compute_stream_, inputs_outputs_->inputs_uploaded_event_)); + if (network_->fp16_) { + half* dst = + reinterpret_cast(inputs_outputs_->input_tensor_data_device_); + dst += i * kInputPlanes * 8 * 8; + expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes, + network_->compute_stream_); + } else if (network_->bf16_) { + __nv_bfloat16* dst = reinterpret_cast<__nv_bfloat16*>( + inputs_outputs_->input_tensor_data_device_); + dst += i * kInputPlanes * 8 * 8; + expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes, + network_->compute_stream_); + } else { + float* dst = 
reinterpret_cast( + inputs_outputs_->input_tensor_data_device_); + dst += i * kInputPlanes * 8 * 8; + expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes, + network_->compute_stream_); + } + + ReportCUDAErrors(cudaEventRecord(inputs_outputs_->inputs_processed_event_, + network_->upload_stream_)); + if (i == 0) { + ReportCUDAErrors( + cudaStreamWaitEvent(network_->compute_stream_, + inputs_outputs_->outputs_download_event_)); + } + options.AddConfigEntry("disable_synchronize_execution_providers", "1"); + } else +#endif + { + binding.SynchronizeInputs(); + } + network_->session_[step - 1].Run(options, binding); +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::TRT || + network_->provider_ == OnnxProvider::CUDA) { + for (size_t j = 0; j < inputs_outputs_->output_tensors_step_.size(); + j++) { + ReportCUDAErrors( + cudaEventRecord(inputs_outputs_->evaluation_done_event_, + network_->compute_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(network_->download_stream_, + inputs_outputs_->evaluation_done_event_)); + size_t offset = i * inputs_outputs_->output_tensors_step_[j]; + ReportCUDAErrors(cudaMemcpyAsync( + static_cast(inputs_outputs_->output_tensors_data_[j]) + + offset, + static_cast( + inputs_outputs_->output_tensors_data_device_[j]) + + offset, + batch * inputs_outputs_->output_tensors_step_[j] * sizeof(DataType), + cudaMemcpyDeviceToHost, network_->download_stream_)); + ReportCUDAErrors( + cudaEventRecord(inputs_outputs_->outputs_download_event_, + network_->download_stream_)); + } + } else +#endif + { + binding.SynchronizeOutputs(); + } + i += batch; + } + if (network_->provider_ != OnnxProvider::CPU) { + network_->lock_.unlock(); + } +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::TRT || + network_->provider_ == OnnxProvider::CUDA) { + ReportCUDAErrors( + cudaEventSynchronize(inputs_outputs_->outputs_download_event_)); + } +#endif + if (network_->wdl_head_ != -1) { + const DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->wdl_head_]); + for (size_t i = 0; i < input_size_; i++) { + float w = AsFloat(data[i * 3 + 0]); + float d = AsFloat(data[i * 3 + 1]); + float l = AsFloat(data[i * 3 + 2]); + if (network_->cpu_wdl_) { + // Value softmax done cpu side. + float m = std::max({w, d, l}); + w = std::exp(w - m); + d = std::exp(d - m); + l = std::exp(l - m); + float sum = w + d + l; + w /= sum; + l /= sum; + d /= sum; + } + inputs_outputs_->wdl_output_data_[3 * i + 0] = w; + inputs_outputs_->wdl_output_data_[3 * i + 1] = d; + inputs_outputs_->wdl_output_data_[3 * i + 2] = l; + } + } +} + +Ort::SessionOptions OnnxNetwork::GetOptions(int threads, int batch_size, + uint64_t hash, int optimize) { + Ort::SessionOptions options; + options.SetIntraOpNumThreads(threads); + GraphOptimizationLevel level = GraphOptimizationLevel::ORT_DISABLE_ALL; + switch (optimize) { + case 0: + level = GraphOptimizationLevel::ORT_DISABLE_ALL; + break; + case 1: + level = GraphOptimizationLevel::ORT_ENABLE_BASIC; + break; + case 2: + level = GraphOptimizationLevel::ORT_ENABLE_EXTENDED; + break; + default: + level = GraphOptimizationLevel::ORT_ENABLE_ALL; + break; + } + options.SetGraphOptimizationLevel(level); + + if (batch_size > 0 && provider_ != OnnxProvider::TRT) { + // Override the default (variable) batch size. 
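+    // Illustrative numbers only: with batch=256 and steps=4 the constructor
+    // builds four sessions, with this override pinning the "batch" dimension
+    // to 256, 512, 768 and 1024 respectively, and ComputeBlocking() picks the
+    // smallest step that covers the pending input.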
+ Ort::ThrowOnError( + OrtGetApiBase() + ->GetApi(ORT_API_VERSION) + ->AddFreeDimensionOverrideByName(options, "batch", batch_size)); + } + + switch (provider_) { + case OnnxProvider::DML: { + std::unordered_map dml_options; + dml_options["device_id"] = std::to_string(gpu_); + dml_options["performance_preference"] = "high_performance"; + options.AppendExecutionProvider("DML", dml_options); + break; + } + case OnnxProvider::TRT: { + options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + + std::string cache_dir = CommandLine::BinaryDirectory() + "/trt_cache"; + std::map trt_options; + trt_options["device_id"] = std::to_string(gpu_); + trt_options["trt_builder_optimization_level"] = std::to_string(std::clamp(optimize, 0, 5)); + trt_options["trt_fp16_enable"] = optimize >= 6 ? "1" : "0"; +#if ORT_API_VERSION >= 23 + trt_options["trt_bf16_enable"] = optimize >= 7 ? "1" : "0"; +#endif + trt_options["trt_int8_enable"] = optimize >= 8 ? "1" : "0"; + trt_options["trt_max_partition_iterations"] = "1000"; + trt_options["trt_min_subgraph_size"] = "1"; + trt_options["trt_engine_cache_enable"] = "1"; + // We need the batch size as well as the hash, as it is set after loading. + std::ostringstream oss; + oss << std::hex << hash; + trt_options["trt_engine_cache_prefix"] = + "Lc0_ONNX_TRT_ORT_" + Ort::GetVersionString() + "_batch_" + + (batch_size < 0 ? std::to_string(batch_size) + : std::to_string(batch_size - batch_size_ + 1) + "-" + + std::to_string(batch_size)) + + "_" + std::to_string(optimize) + "_" + oss.str() + "_"; + trt_options["trt_engine_cache_path"] = cache_dir; + trt_options["trt_timing_cache_enable"] = "1"; + trt_options["trt_timing_cache_path"] = cache_dir; + trt_options["trt_layer_norm_fp32_fallback"] = "1"; + trt_options["trt_force_sequential_engine_build"] = "1"; + trt_options["trt_context_memory_sharing_enable"] = "1"; + // Looks like we need I/O binding to enable this. 
+#ifdef USE_ONNX_CUDART + trt_options["has_user_compute_stream"] = "1"; +#endif + if (batch_size < 0) { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(min_batch_size_) + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_) + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_ / 4) + "x112x8x8"; + } else { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size - batch_size_ + 1) + + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size) + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size) + "x112x8x8"; + } + std::vector keys; + std::vector values; + for (const auto& [key, value] : trt_options) { + keys.push_back(key.c_str()); + values.push_back(value.c_str()); + } + + const auto& api = Ort::GetApi(); + OrtTensorRTProviderOptionsV2* trt_options_v2; + Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options_v2)); + Ort::ThrowOnError(api.UpdateTensorRTProviderOptions( + trt_options_v2, keys.data(), values.data(), keys.size())); +#ifdef USE_ONNX_CUDART + Ort::ThrowOnError(api.UpdateTensorRTProviderOptionsWithValue( + trt_options_v2, "user_compute_stream", compute_stream_)); +#endif + options.AppendExecutionProvider_TensorRT_V2(*trt_options_v2); + api.ReleaseTensorRTProviderOptions(trt_options_v2); + break; + } + case OnnxProvider::ROCM: { + OrtROCMProviderOptions rocm_options; + rocm_options.device_id = gpu_; + options.AppendExecutionProvider_ROCM(rocm_options); + break; + } + case OnnxProvider::MIGRAPHX: { + std::unordered_map migraphx_options; + migraphx_options["device_id"] = std::to_string(gpu_); + migraphx_options["migraphx_exhaustive_tune"] = optimize >= 5 ? "1" : "0"; + migraphx_options["migraphx_fp16_enable"] = optimize >= 6 ? "1" : "0"; + migraphx_options["migraphx_bf16_enable"] = optimize >= 7 ? "1" : "0"; + migraphx_options["migraphx_fp8_enable"] = optimize >= 8 ? "1" : "0"; + std::filesystem::path cache_dir = CommandLine::BinaryDirectory(); + cache_dir /= "migraphx_cache"; + + if (!std::filesystem::exists(cache_dir)) { + std::filesystem::create_directories(cache_dir); + } + migraphx_options["migraphx_model_cache_dir"] = cache_dir.string(); + + options.AppendExecutionProvider("MIGraphX", migraphx_options); + break; + } + case OnnxProvider::CUDA: { + OrtCUDAProviderOptions cuda_options; + cuda_options.device_id = gpu_; +#ifdef USE_ONNX_CUDART + cuda_options.has_user_compute_stream = true; + cuda_options.user_compute_stream = compute_stream_; +#endif + options.AppendExecutionProvider_CUDA(cuda_options); + break; + } + case OnnxProvider::CPU: + // The CPU execution provider is always available. 
+ break; + } + return options; +} + +OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict& opts, + OnnxProvider provider, bool cpu_wdl) + : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"), + capabilities_{file.format().network_format().input(), + file.format().network_format().output(), + file.format().network_format().moves_left()}, + fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16), + bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16), + cpu_wdl_(cpu_wdl), + provider_(provider) { + onnx_env_.DisableTelemetryEvents(); + + gpu_ = opts.GetOrDefault("gpu", 0); + +#ifdef USE_ONNX_CUDART + if (provider_ == OnnxProvider::CUDA || provider_ == OnnxProvider::TRT) { + cudaDeviceProp deviceProp = {}; + if (!cudaGetDeviceProperties(&deviceProp, gpu_)) { + CERR << "GPU: " << deviceProp.name; + CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30) + << " Gb"; + int clockRate = 0; + ReportCUDAErrors( + cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, gpu_)); + CERR << "GPU clock frequency: " << clockRate / 1e3f << " MHz"; + } +#if CUDART_VERSION >= 12080 + int runtime_version; + ReportCUDAErrors(cudaRuntimeGetVersion(&runtime_version)); + if (runtime_version >= 12080) { + int attr; + ReportCUDAErrors( + cudaDeviceGetAttribute(&attr, cudaDevAttrGpuPciDeviceId, gpu_)); + uint32_t pci_device = attr; + CERR << "GPU device ID: " << std::hex << (pci_device & 0xffff) << ":" + << (pci_device >> 16); + ReportCUDAErrors( + cudaDeviceGetAttribute(&attr, cudaDevAttrGpuPciSubsystemId, gpu_)); + uint32_t pci_subsystem = attr; + CERR << "GPU subsystem ID: " << std::hex << (pci_subsystem & 0xffff) + << ":" << (pci_subsystem >> 16) << std::dec; + } +#endif + } +#endif + + int threads = + opts.GetOrDefault("threads", provider == OnnxProvider::CPU ? 1 : 0); + int default_batch = -1; + int default_steps = 1; + int default_min_batch = 1; + switch (provider) { + case OnnxProvider::DML: + case OnnxProvider::MIGRAPHX: + default_batch = 16; + default_steps = 4; + break; + case OnnxProvider::TRT: + default_min_batch = 4; + default: + break; + } + + int optimize = opts.GetOrDefault("optimize", 3); + batch_size_ = opts.GetOrDefault("batch", default_batch); + steps_ = opts.GetOrDefault("steps", default_steps); + min_batch_size_ = opts.GetOrDefault("min_batch", default_min_batch); + + // Sanity checks. + if (batch_size_ <= 0) { + batch_size_ = -1; // Variable batch size. 
+ steps_ = 1; + } + if (batch_size_ * steps_ > max_batch_size_) { + batch_size_ = max_batch_size_ / steps_; + } + + const auto& md = file.onnx_model(); + if (!md.has_input_planes()) { + throw Exception("NN doesn't have input planes defined."); + } + inputs_.emplace_back(md.input_planes()); + if (!md.has_output_policy()) { + throw Exception("NN doesn't have policy head defined."); + } + policy_head_ = outputs_.size(); + outputs_.emplace_back(md.output_policy()); + if (md.has_output_wdl()) { + wdl_head_ = outputs_.size(); + outputs_.emplace_back(md.output_wdl()); + } else if (md.has_output_value()) { + value_head_ = outputs_.size(); + outputs_.emplace_back(md.output_value()); + } else { + throw Exception("NN doesn't have value head."); + } + if (md.has_output_mlh()) { + mlh_head_ = outputs_.size(); + outputs_.emplace_back(md.output_mlh()); + } + uint64_t hash = 0; + if (provider == OnnxProvider::TRT) { + hash = std::hash()(md.model()); + } + switch (provider) { + case OnnxProvider::TRT: + case OnnxProvider::CUDA: +#ifdef USE_ONNX_CUDART + ReportCUDAErrors(cudaSetDevice(gpu_)); + ReportCUDAErrors(cudaStreamCreate(&compute_stream_)); + ReportCUDAErrors(cudaStreamCreate(&upload_stream_)); + ReportCUDAErrors(cudaStreamCreate(&download_stream_)); +#else + CERR << "WARNING: Simplified version without CUDA enhancements."; +#endif + break; + default: + break; + } + + for (int step = 1; step <= steps_; step++) + session_.emplace_back(onnx_env_, file.onnx_model().model().data(), + file.onnx_model().model().size(), + GetOptions(threads, batch_size_ * step, hash, optimize)); +} + +template +std::unique_ptr MakeOnnxNetwork(const std::optional& w, + const OptionsDict& opts) { + if (!w) throw Exception("The ONNX backend requires a network file."); + + if (w->has_onnx_model()) { + return std::make_unique(*w, opts, kProvider, false); + } else { + WeightsToOnnxConverterOptions converter_options; + converter_options.ir = opts.GetOrDefault("ir", -1); + converter_options.alt_mish = opts.GetOrDefault( + "alt_mish", kProvider == OnnxProvider::CPU ? true : false); + converter_options.alt_layernorm = opts.GetOrDefault( + "alt_layernorm", + kProvider == OnnxProvider::DML && + w->format().network_format().ffn_activation() == + pblczero::NetworkFormat::ACTIVATION_RELU_2 + ? true + : false); + converter_options.no_shape = opts.GetOrDefault("no_shape", false); + converter_options.policy_head = + opts.GetOrDefault("policy_head", "vanilla"); + converter_options.value_head = + opts.GetOrDefault("value_head", "winner"); + converter_options.no_wdl_softmax = true; + // No execution provider has a better mish version, some don't even have it. + converter_options.real_mish = false; + + std::string datatype; + if (opts.Exists("datatype")) { + datatype = opts.Get("datatype"); + } else { + bool fp16 = opts.GetOrDefault( + "fp16", kProvider == OnnxProvider::CPU ? false : true); + datatype = fp16 ? "f16" : "f32"; + } + converter_options.data_type = + WeightsToOnnxConverterOptions::StringToDataType(datatype); + converter_options.opset = opts.GetOrDefault( + "opset", converter_options.data_type == + WeightsToOnnxConverterOptions::DataType::kBFloat16 + ? 
22 + : 17); + + auto converted = ConvertWeightsToOnnx(*w, converter_options); + return std::make_unique(converted, opts, kProvider, true); + } +} + +#ifdef USE_MIGRAPHX +REGISTER_NETWORK("onnx-migraphx", MakeOnnxNetwork, 65) +#endif +#ifdef USE_ROCM +REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork, 64) +#endif +#ifdef USE_DML +REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork, 63) +#endif +REGISTER_NETWORK("onnx-trt", MakeOnnxNetwork, 60) +REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork, 61) +REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork, 62) + +} // namespace onnx +} // namespace lczero diff --git a/src/neural/backends/onnx/onnx_kernels.cu b/src/neural/backends/onnx/onnx_kernels.cu new file mode 100644 index 0000000000..1da1d0f232 --- /dev/null +++ b/src/neural/backends/onnx/onnx_kernels.cu @@ -0,0 +1,94 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2025 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. 
+*/ + +#include + +#include "neural/backends/onnx/onnx_kernels.h" +#include "utils/exception.h" + +namespace lczero { +namespace onnx { + +template +__global__ void expandPlanes_kernel(DataType* output, const uint64_t* masks, + const DataType* values, unsigned n) { + unsigned index = threadIdx.x + blockDim.x * blockIdx.x; + index *= bits_per_thread; + unsigned planeIndex = index >> 6; + if (planeIndex >= n) return; + + uint64_t mask = masks[planeIndex]; + unsigned sqIndex = index & 0x3F; + DataType value = static_cast(values[planeIndex]); + DataType op[bits_per_thread] = {}; + mask >>= sqIndex; + for (unsigned i = 0; i < bits_per_thread; i++) { + if (mask & 0x1) { + op[i] = value; + } + mask >>= 1; + } + for (unsigned i = 0; i < bits_per_thread; i++) { + output[index + i] = op[i]; + } +} + +template +void expandPlanesOnnx(DataType* output, const void* input, unsigned n, + cudaStream_t stream) { + constexpr unsigned bits_per_thread = 2; + int threads = n * 8 * 8 / bits_per_thread; + const int blockSize = 256; + int blocks = DivUp(threads, blockSize); + + const uint64_t* masks = static_cast(input); + const DataType* values = reinterpret_cast(masks + n); + + expandPlanes_kernel + <<>>(output, masks, values, n); + + ReportCUDAErrors(cudaGetLastError()); +} + +void CudaError(cudaError_t status, const char* file, int line) { + if (status != cudaSuccess) { + auto err = std::string("CUDA error: ") + cudaGetErrorString(status) + " (" + + file + ":" + std::to_string(line) + ") "; + throw Exception(err); + } +} + +template void expandPlanesOnnx(half* output, const void* input, + unsigned n, cudaStream_t stream); +template void expandPlanesOnnx(float* output, const void* input, + unsigned n, cudaStream_t stream); +template void expandPlanesOnnx<__nv_bfloat16>(__nv_bfloat16* output, + const void* input, unsigned n, + cudaStream_t stream); + +} // namespace onnx +} // namespace lczero diff --git a/src/neural/backends/onnx/onnx_kernels.h b/src/neural/backends/onnx/onnx_kernels.h new file mode 100644 index 0000000000..f16b981da7 --- /dev/null +++ b/src/neural/backends/onnx/onnx_kernels.h @@ -0,0 +1,49 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2025 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ + +#pragma once + +#include +#include +#include + +namespace lczero { +namespace onnx { + +// Expand input planes from bitmask to floating point tensors. It is used as a +// preprocessing step of ONNX models. 
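+// The input pointer holds n uint64_t bit masks followed by n DataType fill
+// values (the device-side layout assembled by the uploader in network_onnx.cc);
+// the kernel writes n * 64 output elements, storing a plane's value at
+// output[p * 64 + sq] for every set bit sq of mask p and zero elsewhere.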
+template +void expandPlanesOnnx(DataType* output, const void* input, unsigned n, + cudaStream_t stream); + +#define ReportCUDAErrors(status) CudaError(status, __FILE__, __LINE__) +void CudaError(cudaError_t status, const char* file, int line); + +inline int DivUp(int a, int b) { return (a + b - 1) / b; } + +} // namespace onnx +} // namespace lczero diff --git a/src/neural/backends/opencl/OpenCL.h b/src/neural/backends/opencl/OpenCL.h index 369aae7b68..08b0324d58 100644 --- a/src/neural/backends/opencl/OpenCL.h +++ b/src/neural/backends/opencl/OpenCL.h @@ -36,7 +36,13 @@ using net_t = float; #include #include +#if __has_include("CL/opencl.hpp") +#include "CL/opencl.hpp" +#elif __has_include("OpenCL/opencl.hpp") +#include "OpenCL/opencl.hpp" +#else #include "opencl.hpp" +#endif #include "neural/backends/opencl/OpenCLBuffers.h" #include "neural/backends/opencl/OpenCLParams.h" diff --git a/src/neural/backends/sycl/common_kernels.dp.cpp b/src/neural/backends/sycl/common_kernels.dp.cpp index 65335e5e6a..8cae7bbf49 100644 --- a/src/neural/backends/sycl/common_kernels.dp.cpp +++ b/src/neural/backends/sycl/common_kernels.dp.cpp @@ -20,7 +20,6 @@ */ #include -#include "dpct/dpct.hpp" #include #include @@ -881,7 +880,7 @@ void globalAvgPool_kernel(T* output, const T* input, "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate __shfl_down_sync. */ - S += dpct::shift_sub_group_left(item_ct1.get_sub_group(), S, offset); + S += sycl::shift_group_left(item_ct1.get_sub_group(), S, offset); } float avg = S / elementsPerWarp; @@ -960,8 +959,10 @@ void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, sycl::range<3>(1, 1, kBlocks) * sycl::range<3>(1, 1, kBlockSize), sycl::range<3>(1, 1, kBlockSize)), [=](sycl::nd_item<3> item_ct1) { - ((sycl::half*)output, (sycl::half*)input, (sycl::half*)scaleBias, - (sycl::half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C, activation); + globalScale_kernel_fp16_nhwc( + (sycl::half*)output, (sycl::half*)input, (sycl::half*)scaleBias, + (sycl::half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C, + activation, item_ct1); }); } else { sycl_queue.parallel_for( @@ -1126,7 +1127,7 @@ void softmax_opt_64_kernel(T* output, const T* input, "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate __shfl_sync. */ - maxval = dpct::select_from_sub_group(item_ct1.get_sub_group(), maxval, 0); + maxval = sycl::select_from_group(item_ct1.get_sub_group(), maxval, 0); ex[0] = sycl::exp(x[0] - maxval); ex[1] = sycl::exp(x[1] - maxval); @@ -1139,7 +1140,7 @@ void softmax_opt_64_kernel(T* output, const T* input, "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate __shfl_sync. 
*/ - Sum = dpct::select_from_sub_group(item_ct1.get_sub_group(), Sum, 0); + Sum = sycl::select_from_group(item_ct1.get_sub_group(), Sum, 0); ex[0] = ex[0] / Sum; ex[1] = ex[1] / Sum; @@ -1162,11 +1163,16 @@ void softmax_opt_64_kernel(T* output, const T* input, // C threads per block, N blocks template void softmax_kernel(T* output, const T* input, const T* input2, - const sycl::nd_item<3> &item_ct1, float &sum, float &maxval) { + const sycl::nd_item<3> &item_ct1, float &localsum, + float &localmax) { int n = item_ct1.get_group(2); int c = item_ct1.get_local_id(2); int C = item_ct1.get_local_range(2); int index = n * C + c; + sycl::atomic_ref maxval(localmax); + sycl::atomic_ref sum(localsum); // softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis) @@ -1183,7 +1189,7 @@ void softmax_kernel(T* output, const T* input, const T* input2, // Get max across warp first, and then update across C dimension float warpmax = warpMax(x, item_ct1); - if ((c & 0x1F) == 0) atomicMaxFloat(&maxval, warpmax); + if ((c & 0x1F) == 0) maxval.fetch_max(warpmax); item_ct1.barrier(sycl::access::fence_space::local_space); @@ -1195,8 +1201,7 @@ void softmax_kernel(T* output, const T* input, const T* input2, // update shared memory sum across C dimension if ((c & 0x1F) == 0) - dpct::atomic_fetch_add(&sum, - val); + sum.fetch_add(val); item_ct1.barrier(sycl::access::fence_space::local_space); @@ -1243,7 +1248,8 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que } } -__dpct_inline__ float shared_sum_for_layer_norm( +[[gnu::always_inline]] +inline float shared_sum_for_layer_norm( float x, const sycl::nd_item<3>& item_ct1, sycl::local_accessor sum) { // compute warp-wide sum @@ -1676,6 +1682,74 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add, }); } +template +static void genOffsetPointers_kernel(T** offsets, int heads, int block_size, + int depth, int d_model, T* k, T* q, T* b1, + T* v, T* b2, + const sycl::nd_item<1>& item_ct) { + const int i = item_ct.get_global_id(0) * kWorkPerThread; + if (i >= block_size) return; + const int h = i % heads; + const int n = i / heads; + int w; + T* res[kWorkPerThread]; + for (w = 0; w < kWorkPerThread; w++) { + res[w] = k + h * depth + 64 * d_model * n + w * depth; + offsets[i + w] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = q + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b1 + i * 64 * 64 + w * 64 * 64; + offsets[i + w + 2 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = v + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 3 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b2 + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 4 * block_size] = res[w]; + } +} + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, + T* v, T* b2, sycl::queue& sycl_queue) { + const int block_size = heads * max_batch; + // Process two elements per thread to use 128 bit store instructions. + constexpr int kWorkPerThread = 2; + constexpr int kWorkGroupSize = 128; + if (block_size % kWorkPerThread != 0) { + // Handle odd block sizes. 
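Illustrative sketch (not part of the patch): the host-side equivalent of the pointer table genOffsetPointers builds on the device, matching the loop this patch removes from EncoderBlock::Eval. It lays out five consecutive blocks of heads * max_batch pointers feeding the batched K, Q, attention-score, V and output GEMMs; the helper name is made up.

    #include <vector>

    template <typename T>
    void FillOffsetPointersReference(std::vector<T*>& table, int heads,
                                     int max_batch, int depth, int d_model,
                                     T* k, T* q, T* b1, T* v, T* b2) {
      const int block = heads * max_batch;
      table.assign(5 * block, nullptr);
      for (int i = 0; i < block; ++i) {
        const int h = i % heads;
        const int n = i / heads;
        table[i] = k + h * depth + 64 * d_model * n;               // K heads
        table[i + block] = q + h * depth + 64 * d_model * n;       // Q heads
        table[i + 2 * block] = b1 + i * 64 * 64;                   // 64x64 score tiles
        table[i + 3 * block] = v + h * depth + 64 * d_model * n;   // V heads
        table[i + 4 * block] = b2 + h * depth + 64 * d_model * n;  // output tiles
      }
    }
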
+ sycl::range<1> global(DivUp(block_size, kWorkGroupSize)); + sycl::range<1> local(kWorkGroupSize); + sycl_queue.parallel_for(sycl::nd_range<1>(global*local, local), + [=](sycl::nd_item<1> item_ct) { + genOffsetPointers_kernel(offsets, heads, block_size, + depth, d_model, k, q, b1, + v, b2, item_ct); + }); + } else { + // Handle even block size + sycl::range<1> global(DivUp(block_size, kWorkGroupSize*kWorkPerThread)); + sycl::range<1> local(kWorkGroupSize); + sycl_queue.parallel_for(sycl::nd_range<1>(global*local, local), + [=](sycl::nd_item<1> item_ct) { + genOffsetPointers_kernel(offsets, heads, block_size, + depth, d_model, k, q, b1, + v, b2, item_ct); + }); + } +} + // Template instantiation. template void copyTypeConverted(sycl::half* op, float* ip, int N, sycl::queue &sycl_queue); template void copyTypeConverted(float* op, sycl::half* ip, int N, sycl::queue &sycl_queue); @@ -1950,5 +2024,13 @@ template void applyInputGating(sycl::half* output, const sycl::half* template void applyInputGating(float* output, const float* input, const float* mult, const float* add, int N, int C, int output_size, sycl::queue &sycl_queue); + +template void genOffsetPointers(float** offsets, int heads, int max_batch, int depth, + int d_model, float* k, float* q, float* b1, + float* v, float* b2, sycl::queue& sycl_queue); + +template void genOffsetPointers(sycl::half** offsets, int heads, int max_batch, int depth, + int d_model, sycl::half* k, sycl::half* q, sycl::half* b1, + sycl::half* v, sycl::half* b2, sycl::queue& sycl_queue); } // namespace sycldnn_backend } // namespace lczero diff --git a/src/neural/backends/sycl/cuBlasContext.h b/src/neural/backends/sycl/cuBlasContext.h index 5e201b82bd..f330ce8150 100644 --- a/src/neural/backends/sycl/cuBlasContext.h +++ b/src/neural/backends/sycl/cuBlasContext.h @@ -61,7 +61,7 @@ class cuBlasContextManager{ #include "hip/hip_runtime.h" -#include "hipblas.h" +#include "hipblas/hipblas.h" class hipBlasContextManager; static hipBlasContextManager *_hipBlasContextManager; diff --git a/src/neural/backends/sycl/fp16_kernels.dp.cpp b/src/neural/backends/sycl/fp16_kernels.dp.cpp index bb89b65a97..a6921e9733 100644 --- a/src/neural/backends/sycl/fp16_kernels.dp.cpp +++ b/src/neural/backends/sycl/fp16_kernels.dp.cpp @@ -20,14 +20,9 @@ */ #include -#include "dpct/dpct.hpp" #include "sycl_common.h" #include "neural/backends/shared/activation.h" -// Allow building on an old architecture. 
-#if DPCT_COMPATIBILITY_TEMP < 530 -#define SKIP_FP16_BITS 1 -#endif #include "winograd_helper.h" namespace lczero { @@ -597,9 +592,7 @@ void OutputInputTransformKernel_fp16_shmem_board( int c = k; // top-left { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -617,9 +610,7 @@ void OutputInputTransformKernel_fp16_shmem_board( // top-right { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -637,9 +628,7 @@ void OutputInputTransformKernel_fp16_shmem_board( // bottom-left { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -657,9 +646,7 @@ void OutputInputTransformKernel_fp16_shmem_board( // bottom-right { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) diff --git a/src/neural/backends/sycl/kernels.h b/src/neural/backends/sycl/kernels.h index 05954e32c2..2330cae9f5 100644 --- a/src/neural/backends/sycl/kernels.h +++ b/src/neural/backends/sycl/kernels.h @@ -20,7 +20,6 @@ */ #include -#include "dpct/dpct.hpp" #include "sycl_common.h" #include "neural/backends/shared/activation.h" @@ -146,5 +145,9 @@ void inputPreprocessForAttentionBody(T* output, const T* input, template void applyInputGating(T* output, const T* input, const T* mult, const T* add, int N, int HW, int C, sycl::queue &sycl_queue); + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, T* v, T* b2, sycl::queue &sycl_queue); } // namespace sycldnn_backend } // namespace lczero diff --git a/src/neural/backends/sycl/layers.cc.dp.cpp b/src/neural/backends/sycl/layers.cc.dp.cpp index fa49425a0e..8a046ab292 100644 --- a/src/neural/backends/sycl/layers.cc.dp.cpp +++ b/src/neural/backends/sycl/layers.cc.dp.cpp @@ -20,7 +20,6 @@ */ #include -#include "dpct/dpct.hpp" #include "layers.h" #include @@ -28,7 +27,7 @@ #include #ifdef USE_HIPBLAS -#include "hipblas.h" +#include "hipblas/hipblas.h" #include "cuBlasContext.h" #elif defined(USE_CUBLAS) #include @@ -46,12 +45,15 @@ #include "neural/network.h" #include "neural/tables/attention_policy_map.h" #include "utils/fp16_utils.h" -#include "dpct/lib_common_utils.hpp" #include #ifdef USE_HIPBLAS +#if hipblasVersionMajor < 3 +#define HIPBLAS_COMPUTE_16F HIPBLAS_R_16F +#define HIPBLAS_COMPUTE_32F HIPBLAS_R_32F +#endif #define transpose_type hipblasOperation_t #define transpose_type_transpose HIPBLAS_OP_T #define transpose_type_notranspose HIPBLAS_OP_N @@ -237,17 +239,15 @@ void SELayer::Eval(int N, float* output, const float* input, sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_, N, C, 
&alpha, w1_, C, op2, C, &beta, op1, numFc1Out_)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) @@ -256,16 +256,14 @@ void SELayer::Eval(int N, float* output, const float* input, sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_, N, C, &alpha, w1_, C, op2, C, &beta, op1, numFc1Out_); - - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -284,33 +282,30 @@ void SELayer::Eval(int N, float* output, const float* input, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, N, numFc1Out_, &alpha, w2_, numFc1Out_, op1, numFc1Out_, &beta, op2, 2 * C)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, N, numFc1Out_, &alpha, w2_, numFc1Out_, op1, numFc1Out_, &beta, op2, 2 * C); - hipStreamSynchronize(hipStreamHandle); }); }); @@ -373,17 +368,15 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_, N, C, &alpha, ((const half *)w1_), C, ((const half *)op2), C, &beta, ((half *)op1), numFc1Out_)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -391,10 +384,9 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = - sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm(handle, transpose_type_transpose, @@ -402,7 +394,6 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu ((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C, &beta, ((hipblasHalf *)op1), numFc1Out_); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -418,9 +409,9 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu sycl_queue_.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + 
cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); // 3. Second fully connected layer. @@ -428,16 +419,13 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu numFc1Out_, &alpha, ((const half *)w2_), numFc1Out_, ((const half *)op1), numFc1Out_, &beta, ((half *)op2), 2 * C)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm( @@ -446,7 +434,6 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu ((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2), 2 * C); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -562,9 +549,9 @@ template <> sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemm(handle, transpose_type_transpose, transpose_type_notranspose, num_outputs, @@ -572,16 +559,13 @@ template <> ((const half *)input_tensor), num_inputs, &beta, ((half *)output_tensor), num_outputs)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm( @@ -590,7 +574,6 @@ template <> num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta, ((hipblasHalf *)output_tensor), num_outputs); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -625,9 +608,9 @@ void FCLayer::Eval(int N, float* output_tensor, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); @@ -636,17 +619,14 @@ void FCLayer::Eval(int N, float* output_tensor, input_tensor, num_inputs, &beta, output_tensor, num_outputs)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); @@ -655,7 +635,6 @@ void FCLayer::Eval(int N, float* output_tensor, input_tensor, num_inputs, &beta, output_tensor, num_outputs); - hipStreamSynchronize(hipStreamHandle); }); }); @@ -939,9 +918,9 @@ template <> 
sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -950,8 +929,6 @@ template <> batchSize, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -959,18 +936,16 @@ template <> hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &alpha, B, HIPBLAS_R_16F, N, N * K, A, HIPBLAS_R_16F, K, K * M, - &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_R_16F, + &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_COMPUTE_16F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -1008,8 +983,8 @@ template <> void BaseLayer::cublasRowMajorMatrixMul(const float* A, const #ifdef USE_CUBLAS sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -1018,25 +993,21 @@ template <> void BaseLayer::cublasRowMajorMatrixMul(const float* A, const batchSize, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, HIPBLAS_R_32F, N, N * K, A, HIPBLAS_R_32F, K, K * M, &floatZero, Out, HIPBLAS_R_32F, N, N * M, - batchSize, HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT); + batchSize, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); - }); }); #else @@ -1192,9 +1163,9 @@ template <> #ifdef USE_CUBLAS sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); @@ -1203,22 +1174,18 @@ template <> N * K, A, CUDA_R_16F, K, 0, &zero_h, Out, CUDA_R_16F, N, N * M, batchSize, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + 
cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &alpha, B, HIPBLAS_R_16F, N, N * K, A, HIPBLAS_R_16F, K, - 0, &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_R_16F, + 0, &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_COMPUTE_16F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -1257,9 +1224,9 @@ void Conv1Layer::cublasSpecialMatrixMul(const float* A, const float* B, #ifdef USE_CUBLAS sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); @@ -1268,25 +1235,21 @@ void Conv1Layer::cublasSpecialMatrixMul(const float* A, const float* B, N * K, A, CUDA_R_32F, K, 0, &floatZero, Out, CUDA_R_32F, N, N * M, batchSize, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, HIPBLAS_R_32F, N, N * K, A, HIPBLAS_R_32F, K, 0, &floatZero, Out, HIPBLAS_R_32F, N, N * M, - batchSize, HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT); - - hipStreamSynchronize(hipStreamHandle); + batchSize, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT); }); }); @@ -1305,14 +1268,11 @@ void Conv1Layer::Eval(int N, DataType* output, const DataType* input, size_t /*scratch_size*/, sycl::queue &sycl_queue, DataType***) { - sycl_queue.wait(); - //CERR << "Conv1Layer::Eval. "; cublasSpecialMatrixMul(weights_, input, output, C, H * W, c_input_, N, sycl_queue); // CERR << "cublasSpecialMatrixMul. "; - sycl_queue.wait(); if (use_bias_){ // CERR << "addBias. " << N << " " << C << " " << H << " " << W; addBias_NCHW(output, output, biases_, N, C, H, W, act_, sycl_queue); @@ -1320,8 +1280,6 @@ void Conv1Layer::Eval(int N, DataType* output, const DataType* input, addVectors(output, output, (DataType*)nullptr, N * C * H * W, N * C * H * W, 0, act_, sycl_queue); // CERR << "addVectors. 
"; } - - sycl_queue.wait(); } template @@ -1792,24 +1750,22 @@ static void cublasXgemm(transpose_type transa, unsigned short alpha_h = FP32toFP16(alpha); unsigned short beta_h = FP32toFP16(beta); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemm( handle, transa, transb, m, n, k, (const half*)&alpha_h, ((const half *)A), lda, ((const half *)B), ldb, (const half*)&beta_h, ((half *)C), ldc)); - cudaStreamSynchronize(cudaStreamHandle); }); }); } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha, (const float*)A, lda, (const float*)B, ldb, &beta, (float*)C, ldc)); - cudaStreamSynchronize(cudaStreamHandle); }); }); @@ -1820,21 +1776,19 @@ static void cublasXgemm(transpose_type transa, unsigned short alpha_h = FP32toFP16(alpha); unsigned short beta_h = FP32toFP16(beta); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A, lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc); - hipStreamSynchronize(hipStreamHandle); }); }); } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasSgemm(handle, transa, transb, m, n, k, &alpha, (const float*)A, lda, (const float*)B, ldb, &beta, (float*)C, ldc); - hipStreamSynchronize(hipStreamHandle); }); }); } @@ -1860,8 +1814,8 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran unsigned short beta_h = FP32toFP16(beta); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -1869,7 +1823,6 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC, batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); }); @@ -1879,9 +1832,9 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); 
cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -1889,7 +1842,6 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); }); }); @@ -1902,34 +1854,32 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B, HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC, - batchCount, HIPBLAS_R_16F, HIPBLAS_GEMM_DEFAULT); + batchCount, HIPBLAS_COMPUTE_16F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transa, transb, m, n, k, &alpha, A, HIPBLAS_R_32F, lda, strideA, B, HIPBLAS_R_32F, ldb, strideB, &beta, C, HIPBLAS_R_32F, ldc, strideC, - batchCount, HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT); + batchCount, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); @@ -1957,16 +1907,14 @@ static void cublasXGemmBatched(transpose_type transa, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemmBatched( handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -1974,16 +1922,14 @@ static void cublasXGemmBatched(transpose_type transa, } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemmBatched( handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, ldb, &beta, (float**)C, ldc, batchCount)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -1999,17 +1945,15 @@ static void cublasXGemmBatched(transpose_type transa, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasHgemmBatched( handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda, (hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, 
batchCount); - hipStreamSynchronize(hipStreamHandle); - }); }); @@ -2017,16 +1961,15 @@ static void cublasXGemmBatched(transpose_type transa, } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasSgemmBatched( handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, ldb, &beta, (float**)C, ldc, batchCount); - hipStreamSynchronize(hipStreamHandle); }); @@ -2179,27 +2122,13 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { - std::vector offsets(encoder_heads_ * max_batch_size_ * 5); - for (int i = 0; i < encoder_heads_ * max_batch_size_; i++) { - int h = i % encoder_heads_; - int n = i / encoder_heads_; - offsets[i] = mha_k + h * depth + 64 * d_model * n; - offsets[i + encoder_heads_ * max_batch_size_] = - mha_q + h * depth + 64 * d_model * n; - offsets[i + 2 * encoder_heads_ * max_batch_size_] = - buffer1 + i * 64 * 64; - offsets[i + 3 * encoder_heads_ * max_batch_size_] = - mha_v + h * depth + 64 * d_model * n; - offsets[i + 4 * encoder_heads_ * max_batch_size_] = - buffer2 + h * depth + 64 * d_model * n; - } *offset_pointers = sycl::malloc_device( encoder_heads_ * max_batch_size_ * 5, sycl_queue_); - - sycl_queue.memcpy(*offset_pointers, offsets.data(), - encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*)).wait(); + genOffsetPointers(*offset_pointers, encoder_heads_, max_batch_size_, + depth, d_model, mha_k, mha_q, buffer1, + mha_v, buffer2, sycl_queue_); } cublasXGemmBatched(transpose_type_transpose, transpose_type_notranspose, diff --git a/src/neural/backends/sycl/layers.h b/src/neural/backends/sycl/layers.h index 6682429510..850e29ecc0 100644 --- a/src/neural/backends/sycl/layers.h +++ b/src/neural/backends/sycl/layers.h @@ -22,8 +22,6 @@ #pragma once #include -#include "dpct/dpct.hpp" -#include "dpct/blas_utils.hpp" #include diff --git a/src/neural/backends/sycl/network_sycl.cc.dp.cpp b/src/neural/backends/sycl/network_sycl.cc.dp.cpp index 873545ab8c..11683c8aae 100644 --- a/src/neural/backends/sycl/network_sycl.cc.dp.cpp +++ b/src/neural/backends/sycl/network_sycl.cc.dp.cpp @@ -22,7 +22,6 @@ #define DPCT_COMPAT_RT_VERSION 12020 #include -#include "dpct/dpct.hpp" #include #include #include @@ -202,28 +201,47 @@ class SyclNetwork : public Network { max_batch_size_ = options.GetOrDefault("max_batch", 1024); + // Get all available platforms + auto platforms = sycl::platform::get_platforms(); + if (platforms.empty()) { + throw Exception("No SYCL platform found."); + } + showPlatformInfo(platforms); + + // A vector to store all sycl devices. + std::vector devices; - int total_gpus = dpct::dev_mgr::instance().device_count(); + for (const auto& platform : platforms) { + auto platform_devices = platform.get_devices(); + devices.insert(devices.end(), platform_devices.begin(), platform_devices.end()); + } - if (gpu_id_ >= total_gpus) + if (gpu_id_ >= (int)devices.size() || gpu_id_ < 0) throw Exception("Invalid GPU Id: " + std::to_string(gpu_id_)); - - //dpct::dev_mgr::instance().get_device(gpu_id_).get_device_info(deviceProp); - - sycl_queue_ = new sycl::queue{dpct::dev_mgr::instance().get_device(gpu_id_), [] (sycl::exception_list exceptions) { - + // Is it a cpu device? 
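Illustrative sketch (not part of the patch): the flat device index used here is simply every device of every platform, in platform order, so a standalone listing like the one below (assumes a SYCL 2020 toolchain) shows which index the backend's gpu option selects.

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      int index = 0;
      for (const auto& platform : sycl::platform::get_platforms()) {
        for (const auto& device : platform.get_devices()) {
          std::cout << index++ << ": "
                    << device.get_info<sycl::info::device::name>()
                    << (device.is_cpu() ? " (CPU)"
                        : device.is_gpu() ? " (GPU)" : " (other)")
                    << "\n";
        }
      }
    }
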
+ is_cpu_ = devices[gpu_id_].is_cpu(); + // Get the number of compute units(execution units). + compute_units_ = devices[gpu_id_].get_info(); + // Get context. + sycl::context context{devices[gpu_id_]}; + auto exceptions_handler = [&] (sycl::exception_list exceptions) { for (std::exception_ptr const& e : exceptions) { - try { - std::rethrow_exception(e); - } catch(sycl::exception const& e) { - - std::cout << "Caught asynchronous SYCL exception during GEMM:\n" << e.what() << std::endl; - } - - } - }, sycl::property_list{sycl::property::queue::in_order{}}}; + try { + std::rethrow_exception(e); + } catch(sycl::exception const& e) { + CERR + << "Caught asynchronous SYCL exception during GEMM:\n" + << e.what() + << "\n "; + std::terminate(); + } + } + }; + + sycl_queue_ = new sycl::queue{context, devices[gpu_id_], + exceptions_handler, sycl::property_list{sycl::property::queue::in_order{}} }; showDeviceInfo(*sycl_queue_); @@ -243,10 +261,12 @@ class SyclNetwork : public Network { if (fp16) { - dpct::has_capability_or_fail(sycl_queue_->get_device(), {sycl::aspect::fp16}); - CERR << "Using Fp16 "; + if (!sycl_queue_->get_device().has(sycl::aspect::fp16)) { + throw Exception("Requested fp16 is not supported by the device"); + } + CERR << "Using Fp16 "; } else { - CERR << "Using Fp32 "; + CERR << "Using Fp32 "; } const int kNumInputPlanes = kInputPlanes; @@ -741,93 +761,72 @@ class SyclNetwork : public Network { batchSize, spare1, flow, spare2, scratch_mem, scratch_size_, io_sycl_queue_, head_offset_pointers); // Entire Attention policy head except for the // policy map - io_sycl_queue_.wait(); if (fp16) { network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer - io_sycl_queue_.wait(); copyTypeConverted(opPol, (sycl::half*)spare2, batchSize * kNumOutputPolicy, io_sycl_queue_); // POLICY output - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer // POLICY output - io_sycl_queue_.wait(); } } else if (conv_policy_) { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy conv1 - io_sycl_queue_.wait(); network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy conv2 - io_sycl_queue_.wait(); if (fp16) { network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer - io_sycl_queue_.wait(); copyTypeConverted(opPol, (sycl::half*)(spare1), batchSize * kNumOutputPolicy, io_sycl_queue_); // POLICY output - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opPol, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer // POLICY output - io_sycl_queue_.wait(); } } else { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // pol conv - io_sycl_queue_.wait(); if (fp16) { network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // pol FC - io_sycl_queue_.wait(); copyTypeConverted(opPol, (sycl::half*)(spare2), batchSize * kNumOutputPolicy, io_sycl_queue_); // POLICY - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // pol FC // POLICY - io_sycl_queue_.wait(); } } - 
// Copy policy output from device memory to host memory. - - io_sycl_queue_.memcpy(io->op_policy_mem_, io->op_policy_mem_gpu_, sizeof(float) * kNumOutputPolicy * batchSize); - io_sycl_queue_.wait(); - // value head if (fp16) { network_[l++]->Eval(batchSize, spare1, flow, spare2, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // value head - io_sycl_queue_.wait(); copyTypeConverted(opVal, (sycl::half*)spare1, wdl_ ? 3 * batchSize : batchSize, io_sycl_queue_); - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // value head - io_sycl_queue_.wait(); } if (moves_left_) { @@ -836,13 +835,9 @@ class SyclNetwork : public Network { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // moves conv or embedding - io_sycl_queue_.wait(); - network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // moves FC1 - io_sycl_queue_.wait(); - // Moves left FC2 if (fp16) { // TODO: consider fusing the bias-add of FC2 with format conversion. @@ -851,30 +846,29 @@ class SyclNetwork : public Network { network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); - io_sycl_queue_.wait(); copyTypeConverted(opMov, (sycl::half*)(spare1), batchSize, io_sycl_queue_); - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); - io_sycl_queue_.wait(); } } + + // Copy policy output from device memory to host memory. + auto event = io_sycl_queue_.memcpy(io->op_policy_mem_, io->op_policy_mem_gpu_, sizeof(float) * kNumOutputPolicy * batchSize); - if (multi_stream_) { - io_sycl_queue_.wait(); - } else { - io_sycl_queue_.wait(); + if (!multi_stream_) { //ReportCUDAErrors( // DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); // The next thread can start using the GPU now. lock_.unlock(); } + event.wait(); + if (wdl_) { // Value softmax done cpu side. for (int i = 0; i < batchSize; i++) { @@ -888,7 +882,7 @@ class SyclNetwork : public Network { float sum = w + d + l; w /= sum; l /= sum; - d = 1.0f - w - l; + d /= sum; io->op_value_mem_shared_[3 * i + 0] = w; io->op_value_mem_shared_[3 * i + 1] = d; io->op_value_mem_shared_[3 * i + 2] = l; @@ -916,14 +910,18 @@ class SyclNetwork : public Network { return capabilities_; } + // Check if device is the cpu for thread handling. + bool IsCpu() const override { return is_cpu_; } + + int GetThreads() const override { return 1 + multi_stream_; } + + int GetMiniBatchSize() const override { + if (is_cpu_) return 47; + // Simple heuristic that seems to work for a wide range of GPUs. + return 2 * compute_units_; + } + std::unique_ptr NewComputation() override { - // Set correct gpu id for this computation (as it might have been called - // from a different thread). - /* - DPCT1093:90: The "gpu_id_" device may be not the one intended for use. - Adjust the selected device if needed. 
- */ - dpct::select_device(gpu_id_); return std::make_unique>(this, wdl_, moves_left_); } @@ -953,6 +951,7 @@ class SyclNetwork : public Network { int gpu_id_; int l2_cache_size_; int max_batch_size_; + int compute_units_; bool wdl_; bool moves_left_; bool use_res_block_winograd_fuse_opt_; // fuse operations inside the residual @@ -960,10 +959,12 @@ class SyclNetwork : public Network { bool multi_stream_; // run multiple parallel network evals bool allow_cache_opt_; // try to fit residual block activations in L2 cache + // Currently only one NN Eval can happen a time (we can fix this if needed // by allocating more memory). mutable std::mutex lock_; - sycl::queue * sycl_queue_; + sycl::queue* sycl_queue_; + bool is_cpu_; int numBlocks_; @@ -997,15 +998,52 @@ class SyclNetwork : public Network { mutable std::mutex inputs_outputs_lock_; std::list> free_inputs_outputs_; - void showDeviceInfo(const sycl::queue & mqueue) const { - CERR << "PLATFORM: " << mqueue.get_device().get_platform().get_info(); - CERR << "GPU: " << mqueue.get_device().get_info(); - CERR << "GPU memory: " << mqueue.get_device().get_info(); - CERR << "GPU clock frequency: " << mqueue.get_device().get_info(); - CERR << "L2 cache capacity: " << mqueue.get_device().get_info(); - CERR << "Global memory Size: " << mqueue.get_device().get_info(); - - } + void showDeviceInfo(const sycl::queue &mqueue) const { + CERR << "Device-Info..."; + CERR << "Platform: " + << mqueue.get_device().get_platform().get_info() + << " selected"; + std::string device_type = mqueue.get_device().is_gpu() ? "GPU" : "CPU"; + CERR << device_type << ": " + << mqueue.get_device().get_info(); + CERR << device_type << ": " + << mqueue.get_device().get_info() / (1024 * 1024) + << " MB (max allocation)"; + CERR << device_type << " clock frequency: " + << mqueue.get_device().get_info() + << " MHz"; + CERR << "L2 cache capacity: " + << mqueue.get_device().get_info() / (1024) + << " KB"; + CERR << "Global memory size: " + << mqueue.get_device().get_info() / (1024 * 1024) + << " MB"; + CERR << "...Device-Info-End"; + } + + void showPlatformInfo(const std::vector& platforms) { + CERR << "Platform-List..."; + for (size_t i = 0; i < platforms.size(); ++i) { + std::string version = platforms[i].get_info(); + + for (const auto& device : platforms[i].get_devices()) { + std::string device_type; + switch (device.get_info()) { + case sycl::info::device_type::gpu: + device_type = "GPU"; break; + case sycl::info::device_type::cpu: + device_type = "CPU"; break; + default: + device_type = "Other"; break; + } + CERR << "Platform " << i << " (version: " << version << "):" << device_type + << " (Name" << ": " + << device.get_platform().get_info() << ")"; + } + } + + CERR << "...Platform-List-End"; + } }; template @@ -1101,16 +1139,17 @@ std::unique_ptr MakeSyclNetworkAuto( const std::optional& weights, const OptionsDict& options) { int gpu_id = options.GetOrDefault("gpu", 0); - try { - CERR << "Trying to switch to [sycl-fp16]..."; - dpct::has_capability_or_fail(dpct::dev_mgr::instance().get_device(gpu_id), - {sycl::aspect::fp16}); - CERR << "Switched to [sycl-fp16]..."; - return MakeSyclNetwork(weights, options); - } catch (std::exception& e) { + auto devices = sycl::device::get_devices(); + if (gpu_id >= devices.size()) { + throw Exception("Invalid GPU ID"); + } + CERR << "Trying to switch to [sycl-fp16]..."; + if (devices[gpu_id].has(sycl::aspect::fp16)) { + CERR << "Switched to [sycl-fp16]..."; + return MakeSyclNetwork(weights, options); + } else { CERR << "Device does not 
support sycl-fp16"; } - CERR << "Switched to [sycl]..."; return MakeSyclNetwork(weights, options); } diff --git a/src/neural/backends/sycl/sycl_common.h b/src/neural/backends/sycl/sycl_common.h index b0241da5e4..bbaee55645 100644 --- a/src/neural/backends/sycl/sycl_common.h +++ b/src/neural/backends/sycl/sycl_common.h @@ -22,8 +22,6 @@ #pragma once #include -#include "dpct/dpct.hpp" -#include "dpct/blas_utils.hpp" #include "utils/exception.h" diff --git a/src/neural/backends/sycl/winograd_helper.h b/src/neural/backends/sycl/winograd_helper.h index 184fa9b4df..175b925506 100644 --- a/src/neural/backends/sycl/winograd_helper.h +++ b/src/neural/backends/sycl/winograd_helper.h @@ -20,12 +20,12 @@ */ #include -#include "dpct/dpct.hpp" namespace lczero { namespace sycldnn_backend { -__dpct_inline__ float mishActivate(float el) { +[[gnu::always_inline]] +inline float mishActivate(float el) { auto e = sycl::native::exp(el); auto n = e * e + 2.0f * e; auto d = el / (n + 2.0f); @@ -35,7 +35,8 @@ __dpct_inline__ float mishActivate(float el) { return el - 2.0f * d; } } -__dpct_inline__ float activate(float cVal, ActivationFunction activation) { +[[gnu::always_inline]] +inline float activate(float cVal, ActivationFunction activation) { switch (activation) { case ACTIVATION_RELU: if (cVal < 0) cVal = 0; @@ -69,8 +70,8 @@ __dpct_inline__ float activate(float cVal, ActivationFunction activation) { } template -__dpct_inline__ void matrixMul_gpu_serial(T* c, const T* a, const T* b) { -#ifndef SKIP_FP16_BITS +[[gnu::always_inline]] +inline void matrixMul_gpu_serial(T* c, const T* a, const T* b) { #pragma unroll for (int i = 0; i < M; ++i) #pragma unroll @@ -80,11 +81,11 @@ __dpct_inline__ void matrixMul_gpu_serial(T* c, const T* a, const T* b) { for (int k = 0; k < K; ++k) S += a[i * K + k] * b[k * N + j]; c[i * N + j] = S; } -#endif } template -__dpct_inline__ void FilterTransform4x4(T* transformed_filter, +[[gnu::always_inline]] +inline void FilterTransform4x4(T* transformed_filter, const T* filter) { // transform applied to filter (of size 3x3) T G[6 * 3] = {1.0f / 4, 0, 0, -1.0f / 6, -1.0f / 6, @@ -102,7 +103,8 @@ __dpct_inline__ void FilterTransform4x4(T* transformed_filter, } template -__dpct_inline__ void InputTransform4x4(T* transformedInput, const T* input) { +[[gnu::always_inline]] +inline void InputTransform4x4(T* transformedInput, const T* input) { // transform applied to input tile (of size 4x4) const T Bt[6 * 6] = {4, 0, -5, 0, 1, 0, 0, -4, -4, 1, 1, 0, 0, 4, -4, -1, 1, 0, 0, -2, -1, 2, 1, 0, @@ -118,7 +120,8 @@ __dpct_inline__ void InputTransform4x4(T* transformedInput, const T* input) { } template -__dpct_inline__ void OutputTransform4x4(T* output, const T* transformedOutput) { +[[gnu::always_inline]] +inline void OutputTransform4x4(T* output, const T* transformedOutput) { // transform applied to result const T At[4 * 6] = {1, 1, 1, 1, 1, 0, 0, 1, -1, 2, -2, 0, 0, 1, 1, 4, 4, 0, 0, 1, -1, 8, -8, 1}; @@ -210,8 +213,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // top-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -229,8 +231,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // top-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -248,8 
+249,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // bottom-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -267,8 +267,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // bottom-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -301,7 +300,6 @@ void OutputTransform_kernel(int N, int C, int se_K, T* output, const T* w2, const T* b2, const sycl::nd_item<3> &item_ct1, float *shared_data) { -#ifndef SKIP_FP16_BITS const bool fp16 = std::is_same::value; int k = item_ct1.get_local_id(2); @@ -442,11 +440,11 @@ void OutputTransform_kernel(int N, int C, int se_K, T* output, *((sycl::uint4*)&board[h][4]); } } -#endif } // fast reduction for the warp -__dpct_inline__ float warpReduce(float x, const sycl::nd_item<3>& item_ct1) { +[[gnu::always_inline]] +inline float warpReduce(float x, const sycl::nd_item<3>& item_ct1) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) /* @@ -462,13 +460,14 @@ __dpct_inline__ float warpReduce(float x, const sycl::nd_item<3>& item_ct1) { device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of "32". */ - x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask); + x += sycl::permute_group_by_xor(item_ct1.get_sub_group(), x, mask); return x; } // fast max reduction for the warp -__dpct_inline__ float warpMax(float x, const sycl::nd_item<3>& item_ct1) { +[[gnu::always_inline]] +inline float warpMax(float x, const sycl::nd_item<3>& item_ct1) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) /* @@ -484,29 +483,16 @@ __dpct_inline__ float warpMax(float x, const sycl::nd_item<3>& item_ct1) { device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of "32". */ - x = sycl::max(x, (float)(dpct::permute_sub_group_by_xor( + x = sycl::max(x, (float)(sycl::permute_group_by_xor( item_ct1.get_sub_group(), x, mask))); return x; } -// atomic max implementation for floats -__dpct_inline__ float atomicMaxFloat(float* addr, float val) { - float max; - max = !sycl::signbit(val) - ? 
sycl::bit_cast(dpct::atomic_fetch_max< - sycl::access::address_space::generic_space>( - (int*)addr, sycl::bit_cast(val))) - : sycl::bit_cast(dpct::atomic_fetch_min< - sycl::access::address_space::generic_space>( - (unsigned int*)addr, sycl::bit_cast(val))); - - return max; -} - // Helper fuction to do vector loads/stores template -__dpct_inline__ void copyAs(void* dst, const void* src) { +[[gnu::always_inline]] +inline void copyAs(void* dst, const void* src) { *((T*)(dst)) = *((const T*)(src)); } @@ -530,7 +516,6 @@ void OutputTransform_SE_relu_InputTransform_kernel( const T* w1, const T* b1, const T* w2, const T* b2, const sycl::nd_item<3>& item_ct1, float* shared_data, sycl::local_accessor shared_sums) { -#ifndef SKIP_FP16_BITS const bool fp16 = std::is_same::value; int k = item_ct1.get_local_id(2); @@ -671,8 +656,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( int c = k; // top-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -690,8 +674,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( // top-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -709,8 +692,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( // bottom-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -728,8 +710,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( // bottom-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -744,7 +725,6 @@ void OutputTransform_SE_relu_InputTransform_kernel( for (int x = 0; x < 6; x++) output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x]; } -#endif } constexpr int kOpInpTransformBlockSize = 64; @@ -760,7 +740,6 @@ register pressure. 
void OutputTransform_relu_InputTransform_kernel( int N, int C, T* output, const T* input, T* skip, const T* bias, const sycl::nd_item<3>& item_ct1) { -#ifndef SKIP_FP16_BITS const bool fp16 = std::is_same::value; int k = item_ct1.get_local_id(2) + @@ -838,8 +817,7 @@ void OutputTransform_relu_InputTransform_kernel( int c = k; // top-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -857,8 +835,7 @@ void OutputTransform_relu_InputTransform_kernel( // top-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -876,8 +853,7 @@ void OutputTransform_relu_InputTransform_kernel( // bottom-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -895,8 +871,7 @@ void OutputTransform_relu_InputTransform_kernel( // bottom-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -911,7 +886,6 @@ void OutputTransform_relu_InputTransform_kernel( for (int x = 0; x < 6; x++) output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x]; } -#endif } template diff --git a/src/neural/backends/xla/network_xla.cc b/src/neural/backends/xla/network_xla.cc index 1d0cad0a71..336f03ef25 100644 --- a/src/neural/backends/xla/network_xla.cc +++ b/src/neural/backends/xla/network_xla.cc @@ -81,10 +81,10 @@ class XlaNetwork : public Network { return std::make_unique(this); } int GetMiniBatchSize() const override { - // 32 is the default prefetch size, subtract it so that backend doesn't - // crash. - // TODO make it better when we have a proper way to query the batch size. - return runner_->GetMaxBatchSize() - 32; + return runner_->GetMaxBatchSize(); + } + int GetPreferredBatchStep() const override { + return runner_->GetPreferredBatchStep(); } private: diff --git a/src/neural/backends/xla/xla_runner.cc b/src/neural/backends/xla/xla_runner.cc index 35b893dc67..0adac0dbd0 100644 --- a/src/neural/backends/xla/xla_runner.cc +++ b/src/neural/backends/xla/xla_runner.cc @@ -170,6 +170,7 @@ void XlaRunner::SetFrozenInputs( } size_t XlaRunner::GetMaxBatchSize() const { return executables_.back().first; } +size_t XlaRunner::GetPreferredBatchStep() const { return executables_.front().first; } std::vector> XlaRunner::ExecuteBlocking( const std::vector& inputs) { diff --git a/src/neural/backends/xla/xla_runner.h b/src/neural/backends/xla/xla_runner.h index 931571362e..4c8f71c374 100644 --- a/src/neural/backends/xla/xla_runner.h +++ b/src/neural/backends/xla/xla_runner.h @@ -34,8 +34,8 @@ #include #include "neural/backends/xla/pjrt.h" -#include "neural/xla/hlo.pb.h" #include "neural/xla/xla_tensor.h" +#include "proto/hlo.pb.h" namespace lczero { @@ -60,6 +60,7 @@ class XlaRunner { // Maximum supported batch size. It's expected that the capacity (not size) of // the input tensors would be able to fit this size. 
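Illustrative sketch (not part of the patch): one plausible way a caller could combine the runner's GetMaxBatchSize with the new GetPreferredBatchStep hint, rounding a requested batch up to the step while staying within the maximum; the function name is made up.

    #include <algorithm>

    int RoundBatchToStep(int requested, int preferred_step, int max_batch) {
      const int rounded =
          (requested + preferred_step - 1) / preferred_step * preferred_step;
      return std::min(rounded, max_batch);
    }
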
   size_t GetMaxBatchSize() const;
+  size_t GetPreferredBatchStep() const;
 
  private:
   std::unique_ptr pjrt_client_;
diff --git a/src/neural/factory.cc b/src/neural/factory.cc
index a458a3b9e7..778fb47c90 100644
--- a/src/neural/factory.cc
+++ b/src/neural/factory.cc
@@ -29,6 +29,7 @@
 
 #include
 
+#include "default_backend.h"
 #include "neural/loader.h"
 #include "neural/shared_params.h"
 #include "utils/commandline.h"
@@ -54,7 +55,15 @@ void NetworkFactory::RegisterNetwork(const std::string& name,
 
 std::vector NetworkFactory::GetBackendsList() const {
   std::vector result;
-  for (const auto& x : factories_) result.emplace_back(x.name);
+#ifdef DEFAULT_BACKEND
+  result.emplace_back(DEFAULT_BACKEND);
+#endif
+  for (const auto& x : factories_) {
+#ifdef DEFAULT_BACKEND
+    if (x.name == result[0]) continue;
+#endif
+    result.emplace_back(x.name);
+  }
   return result;
 }
 
diff --git a/src/neural/factory.h b/src/neural/factory.h
index a3221ed902..fb52c29bb1 100644
--- a/src/neural/factory.h
+++ b/src/neural/factory.h
@@ -108,23 +108,23 @@ class NetworkFactory {
   friend class Register;
 };
 
-#define REGISTER_NETWORK_WITH_COUNTER2(name, func, priority, counter) \
-  namespace { \
-  namespace ns##counter { \
-  static NetworkFactory::Register regH38fhs##counter( \
-      name, \
-      [](const std::optional& w, const OptionsDict& o) { \
-        return func(w, o); \
-      }, \
-      priority); \
-  static BackendManager::Register regK03nv##counter( \
-      std::make_unique( \
-          name, \
-          [](const std::optional& w, const OptionsDict& o) { \
-            return func(w, o); \
-          }, \
-          priority)); \
-  } \
+#define REGISTER_NETWORK_WITH_COUNTER2(name, func, priority, counter) \
+  namespace { \
+  namespace ns##counter { \
+  [[maybe_unused]] static NetworkFactory::Register regH38fhs##counter( \
+      name, \
+      [](const std::optional& w, const OptionsDict& o) { \
+        return func(w, o); \
+      }, \
+      priority); \
+  [[maybe_unused]] static BackendManager::Register regK03nv##counter( \
+      std::make_unique( \
+          name, \
+          [](const std::optional& w, const OptionsDict& o) { \
+            return func(w, o); \
+          }, \
+          priority)); \
+  } \
   }
 
 #define REGISTER_NETWORK_WITH_COUNTER(name, func, priority, counter) \
diff --git a/src/neural/loader.cc b/src/neural/loader.cc
index 5b5840edf8..9b706cabed 100644
--- a/src/neural/loader.cc
+++ b/src/neural/loader.cc
@@ -182,7 +182,8 @@ WeightsFile ParseWeightsProto(const std::string& buffer) {
   }
 
   if (net.has_weights() &&
-      net.format().weights_encoding() != pblczero::Format::LINEAR16) {
+      net.format().weights_encoding() != pblczero::Format::LINEAR16 &&
+      net_ver < GetVersionInt(0, 33, 0)) {
     throw Exception("Invalid weight file: unsupported encoding.");
   }
 
diff --git a/src/neural/network.h b/src/neural/network.h
index b46e63a745..becf424427 100644
--- a/src/neural/network.h
+++ b/src/neural/network.h
@@ -121,6 +121,7 @@ class Network {
   virtual void InitThread(int /*id*/) {}
   virtual bool IsCpu() const { return false; }
   virtual int GetMiniBatchSize() const { return 256; }
+  virtual int GetPreferredBatchStep() const { return 1; }
 
   virtual ~Network() = default;
 };
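The new Network::GetPreferredBatchStep() (default 1) complements GetMiniBatchSize(): the XLA backend reports its smallest compiled executable size as the step and its largest as the maximum. The patch only adds the getters; a hypothetical caller-side use, not taken from this change, might round a requested batch up to a supported size:

#include <algorithm>

// Hypothetical helper: round a requested batch up to a multiple of the
// preferred step, capped at the backend's maximum batch size.
int ChooseBatch(int requested, int preferred_step, int max_batch) {
  const int rounded =
      ((requested + preferred_step - 1) / preferred_step) * preferred_step;
  return std::min(rounded, max_batch);
}
// ChooseBatch(300, 256, 1024) == 512; ChooseBatch(1500, 256, 1024) == 1024.
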
diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc
index 53846353c6..8c54b64973 100644
--- a/src/neural/network_legacy.cc
+++ b/src/neural/network_legacy.cc
@@ -142,7 +142,11 @@ BaseWeights::MHA::MHA(const pblczero::Weights::MHA& mha)
       dense_w(LayerAdapter(mha.dense_w()).as_vector()),
       dense_b(LayerAdapter(mha.dense_b()).as_vector()),
       smolgen(Smolgen(mha.smolgen())),
-      has_smolgen(mha.has_smolgen()) {}
+      has_smolgen(mha.has_smolgen()) {
+  if (mha.has_rpe_q() || mha.has_rpe_k() || mha.has_rpe_v()) {
+    throw Exception("RPE weights file not supported.");
+  }
+}
 
 BaseWeights::FFN::FFN(const pblczero::Weights::FFN& ffn)
     : dense1_w(LayerAdapter(ffn.dense1_w()).as_vector()),
diff --git a/src/neural/onnx/adapters.h b/src/neural/onnx/adapters.h
index e83a9385a7..fc04096d9f 100644
--- a/src/neural/onnx/adapters.h
+++ b/src/neural/onnx/adapters.h
@@ -30,8 +30,8 @@
 #include
 
 #include "neural/onnx/builder.h"
-#include "neural/onnx/onnx.pb.h"
 #include "proto/net.pb.h"
+#include "proto/onnx.pb.h"
 #include "utils/weights_adapter.h"
 
 namespace lczero {
diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc
index fe09d5cb1c..94b7db650a 100644
--- a/src/neural/onnx/builder.cc
+++ b/src/neural/onnx/builder.cc
@@ -30,24 +30,30 @@
 #include
 
 #include "neural/onnx/adapters.h"
-#include "neural/onnx/onnx.pb.h"
 #include "utils/exception.h"
-#include "utils/random.h"
 #include "version.h"
 
 namespace lczero {
 
-OnnxBuilder::OnnxBuilder(int opset) : opset_(opset) {
+OnnxBuilder::OnnxBuilder(int opset, int ir) : opset_(opset) {
   if (opset < 7 || opset > 22) {
     throw Exception("Only ONNX opsets between 7 and 22 are supported.");
   }
-  model_.set_ir_version(4);
+  // Map of latest opset corresponding to IR version.
+  std::map<int, int> opset_to_ir = {{8, 3},  {9, 4},   {10, 5},
+                                    {11, 6}, {14, 7},  {18, 8},
+                                    {20, 9}, {22, 10}, {99, 11}};
+  if (ir < 0) ir = opset_to_ir.upper_bound(opset - 1)->second;
+  if (ir < 3 || ir > 10) {
+    throw Exception("Only ONNX IR between 3 and 10 is supported.");
+  }
+  model_.set_ir_version(ir);
   model_.set_domain("org.lczero.models.*");
   model_.set_producer_name("Lc0");
   model_.set_producer_version(GetVersionStr());
   model_.add_opset_import()->set_version(opset);
-  model_.mutable_graph()->set_name("org.lczero/converted/" +
-                                   Random::Get().GetString(16));
+  // TODO change to real network name when it becomes available.
+  model_.mutable_graph()->set_name("org.lczero/converted");
 }
 
 namespace {
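In the OnnxBuilder constructor above, `opset_to_ir.upper_bound(opset - 1)` returns the first table key that is greater than or equal to the requested opset, so each opset maps to the IR version of the entry that covers it. A standalone sketch of the same lookup, using the table from the hunk:

#include <cassert>
#include <map>

int IrForOpset(int opset) {
  // Same table as in OnnxBuilder: key = latest opset covered by that IR version.
  std::map<int, int> opset_to_ir = {{8, 3},  {9, 4},   {10, 5}, {11, 6}, {14, 7},
                                    {18, 8}, {20, 9},  {22, 10}, {99, 11}};
  return opset_to_ir.upper_bound(opset - 1)->second;
}

int main() {
  assert(IrForOpset(7) == 3);    // opsets up to 8 map to IR 3
  assert(IrForOpset(17) == 8);   // first key >= 17 is 18, so IR 8
  assert(IrForOpset(22) == 10);  // exact boundary
  return 0;
}
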
diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h
index 4ada3c37f7..7fa7323306 100644
--- a/src/neural/onnx/builder.h
+++ b/src/neural/onnx/builder.h
@@ -30,7 +30,7 @@
 #include
 #include
 
-#include "neural/onnx/onnx.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
@@ -45,7 +45,7 @@ class OnnxConst {
 // Builds Onnx::ModelProto.
 class OnnxBuilder {
  public:
-  OnnxBuilder(int opset);
+  OnnxBuilder(int opset, int ir = -1);
   void AddInput(const std::string& name, std::initializer_list dims,
                 pblczero::TensorProto::DataType datatype);
   void AddOutput(const std::string& name, std::initializer_list dims,
diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc
index 07986d4ef6..c59069842a 100644
--- a/src/neural/onnx/converter.cc
+++ b/src/neural/onnx/converter.cc
@@ -250,10 +250,10 @@ std::string Converter::EndOptionalBf16Fix(OnnxBuilder* builder,
 
 std::string Converter::MakeMish(OnnxBuilder* builder, const std::string& input,
                                 const std::string& name) {
-  if (!options_.alt_mish || options_.opset < 9) {
+  if (!options_.alt_mish) {
     std::string flow = input;
     flow = StartOptionalBf16Fix(builder, flow, name);
-    if (options_.opset >= 18) {
+    if (options_.opset >= 18 && options_.real_mish) {
       flow = builder->Mish(name, flow);
       return EndOptionalBf16Fix(builder, flow, name);
     }
@@ -263,29 +263,14 @@ std::string Converter::MakeMish(OnnxBuilder* builder, const std::string& input,
     return builder->Mul(name, flow, input);
   } else {
     auto in = input;
-    if (options_.data_type !=
-        WeightsToOnnxConverterOptions::DataType::kFloat32) {
-      in = builder->Cast(name + "/to_float", in,
-                         pblczero::TensorProto::FLOAT);
-    }
-    const OnnxConst& two =
-        static_cast(FloatOnnxConst({2.0f}, {1}));
-    const OnnxConst& zero =
-        static_cast(FloatOnnxConst({0.0f}, {1}));
-    auto e = builder->Exp(name + "/exp", in);
+    auto one = builder->AddInitializer(name + "/one", *GetScalarConverter(1));
+    auto two = builder->AddInitializer(name + "/two", *GetScalarConverter(2));
+    auto e = builder->Exp(name + "/e", in);
     auto flow = builder->Add(name + "/e+2", e, two);
-    auto n = builder->Mul(name + "/n", e, flow);
-    flow = builder->Add(name + "/n+2", n, two);
-    auto d = builder->Div(name + "/d", in, flow);
-    auto f = builder->Mul(name + "/n*d", n, d);
-    flow = builder->Mul(name + "/2*d", d, two);
-    auto t = builder->Sub(name + "/in-2*d", in, flow);
-    flow = builder->Greater(name + "/compare", in, zero);
-    flow = builder->Where(name, flow, t, f);
-    if (options_.data_type !=
-        WeightsToOnnxConverterOptions::DataType::kFloat32) {
-      flow = builder->Cast(name + "/to_data_type", flow, GetDataType());
-    }
+    flow = builder->Mul(name + "/e*e+2e", e, flow);
+    flow = builder->Div(name + "/2/(e*e+2e)", two, flow);
+    flow = builder->Add(name + "/1+2/(e*e+2e)", flow, one);
+    flow = builder->Div(name + "/in/(1+2/(e*e+2e))", in, flow);
     return flow;
   }
 }
@@ -758,7 +743,7 @@ std::string Converter::MakeAttentionBody(OnnxBuilder* builder,
   if (weights.ip_mult_gate.size() > 0 || weights.ip_add_gate.size() > 0) {
     flow = builder->Reshape(
-        "/attn_body/ma_gating/rehape1", flow,
+        "/attn_body/ma_gating/rehape", flow,
         builder->AddInitializer("/const/ma_gating/shape1",
                                 Int64OnnxConst({-1, 64, embedding_size}, {3})));
     if (weights.ip_mult_gate.size() > 0) {
@@ -771,17 +756,23 @@ std::string Converter::MakeAttentionBody(OnnxBuilder* builder,
           *GetWeghtsConverter(weights.ip_add_gate, {64, embedding_size},
                               {1, 0}));
     }
-    flow = builder->Reshape(
-        "/attn_body/ma_gating/rehape2", flow,
-        builder->AddInitializer("/const/ma_gating/shape2",
-                                Int64OnnxConst({-1, embedding_size}, {2})));
   }
+  flow = builder->Reshape(
+      "/attn_body/rehape", flow,
+      builder->AddInitializer("/const/ma_gating/shape2",
+                              Int64OnnxConst({-1, embedding_size}, {2})));
+
   float alpha = std::pow(2.0f * NumEncBlocks(), -0.25f);
   if (input_embedding == network_format::INPUT_EMBEDDING_PE_DENSE) {
-    flow = MakeFFN(builder, weights.ip_emb_ffn, embedding_size, flow,
-                   "/attn_body", default_activation_, alpha);
+    const auto ffn_activation = static_cast(
+        src_.format().network_format().ffn_activation());
+    flow =
+        MakeFFN(builder, weights.ip_emb_ffn, embedding_size, flow, "/attn_body",
+                ffn_activation == ACTIVATION_DEFAULT ? default_activation_
+                                                     : ffn_activation,
+                alpha);
     flow = MakeLayerNorm(
         builder, flow, "/attn_body/ln2",
         *GetWeghtsConverter(weights.ip_emb_ffn_ln_gammas, {embedding_size}),
@@ -921,7 +912,7 @@ void Converter::MakePolicyHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
                                const std::string& input,
                                const MultiHeadWeights& weights) {
   // Check that selected policy head exists.
-  if (weights.policy_heads.count(options_.policy_head) == 0) {
+  if (!weights.policy_heads.contains(options_.policy_head)) {
     throw Exception("The policy head you specified '" + options_.policy_head +
                     "'" + " does not exist in this net.");
   }
@@ -989,7 +980,7 @@ void Converter::MakeValueHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
                              const std::string& input,
                               const MultiHeadWeights& weights) {
   // Check that selected value head exists.
-  if (weights.value_heads.count(options_.value_head) == 0) {
+  if (!weights.value_heads.contains(options_.value_head)) {
     throw Exception("The value head you specified '" + options_.value_head +
                     "'" + " does not exist in this net.");
   }
@@ -1033,9 +1024,11 @@ void Converter::MakeValueHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
         *GetWeghtsConverter(head.ip2_val_w, {128, 3}, {1, 0}));
     flow = builder->Add("/value/dense2/add", flow,
                         *GetWeghtsConverter(head.ip2_val_b, {3}));
-    auto output = builder->Softmax(options_.output_wdl, flow);
-    builder->AddOutput(output, {options_.batch_size, 3}, GetDataType());
-    onnx->set_output_wdl(output);
+    if (!options_.no_wdl_softmax) {
+      flow = builder->Softmax(options_.output_wdl, flow);
+    }
+    builder->AddOutput(flow, {options_.batch_size, 3}, GetDataType());
+    onnx->set_output_wdl(flow);
   } else {
     flow = builder->MatMul("/value/dense2/matmul", flow,
@@ -1092,15 +1085,15 @@ void Converter::MakeMovesLeftHead(pblczero::OnnxModel* onnx,
       *GetWeghtsConverter(weights.ip2_mov_w, {mlh_fc1_outputs, 1}, {1, 0}));
   flow = builder->Add("/mlh/dense2/add", flow,
                       *GetWeghtsConverter(weights.ip2_mov_b, {1}));
-  flow = MakeActivation(builder, flow, "/mlh/dense2", default_activation_);
-  auto output = builder->Identity(options_.output_mlh, flow);
+  // Explicit ReLU activation.
+  auto output = builder->Relu(options_.output_mlh, flow);
   builder->AddOutput(output, {options_.batch_size, 1}, GetDataType());
   onnx->set_output_mlh(output);
 }
 
 void Converter::GenerateOnnx(pblczero::OnnxModel* onnx) {
   MultiHeadWeights weights(src_.weights());
-  OnnxBuilder builder(options_.opset);
+  OnnxBuilder builder(options_.opset, options_.ir);
 
   if (GetDataType() == pblczero::TensorProto::FLOAT16) {
     onnx->set_data_type(pblczero::OnnxModel::FLOAT16);
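The rewritten alt_mish graph (Exp, Add 2, Mul, Div, Add 1, Div) is an exact algebraic form rather than a piecewise approximation: mish(x) = x * tanh(softplus(x)) = x / (1 + 2 / (e^x * e^x + 2 * e^x)). A quick numerical check of that identity, independent of the converter:

#include <cassert>
#include <cmath>

double MishReference(double x) { return x * std::tanh(std::log1p(std::exp(x))); }

double MishRewritten(double x) {
  // Mirrors the ONNX graph built above: e = exp(x); x / (1 + 2 / (e*e + 2*e)).
  double e = std::exp(x);
  return x / (1.0 + 2.0 / (e * e + 2.0 * e));
}

int main() {
  for (double x = -6.0; x <= 6.0; x += 0.25) {
    assert(std::fabs(MishReference(x) - MishRewritten(x)) < 1e-9);
  }
  return 0;
}
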
diff --git a/src/neural/onnx/converter.h b/src/neural/onnx/converter.h
index 632f65c94b..e6c768aad9 100644
--- a/src/neural/onnx/converter.h
+++ b/src/neural/onnx/converter.h
@@ -29,8 +29,8 @@
 
 #include
 
-#include "neural/onnx/onnx.pb.h"
 #include "proto/net.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
@@ -45,9 +45,12 @@ struct WeightsToOnnxConverterOptions {
   std::string output_mlh = "/output/mlh";
   int batch_size = -1;
   int opset = 17;
-  bool alt_mish = false;  // Use "Mish" approximation (fp32 only).
+  int ir = -1;            // ONNX IR, -1 for auto.
+  bool alt_mish = false;  // Use "Mish" approximation.
+  bool real_mish = true;  // Use "Mish" operator (opset 18+ and !alt_mish).
   bool alt_layernorm = false;  // Discrete "LayerNormalization" implementation.
   bool no_shape = false;       // Avoid use of "Shape" operator.
+  bool no_wdl_softmax = false;  // Skip wdl softmax.
   std::string policy_head = "vanilla";
   std::string value_head = "winner";
 
diff --git a/src/neural/register.cc b/src/neural/register.cc
index 3f37d2a9ce..2a61e99352 100644
--- a/src/neural/register.cc
+++ b/src/neural/register.cc
@@ -29,6 +29,7 @@
 
 #include
 
+#include "default_backend.h"
 #include "neural/shared_params.h"
 
 namespace lczero {
@@ -52,6 +53,12 @@ std::vector BackendManager::GetBackendNames() const {
   std::transform(priority_and_names.begin(), priority_and_names.end(),
                  std::back_inserter(result),
                  [](const std::pair& p) { return p.second; });
+#ifdef DEFAULT_BACKEND
+  std::string name = DEFAULT_BACKEND;
+  auto pos = std::find(result.begin(), result.end(), name);
+  if (pos == result.end()) throw Exception("Unknown backend: " + name);
+  std::rotate(result.begin(), pos, pos + 1);
+#endif
   return result;
 }
 
diff --git a/src/neural/register.h b/src/neural/register.h
index fda3cdda8f..db30ef2a52 100644
--- a/src/neural/register.h
+++ b/src/neural/register.h
@@ -71,9 +71,9 @@ class BackendManager {
   std::vector> algorithms_;
 };
 
-#define REGISTER_BACKEND(factory) \
-  namespace { \
-  static SearchFactory::Register reg29c93##factory( \
-      std::make_unique()); \
+#define REGISTER_BACKEND(factory) \
+  namespace { \
+  [[maybe_unused]] static SearchFactory::Register reg29c93##factory( \
+      std::make_unique()); \
   }
 
 }  // namespace lczero
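In GetBackendNames() above, std::rotate moves the compile-time DEFAULT_BACKEND to the front of the priority-sorted list while preserving the relative order of the remaining backends. A behaviour sketch with made-up backend names:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> result = {"cuda-auto", "onnx-trt", "blas", "eigen"};
  const std::string name = "blas";  // stand-in for DEFAULT_BACKEND
  auto pos = std::find(result.begin(), result.end(), name);
  assert(pos != result.end());
  // Rotate the range [begin, pos + 1) so that *pos becomes the first element.
  std::rotate(result.begin(), pos, pos + 1);
  assert((result ==
          std::vector<std::string>{"blas", "cuda-auto", "onnx-trt", "eigen"}));
  return 0;
}
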
diff --git a/src/neural/wrapper.cc b/src/neural/wrapper.cc
index 2d935f231a..11d9dac78f 100644
--- a/src/neural/wrapper.cc
+++ b/src/neural/wrapper.cc
@@ -34,6 +34,7 @@
 #include "neural/shared_params.h"
 #include "utils/atomic_vector.h"
 #include "utils/fastmath.h"
+#include "utils/trace.h"
 
 namespace lczero {
 namespace {
@@ -121,6 +122,7 @@ class NetworkAsBackendComputation : public BackendComputation {
   void ComputeBlocking() override {
     for (auto& entry : entries_) computation_->AddInput(std::move(entry.input));
     computation_->ComputeBlocking();
+    LCTRACE_FUNCTION_SCOPE;
     for (size_t i = 0; i < entries_.size(); ++i) {
       const EvalResultPtr& result = entries_[i].result;
       if (result.q) *result.q = computation_->GetQVal(i);
@@ -132,6 +134,7 @@ class NetworkAsBackendComputation : public BackendComputation {
   void SoftmaxPolicy(std::span dst,
                      const NetworkComputation* computation, int idx) {
+    LCTRACE_FUNCTION_SCOPE;
     const std::vector& moves = entries_[idx].legal_moves;
     const int transform = entries_[idx].transform;
     // Copy the values to the destination array and compute the maximum.
@@ -192,4 +195,4 @@ std::unique_ptr NetworkAsBackendFactory::Create(
   return std::make_unique(std::move(network), options);
 }
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/hlo_builder.cc b/src/neural/xla/hlo_builder.cc
index 873d2d50a8..d018eb5deb 100644
--- a/src/neural/xla/hlo_builder.cc
+++ b/src/neural/xla/hlo_builder.cc
@@ -536,7 +536,7 @@ std::optional HloBuilder::GetComputationId(
 HloComputation HloBuilder::AddComputation(std::string_view name,
                                           const HloBuilder& builder) {
   std::unordered_map id_map;
-  if (computation_names_.count(std::string(name))) {
+  if (computation_names_.contains(std::string(name))) {
     throw Exception("Computation with name " + std::string(name) +
                     " already exists");
   }
diff --git a/src/neural/xla/hlo_builder.h b/src/neural/xla/hlo_builder.h
index 1211446765..652ccd6326 100644
--- a/src/neural/xla/hlo_builder.h
+++ b/src/neural/xla/hlo_builder.h
@@ -32,7 +32,7 @@
 #include
 #include
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 #include "utils/logging.h"
 
 namespace lczero {
@@ -187,4 +187,4 @@ class HloContext {
   pblczero::XlaOpMetadata saved_metadata_;
 };
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/onnx2hlo.cc b/src/neural/xla/onnx2hlo.cc
index 017618e155..c6211ca2ba 100644
--- a/src/neural/xla/onnx2hlo.cc
+++ b/src/neural/xla/onnx2hlo.cc
@@ -32,8 +32,6 @@
 #include
 #include
 
-#include "neural/onnx/onnx.pb.h"
-#include "neural/xla/hlo.pb.h"
 #include "neural/xla/hlo_builder.h"
 #include "neural/xla/print_hlo.h"
 #include "utils/bf16_utils.h"
@@ -660,7 +658,7 @@ class Onnx2HloConverter {
   bool AllInputsConstant(const pblczero::NodeProto& node) {
     for (const auto& input : node.input()) {
       const std::string name(input);
-      if (initializers_.count(name)) continue;
+      if (initializers_.contains(name)) continue;
       if (auto iter = onnx_name_to_hlo_flow_.find(name);
           iter != onnx_name_to_hlo_flow_.end() &&
          iter->second->opcode() == "constant") {
@@ -1746,4 +1744,4 @@ std::unique_ptr OnnxTensorToXlaTensor(
                                            onnx_tensor.raw_data());
 }
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/onnx2hlo.h b/src/neural/xla/onnx2hlo.h
index e06436915d..ddc7bd16df 100644
--- a/src/neural/xla/onnx2hlo.h
+++ b/src/neural/xla/onnx2hlo.h
@@ -31,9 +31,9 @@
 #include
 #include
 
-#include "neural/onnx/onnx.pb.h"
-#include "neural/xla/hlo.pb.h"
 #include "neural/xla/xla_tensor.h"
+#include "proto/hlo.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
diff --git a/src/neural/xla/print_hlo.h b/src/neural/xla/print_hlo.h
index e906bbe346..c7db16cbfa 100644
--- a/src/neural/xla/print_hlo.h
+++ b/src/neural/xla/print_hlo.h
@@ -29,7 +29,7 @@
 
 #include
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 
 namespace lczero {
 
@@ -43,4 +43,4 @@ struct PrettyPrintHloOptions {
 void PrettyPrintHlo(const pblczero::HloModuleProto& module,
                     PrettyPrintHloOptions options, std::ostream& stream);
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/xla_tensor.h b/src/neural/xla/xla_tensor.h
index b49766b9cf..43f9899522 100644
--- a/src/neural/xla/xla_tensor.h
+++ b/src/neural/xla/xla_tensor.h
@@ -33,7 +33,7 @@
 #include
 #include
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 #include "utils/exception.h"
 
 namespace lczero {
@@ -136,4 +136,4 @@ class XlaMutableTensor : public XlaTensor {
   std::unique_ptr data_;
 };
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/python/weights.h b/src/python/weights.h
index 86dd805f19..53ad0968e4 100644
--- a/src/python/weights.h
+++ b/src/python/weights.h
@@ -235,7 +235,8 @@ class Backend {
 class GameState {
  public:
   GameState(const std::optional startpos,
-            const std::vector& moves) {
+            const std::vector& moves,
+            const bool is_c960) : is_c960_(is_c960) {
     ChessBoard starting_board;
     int no_capture_ply;
     int full_moves;
@@ -246,11 +247,16 @@ class GameState {
                     full_moves * 2 - (starting_board.flipped() ? 1 : 2));
 
     for (const auto& m : moves) {
-      Move move(m, history_.IsBlackToMove());
+      auto board = history_.Last().GetBoard();
+      Move move = board.ParseMove(m);
       history_.Append(move);
     }
   }
 
+  GameState(const std::optional startpos,
+            const std::vector& moves)
+      : GameState(startpos, moves, false) {}
+
   std::unique_ptr as_input(const Backend& backend) const {
     int tmp;
     return std::make_unique(
@@ -264,8 +270,8 @@ class GameState {
     bool is_black = history_.IsBlackToMove();
     std::vector result;
     for (auto m : ms) {
-      if (is_black) m.Mirror();
-      result.push_back(m.as_string());
+      if (is_black) m.Flip();
+      result.push_back(m.ToString(is_c960_));
     }
     return result;
   }
@@ -274,7 +280,7 @@ class GameState {
     auto ms = history_.Last().GetBoard().GenerateLegalMoves();
     std::vector result;
     for (auto m : ms) {
-      result.push_back(m.as_nn_index(/* transform= */ 0));
+      result.push_back(MoveToNNIndex(m, /* transform= */ 0));
     }
     return result;
   }
@@ -287,6 +293,7 @@ class GameState {
 
  private:
   PositionHistory history_;
+  bool is_c960_;
 };
 
 }  // namespace python
diff --git a/src/search/classic/params.cc b/src/search/classic/params.cc
index 0caff4fde3..e61b0f9c88 100644
--- a/src/search/classic/params.cc
+++ b/src/search/classic/params.cc
@@ -525,6 +525,10 @@ const OptionId BaseSearchParams::kUCIRatingAdvId{
 const OptionId BaseSearchParams::kSearchSpinBackoffId{
     "search-spin-backoff", "SearchSpinBackoff",
     "Enable backoff for the spin lock that acquires available searcher."};
+const OptionId BaseSearchParams::kGarbageCollectionDelayId{
+    "garbage-collection-delay", "GarbageCollectionDelay",
+    "The percentage of expected move time until garbage collection starts. "
+    "The delay lets the search find transpositions to freed search tree branches."};
 
 const OptionId SearchParams::kMaxPrefetchBatchId{
     "max-prefetch", "MaxPrefetch",
@@ -626,6 +630,7 @@ void BaseSearchParams::Populate(OptionsParser* options) {
   options->Add(kUCIOpponentId);
   options->Add(kUCIRatingAdvId, -10000.0f, 10000.0f) = 0.0f;
   options->Add(kSearchSpinBackoffId) = false;
+  options->Add(kGarbageCollectionDelayId, 0.0f, 100.0f) = 10.0f;
 }
 
 void SearchParams::Populate(OptionsParser* options) {
@@ -719,7 +724,8 @@ BaseSearchParams::BaseSearchParams(const OptionsDict& options)
           options.Get(kMaxCollisionVisitsScalingEndId)),
       kMaxCollisionVisitsScalingPower(
          options.Get(kMaxCollisionVisitsScalingPowerId)),
-      kSearchSpinBackoff(options_.Get(kSearchSpinBackoffId)) {}
+      kSearchSpinBackoff(options_.Get(kSearchSpinBackoffId)),
+      kGarbageCollectionDelay(options_.Get(kGarbageCollectionDelayId)) {}
 
 SearchParams::SearchParams(const OptionsDict& options)
     : BaseSearchParams(options),
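GarbageCollectionDelay is stored as a percentage of the expected move time (default 10, range 0-100). The consumer of the option is not part of these hunks, so purely as an illustration of the unit, with a hypothetical helper name, converting it to an absolute delay might look like:

#include <chrono>
#include <cstdint>

// Illustrative only: helper and call site are hypothetical, not from this patch.
std::chrono::milliseconds GcDelay(std::chrono::milliseconds expected_move_time,
                                  float gc_delay_percent) {
  return std::chrono::milliseconds(static_cast<int64_t>(
      expected_move_time.count() * gc_delay_percent / 100.0f));
}
// e.g. a 12000 ms expected move time at the default 10% means garbage
// collection starts 1200 ms into the move.
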
diff --git a/src/search/classic/params.h b/src/search/classic/params.h
index 679e3fe4ec..d84dbad5d8 100644
--- a/src/search/classic/params.h
+++ b/src/search/classic/params.h
@@ -159,6 +159,10 @@ class BaseSearchParams {
   }
   bool GetSearchSpinBackoff() const { return kSearchSpinBackoff; }
 
+  float GetGarbageCollectionDelay() const {
+    return kGarbageCollectionDelay;
+  }
+
   // Search parameter IDs.
   static const OptionId kMiniBatchSizeId;
   static const OptionId kCpuctId;
@@ -226,6 +230,7 @@ class BaseSearchParams {
   static const OptionId kUCIOpponentId;
   static const OptionId kUCIRatingAdvId;
   static const OptionId kSearchSpinBackoffId;
+  static const OptionId kGarbageCollectionDelayId;
 
 protected:
   const OptionsDict& options_;
@@ -284,6 +289,7 @@ class BaseSearchParams {
   const int kMaxCollisionVisitsScalingEnd;
   const float kMaxCollisionVisitsScalingPower;
   const bool kSearchSpinBackoff;
+  const float kGarbageCollectionDelay;
 };
 
 class SearchParams : public BaseSearchParams {
diff --git a/src/search/classic/search.cc b/src/search/classic/search.cc
index f38395a38f..7a4d1c7deb 100644
--- a/src/search/classic/search.cc
+++ b/src/search/classic/search.cc
@@ -29,7 +29,6 @@
 
 #include
 #include
-#include
 #include
 #include
 #include
@@ -42,6 +41,7 @@
 #include "utils/fastmath.h"
 #include "utils/random.h"
 #include "utils/spinhelper.h"
+#include "utils/trace.h"
 
 namespace lczero {
 namespace classic {
@@ -282,6 +282,7 @@ void Search::SendUciInfo() REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
             .count();
     if (time_since_first_batch_ms > 0) {
       common_info.nps = total_playouts_ * 1000 / time_since_first_batch_ms;
+      common_info.eps = network_evaluations_ * 1000 / time_since_first_batch_ms;
     }
   }
   common_info.tb_hits = tb_hits_.load(std::memory_order_acquire);
@@ -426,7 +427,7 @@ float Search::GetDrawScore(bool is_odd_depth) const {
 }
 
 namespace {
-inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
                     float draw_score) {
   const auto value = params.GetFpuValue(is_root_node);
   return params.GetFpuAbsolute(is_root_node)
@@ -436,7 +437,7 @@ inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
                     float draw_score) {
 }
 
 // Faster version for if visited_policy is readily available already.
-inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
                     float draw_score, float visited_pol) {
   const auto value = params.GetFpuValue(is_root_node);
   return params.GetFpuAbsolute(is_root_node)
@@ -453,7 +454,10 @@ inline float ComputeCpuct(const SearchParams& params, uint32_t N,
 }
 }  // namespace
 
-std::vector Search::GetVerboseStats(Node* node) const {
+// Ignore the last tuple element when sorting in GetVerboseStats
+static bool operator<(const EdgeAndNode&, const EdgeAndNode&) { return false; }
+
+std::vector Search::GetVerboseStats(const Node* node) const {
   assert(node == root_node_ || node->GetParent() == root_node_);
   const bool is_root = (node == root_node_);
   const bool is_odd_depth = !is_root;
@@ -463,16 +467,14 @@ std::vector Search::GetVerboseStats(const Node* node) const {
   const float cpuct = ComputeCpuct(params_, node->GetN(), is_root);
   const float U_coeff =
       cpuct * std::sqrt(std::max(node->GetChildrenVisits(), 1u));
-  std::vector edges;
-  for (const auto& edge : node->Edges()) edges.push_back(edge);
-
-  std::sort(edges.begin(), edges.end(),
-            [&fpu, &U_coeff, &draw_score](EdgeAndNode a, EdgeAndNode b) {
-              return std::forward_as_tuple(
-                         a.GetN(), a.GetQ(fpu, draw_score) + a.GetU(U_coeff)) <
-                     std::forward_as_tuple(
-                         b.GetN(), b.GetQ(fpu, draw_score) + b.GetU(U_coeff));
-            });
+  std::vector> edges;
+  edges.reserve(node->GetNumEdges());
+  for (const auto& edge : node->Edges()) {
+    edges.emplace_back(edge.GetN(),
+                       edge.GetQ(fpu, draw_score) + edge.GetU(U_coeff),
+                       edge);
+  }
+  std::sort(edges.begin(), edges.end());
 
   auto print = [](auto* oss, auto pre, auto v, auto post, auto w, int p = 0) {
     *oss << pre << std::setw(w) << std::setprecision(p) << v << post;
@@ -544,7 +546,8 @@ std::vector Search::GetVerboseStats(const Node* node) const {
   std::vector infos;
   const auto m_evaluator =
       backend_attributes_.has_mlh ? MEvaluator(params_, node) : MEvaluator();
-  for (const auto& edge : edges) {
+  for (const auto& edge_tuple : edges) {
+    const auto& edge = std::get<2>(edge_tuple);
     float Q = edge.GetQ(fpu, draw_score);
     float M = m_evaluator.GetMUtility(edge, Q);
     std::ostringstream oss;
@@ -622,7 +625,7 @@ void Search::MaybeTriggerStop(const IterationStats& stats,
   // Already responded bestmove, nothing to do here.
   if (bestmove_is_sent_) return;
   // Don't stop when the root node is not yet expanded.
-  if (total_playouts_ + initial_visits_ == 0) return;
+  if (stats.total_nodes == 0) return;
 
   if (!stop_.load(std::memory_order_acquire)) {
     if (stopper_->ShouldStop(stats, hints)) FireStopInternal();
@@ -1104,7 +1107,7 @@ void SearchWorker::RunTasks(int tid) {
           // We got the spin lock, double check we're still in the clear.
           if (nta < tc) {
             id = tasks_taken_.fetch_add(1, std::memory_order_acq_rel);
-            task = &picking_tasks_[id];
+            task = picking_tasks_.data() + id;
            task_taking_started_.store(0, std::memory_order_release);
             break;
           }
@@ -1152,7 +1155,7 @@ void SearchWorker::RunTasks(int tid) {
           break;
         }
       }
-      picking_tasks_[id].complete = true;
+      picking_tasks_.data()[id].complete = true;
       completed_tasks_.fetch_add(1, std::memory_order_acq_rel);
     }
   }
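In the GetVerboseStats hunk above, the sort keys are now std::tuple values of (visits, Q+U, edge). std::tuple's lexicographic operator< only consults the third element on exact ties of the first two, and the file-local operator< on EdgeAndNode that always returns false makes that comparison well-formed while treating tied edges as equivalent. A minimal model of the same trick, with a stand-in edge type:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <tuple>
#include <vector>

struct Edge { const char* move; };  // stand-in for EdgeAndNode
// Required so that tuple comparison compiles; only reached on exact ties,
// where it reports the edges as equivalent.
bool operator<(const Edge&, const Edge&) { return false; }

int main() {
  std::vector<std::tuple<uint32_t, float, Edge>> edges = {
      {10, 0.3f, {"e2e4"}}, {25, 0.1f, {"d2d4"}}, {10, 0.2f, {"g1f3"}}};
  // Lexicographic tuple comparison: visits first, then Q+U, then the edge.
  std::sort(edges.begin(), edges.end());
  assert(std::get<2>(edges.front()).move[0] == 'g');  // (10, 0.2) sorts first
  assert(std::get<0>(edges.back()) == 25);
  return 0;
}
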
@@ -1160,7 +1163,7 @@ void SearchWorker::ExecuteOneIteration() {
   // 1. Initialize internal structures.
-  InitializeIteration(search_->backend_->CreateComputation());
+  InitializeIteration();
 
   if (params_.GetMaxConcurrentSearchers() != 0) {
     std::unique_ptr spin_helper;
@@ -1249,9 +1252,12 @@ void SearchWorker::ExecuteOneIteration() {
 
 // 1. Initialize internal structures.
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-void SearchWorker::InitializeIteration(
-    std::unique_ptr computation) {
-  computation_ = std::move(computation);
+void SearchWorker::InitializeIteration() {
+  LCTRACE_FUNCTION_SCOPE;
+  // Free the old computation before allocating a new one. This works better
+  // when backend caches buffer allocations between computations.
+  computation_.reset();
+  computation_ = search_->backend_->CreateComputation();
   minibatch_.clear();
   minibatch_.reserve(2 * target_minibatch_size_);
 }
@@ -1282,6 +1288,7 @@ int CalculateCollisionsLeft(int64_t nodes, const SearchParams& params) {
 }  // namespace
 
 void SearchWorker::GatherMinibatch() {
+  LCTRACE_FUNCTION_SCOPE;
   // Total number of nodes to process.
   int minibatch_size = 0;
   int cur_n = 0;
@@ -1383,6 +1390,7 @@ void SearchWorker::GatherMinibatch() {
       }
     }
     if (some_ooo) {
+      LCTRACE_FUNCTION_SCOPE;
      SharedMutex::Lock lock(search_->nodes_mutex_);
       for (int i = static_cast(minibatch_.size()) - 1; i >= new_start;
            i--) {
@@ -1407,6 +1415,7 @@ void SearchWorker::GatherMinibatch() {
       }
     }
 
+    LCTRACE_FUNCTION_SCOPE;
     // Check for stop at the end so we have at least one node.
     for (size_t i = new_start; i < minibatch_.size(); i++) {
       auto& picked_node = minibatch_[i];
@@ -1435,6 +1444,7 @@ void SearchWorker::GatherMinibatch() {
 
 void SearchWorker::ProcessPickedTask(int start_idx, int end_idx,
                                      TaskWorkspace* workspace) {
+  LCTRACE_FUNCTION_SCOPE;
   auto& history = workspace->history;
   history = search_->played_history_;
 
@@ -1565,6 +1575,7 @@ void SearchWorker::PickNodesToExtendTask(
     const std::vector& moves_to_base,
     std::vector* receiver,
     TaskWorkspace* workspace) NO_THREAD_SAFETY_ANALYSIS {
+  LCTRACE_FUNCTION_SCOPE;
   // TODO: Bring back pre-cached nodes created outside locks in a way that works
   // with tasks.
   // TODO: pre-reserve visits_to_perform for expected depth and likely maximum
@@ -2004,22 +2015,9 @@ void SearchWorker::ExtendNode(Node* node, int depth,
   node->CreateEdges(legal_moves);
 }
 
-// Returns whether node was already in cache.
-bool SearchWorker::AddNodeToComputation(Node* node) {
-  std::vector moves;
-  if (node && node->HasChildren()) {
-    moves.reserve(node->GetNumEdges());
-    for (const auto& edge : node->Edges()) moves.emplace_back(edge.GetMove());
-  } else {
-    moves = history_.Last().GetBoard().GenerateLegalMoves();
-  }
-  return computation_->AddInput(EvalPosition{history_.GetPositions(), moves},
-                                EvalResultPtr{}) ==
-         BackendComputation::FETCHED_IMMEDIATELY;
-}
-
 // 2b. Copy collisions into shared collisions.
 void SearchWorker::CollectCollisions() {
+  LCTRACE_FUNCTION_SCOPE;
   SharedMutex::Lock lock(search_->nodes_mutex_);
 
   for (const NodeToProcess& node_to_process : minibatch_) {
@@ -2033,6 +2031,7 @@ ...
 // 3. Prefetch into cache.
 // ~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::MaybePrefetchIntoCache() {
+  LCTRACE_FUNCTION_SCOPE;
   // TODO(mooskagh) Remove prefetch into cache if node collisions work well.
   // If there are requests to NN, but the batch is not full, try to prefetch
   // nodes which are likely useful in future.
@@ -2056,13 +2055,17 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) {
   // We are in a leaf, which is not yet being processed.
   if (!node || node->GetNStarted() == 0) {
-    if (AddNodeToComputation(node)) {
+    if (search_->backend_->GetCachedEvaluation(
+            EvalPosition{history_.GetPositions(), {}})) {
       // Make it return 0 to make it not use the slot, so that the function
       // tries hard to find something to cache even among unpopular moves.
       // In practice that slows things down a lot though, as it's not always
       // easy to find what to cache.
       return 1;
     }
+    auto moves = history_.Last().GetBoard().GenerateLegalMoves();
+    computation_->AddInput(EvalPosition{history_.GetPositions(), moves},
+                           EvalResultPtr{});
     return 1;
   }
 
@@ -2146,6 +2149,7 @@ void SearchWorker::RunNNComputation() {
 // 5. Retrieve NN computations (and terminal values) into nodes.
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::FetchMinibatchResults() {
+  LCTRACE_FUNCTION_SCOPE;
   // Populate NN/cached results, or terminal results, into nodes.
   for (auto& node_to_process : minibatch_) {
     FetchSingleNodeResult(&node_to_process);
@@ -2194,6 +2198,7 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process) {
 // 6. Propagate the new nodes' information to all their parents in the tree.
 // ~~~~~~~~~~~~~~
 void SearchWorker::DoBackupUpdate() {
+  LCTRACE_FUNCTION_SCOPE;
   // Nodes mutex for doing node updates.
   SharedMutex::Lock lock(search_->nodes_mutex_);
 
@@ -2287,6 +2292,9 @@ void SearchWorker::DoBackupUpdateSingleNode(
     }
   }
   search_->total_playouts_ += node_to_process.multivisit;
+  if (node_to_process.nn_queried && !node_to_process.is_cache_hit) {
+    search_->network_evaluations_++;
+  }
   search_->cum_depth_ += node_to_process.depth * node_to_process.multivisit;
   search_->max_depth_ = std::max(search_->max_depth_, node_to_process.depth);
 }
@@ -2363,6 +2371,7 @@ bool SearchWorker::MaybeSetBounds(Node* p, float m, int* n_to_fix,
 // 7. Update the Search's status and progress information.
 //~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::UpdateCounters() {
+  LCTRACE_FUNCTION_SCOPE;
   search_->PopulateCommonIterationStats(&iteration_stats_);
   search_->MaybeTriggerStop(iteration_stats_, &latest_time_manager_hints_);
   search_->MaybeOutputInfo();
diff --git a/src/search/classic/search.h b/src/search/classic/search.h
index c267522e04..34293f3173 100644
--- a/src/search/classic/search.h
+++ b/src/search/classic/search.h
@@ -126,7 +126,7 @@ class Search {
 
   // Returns verbose information about given node, as vector of strings.
   // Node can only be root or ponder (depth 1).
-  std::vector GetVerboseStats(Node* node) const;
+  std::vector GetVerboseStats(const Node* node) const;
 
   // Returns the draw score at the root of the search. At odd depth pass true to
   // the value of @is_odd_depth to change the sign of the draw score.
@@ -181,6 +181,7 @@ class Search {
   Edge* last_outputted_info_edge_ GUARDED_BY(nodes_mutex_) = nullptr;
   ThinkingInfo last_outputted_uci_info_ GUARDED_BY(nodes_mutex_);
   int64_t total_playouts_ GUARDED_BY(nodes_mutex_) = 0;
+  int64_t network_evaluations_ GUARDED_BY(nodes_mutex_) = 0;
   int64_t total_batches_ GUARDED_BY(nodes_mutex_) = 0;
   // Maximum search depth = length of longest path taken in PickNodetoExtend.
   uint16_t max_depth_ GUARDED_BY(nodes_mutex_) = 0;
@@ -278,7 +279,7 @@ class SearchWorker {
   // The same operations one by one:
   // 1. Initialize internal structures.
   // @computation is the computation to use on this iteration.
-  void InitializeIteration(std::unique_ptr computation);
+  void InitializeIteration();
 
   // 2. Gather minibatch.
   void GatherMinibatch();
 
@@ -398,15 +399,14 @@ class SearchWorker {
   };
 
   NodeToProcess PickNodeToExtend(int collision_limit);
-  bool AddNodeToComputation(Node* node);
   int PrefetchIntoCache(Node* node, int budget, bool is_odd_depth);
   void DoBackupUpdateSingleNode(const NodeToProcess& node_to_process);
   // Returns whether a node's bounds were set based on its children.
   bool MaybeSetBounds(Node* p, float m, int* n_to_fix, float* v_delta,
                       float* d_delta, float* m_delta) const;
   void PickNodesToExtend(int collision_limit);
-  void PickNodesToExtendTask(Node* starting_point, int collision_limit,
-                             int base_depth,
+  void PickNodesToExtendTask(Node* starting_point, int base_depth,
+                             int collision_limit,
                              const std::vector& moves_to_base,
                              std::vector* receiver,
                              TaskWorkspace* workspace);
@@ -432,7 +432,6 @@ class SearchWorker {
   PositionHistory history_;
   int number_out_of_order_ = 0;
   const SearchParams& params_;
-  std::unique_ptr precached_node_;
   const bool moves_left_support_;
   IterationStats iteration_stats_;
   StoppersHints latest_time_manager_hints_;
diff --git a/src/search/classic/stoppers/common.cc b/src/search/classic/stoppers/common.cc
index 8ca61c4c00..313176bd9c 100644
--- a/src/search/classic/stoppers/common.cc
+++ b/src/search/classic/stoppers/common.cc
@@ -131,8 +131,8 @@ void PopulateCommonUciStoppers(ChainedSearchStopper* stopper,
   }
 
   // "go nodes" stopper.
-  int64_t node_limit = 0;
-  if (params.nodes) {
+  int64_t node_limit = 4000000000;
+  if (params.nodes.has_value()) {
     if (options.Get(kNodesAsPlayoutsId)) {
       stopper->AddStopper(std::make_unique(
          *params.nodes, options.Get(kSmartPruningFactorId) > 0.0f));
@@ -140,8 +140,7 @@ void PopulateCommonUciStoppers(ChainedSearchStopper* stopper,
       node_limit = *params.nodes;
     }
   }
-  // always limit nodes to avoid exceeding the limit 4000000000. That number is
-  // default when node_limit = 0.
+  // Always limit nodes to avoid exceeding the limit 4000000000.
   stopper->AddStopper(std::make_unique(
       node_limit, options.Get(kSmartPruningFactorId) > 0.0f));
 
diff --git a/src/search/classic/stoppers/stoppers.cc b/src/search/classic/stoppers/stoppers.cc
index 896d39a38a..5cf4ecd092 100644
--- a/src/search/classic/stoppers/stoppers.cc
+++ b/src/search/classic/stoppers/stoppers.cc
@@ -97,8 +97,12 @@ MemoryWatchingStopper::MemoryWatchingStopper(int ram_limit_mb, uint32_t nodes,
                                              bool populate_remaining_playouts)
     : VisitsStopper(
-          (ram_limit_mb * 1000000LL - total_memory + avg_node_size * nodes) /
-              avg_node_size,
+          [&]() -> size_t {
+            const auto ram_limit = ram_limit_mb * 1000000LL;
+            const auto nodes_memory = avg_node_size * nodes;
+            if (ram_limit + nodes_memory < total_memory) return 0;
+            return (ram_limit + nodes_memory - total_memory) / avg_node_size;
+          }(),
           populate_remaining_playouts) {
   LOGFILE << "RAM limit " << ram_limit_mb << "MB. Memory allocated is "
           << (total_memory - avg_node_size * nodes) / 1000000
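The lambda above clamps the MemoryWatchingStopper budget at zero when the configured RAM limit plus the memory already attributed to existing nodes is below the total memory in use, instead of computing a negative (and, as size_t, wrapped-around) limit. A worked example with made-up numbers:

#include <cassert>
#include <cstdint>

int64_t NodeBudget(int ram_limit_mb, int64_t total_memory, int64_t avg_node_size,
                   int64_t nodes) {
  const int64_t ram_limit = ram_limit_mb * 1000000LL;
  const int64_t nodes_memory = avg_node_size * nodes;
  if (ram_limit + nodes_memory < total_memory) return 0;  // clamp instead of underflow
  return (ram_limit + nodes_memory - total_memory) / avg_node_size;
}

int main() {
  // 2000 MB limit, 1.5 GB in use, 300 bytes/node, 1M nodes already counted:
  // (2e9 + 3e8 - 1.5e9) / 300 = 2,666,666 further visits allowed.
  assert(NodeBudget(2000, 1500000000, 300, 1000000) == 2666666);
  // If usage already exceeds limit + node memory, the budget is simply 0.
  assert(NodeBudget(1000, 2000000000, 300, 1000000) == 0);
  return 0;
}
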
diff --git a/src/search/classic/stoppers/stoppers.h b/src/search/classic/stoppers/stoppers.h
index 3cc220129c..7232d8000e 100644
--- a/src/search/classic/stoppers/stoppers.h
+++ b/src/search/classic/stoppers/stoppers.h
@@ -54,7 +54,7 @@ class ChainedSearchStopper : public SearchStopper {
 class VisitsStopper : public SearchStopper {
  public:
   VisitsStopper(int64_t limit, bool populate_remaining_playouts)
-      : nodes_limit_(limit ? limit : 4000000000ll),
+      : nodes_limit_(limit),
         populate_remaining_playouts_(populate_remaining_playouts) {}
   int64_t GetVisitsLimit() const { return nodes_limit_; }
   bool ShouldStop(const IterationStats&, StoppersHints*) override;
diff --git a/src/search/classic/wrapper.cc b/src/search/classic/wrapper.cc
index d0935448a6..8ba6f0fd49 100644
--- a/src/search/classic/wrapper.cc
+++ b/src/search/classic/wrapper.cc
@@ -30,7 +30,8 @@
 #include "search/classic/stoppers/factory.h"
 #include "search/register.h"
 #include "search/search.h"
-#include "src/neural/shared_params.h"
+#include "neural/shared_params.h"
+#include "utils/trace.h"
 
 namespace lczero {
 namespace classic {
@@ -97,18 +98,21 @@ MoveList StringsToMovelist(const std::vector& moves,
 }
 
 void ClassicSearch::NewGame() {
+  LCTRACE_FUNCTION_SCOPE;
   search_.reset();
   tree_.reset();
   time_manager_ = MakeTimeManager(*options_);
 }
 
 void ClassicSearch::SetPosition(const GameState& pos) {
+  LCTRACE_FUNCTION_SCOPE;
   if (!tree_) tree_ = std::make_unique();
   const bool is_same_game = tree_->ResetToPosition(pos);
   if (!is_same_game) time_manager_ = MakeTimeManager(*options_);
 }
 
 void ClassicSearch::StartSearch(const GoParams& params) {
+  LCTRACE_FUNCTION_SCOPE;
   auto forwarder = std::make_unique(uci_responder_);
   if (options_->Get