diff --git a/.circleci/config.yml b/.circleci/config.yml index 1c5bdbc82d..6404b6bb28 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,24 +9,26 @@ jobs: name: Install build tools command: | apt-get update - apt-get -y install git python3-pip gcc-10 g++-10 clang-12 zlib1g zlib1g-dev + apt-get -y install git python3-pip gcc-10 g++-10 clang-12 zlib1g zlib1g-dev wget pip3 install meson==0.63 pip3 install ninja - run: - name: "Pull Submodules" - command: git submodule update --init + name: Install onnxruntime + command: | + wget https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/onnxruntime-linux-x64-1.22.0.tgz -P /tmp + tar xzf /tmp/onnxruntime-linux-x64-1.22.0.tgz -C /tmp - run: name: Meson GCC environment: CC: gcc-10 CXX: g++-10 - command: meson build-gcc -Dgtest=false + command: meson build-gcc -Dgtest=false -Donnx_include=/tmp/onnxruntime-linux-x64-1.22.0/include -Donnx_libdir=/tmp/onnxruntime-linux-x64-1.22.0/lib - run: name: Meson Clang environment: CC: clang-12 CXX: clang++-12 - command: meson build-clang -Dgtest=false -Db_lto=false + command: meson build-clang -Dgtest=false -Db_lto=false -Donnx_include=/tmp/onnxruntime-linux-x64-1.22.0/include -Donnx_libdir=/tmp/onnxruntime-linux-x64-1.22.0/lib - run: name: Build GCC command: | @@ -39,13 +41,9 @@ jobs: ninja -j 4 "mac": macos: - xcode: 14.1.0 - resource_class: macos.m1.medium.gen1 + xcode: 14.3.1 steps: - checkout - - run: - name: "Pull Submodules" - command: git submodule update --init - run: name: Install build tools command: | @@ -71,43 +69,63 @@ jobs: command: lipo -create -o /tmp/lc0 build/lc0 build-arm/lc0 - store_artifacts: path: /tmp/lc0 - destination: lc0-macos_12.6.1 + destination: lc0-macos_13.2.1 - run: - name: Verify Workspace + name: Prepare Workspace command: | - mv /tmp/lc0 /tmp/lc0-macos_12.6.1 - ls -lah /tmp + mkdir -p workspace + mv /tmp/lc0 workspace - persist_to_workspace: - root: /tmp + root: workspace paths: - - lc0-macos_12.6.1 - + - lc0 + "mac latest": + macos: + xcode: 26.1.0 + steps: + - checkout + - run: + name: Install build tools + command: | + pip3 install meson + pip3 install ninja + - run: + name: Build lc0 arm + command: | + meson build-arm --buildtype=release -Dgtest=false -Dopencl=false + cd build-arm + ninja "upload-github-release": macos: - xcode: 14.1.0 + xcode: 14.3.1 steps: - attach_workspace: - at: /tmp + at: /tmp/workspace - run: name: Install GitHub CLI command: brew install gh - run: name: Verify Workspace command: | - ls -lah /tmp + ls -lah /tmp/workspace - run: name: Upload to GitHub Release command: | + mv /tmp/workspace/lc0 /tmp/lc0-$CIRCLE_TAG-macos_13.2.1 gh release upload \ "$CIRCLE_TAG" \ - /tmp/lc0-macos_12.6.1 \ - --clobber + /tmp/lc0-$CIRCLE_TAG-macos_13.2.1 \ + --clobber --repo LeelaChessZero/lc0 workflows: version: 2 builds: jobs: - build - - "mac" + - "mac": + filters: + tags: + only: /v[0-9]+(\.[0-9]+)*(\-.+)?/ + - "mac latest" - "upload-github-release": requires: - "mac" diff --git a/.gitmodules b/.gitmodules index 6575e63266..e69de29bb2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "libs/lczero-common"] - path = libs/lczero-common - url = https://github.com/LeelaChessZero/lczero-common.git diff --git a/AUTHORS b/AUTHORS index d1fabfddfe..b7d6010ae3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -8,6 +8,7 @@ almaudoh Aloril Andrew Grant Andy Olsen +Aniebiet Udoh Ankan Ankan Banerjee Anson Hu @@ -19,9 +20,11 @@ Boštjan Mejak Brandon Lin Brett Holman Carlo Wood +Chin-Chang Yang cn4750 Cong Contrad Namiseb (Bonan) +Copilot (bot) 
cwbriscoe danegraphics Daniel Monroe @@ -46,8 +49,10 @@ Francis Li Francois Francois Pays François Pays +Gabe Ganesh Krishnan GBeauregard +Gergely Fülöp Gian-Carlo Pascutto gmorenz Google LLC @@ -56,11 +61,16 @@ Hace Hans Ekbrand Henrik Forstén Ikko Eltociear Ashimine +Jack L Jack Thomson James Horsfall Thomas +jamie jjoshua2 John Newlin +john-sp +Julian-Dominik Helmsen Karl Kfoury +Kathleen Mcgrievy kiilas Kip Hamiltons Kovax @@ -73,6 +83,7 @@ Martin Martin Senft masterkni6 masterkni666 +Menkib Mike Roberts Naphthalin nathan-lc0 @@ -85,13 +96,17 @@ Pan patrik-ha PaulJeFi Pratik Dixit +psykose QxC4eva +Rafal Bielski Raj Reece H. Dunn Ron Wolf Sami Kiminki +Sherman Siu Shreyas Kapur shtayerc +Shukant Pal Simon slash students @@ -108,6 +123,7 @@ Valentin Valeriy Huz Victor Popovici Videodr0me +Viet-Anh Tran Viren6 Yan Zhang -zz4032 \ No newline at end of file +zz4032 diff --git a/README.md b/README.md index a56da72740..0ce7a2a125 100644 --- a/README.md +++ b/README.md @@ -7,33 +7,28 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, s ## Downloading source -Lc0 can be acquired either via a git clone or an archive download from GitHub. Be aware that there is a required submodule which isn't included in source archives. +Lc0 can be acquired either via a git clone or an archive download from GitHub. -For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.31`), which is equivalent to using the latest version tag. +For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.32`), which is equivalent to using the latest version tag. Versioning follows the Semantic Versioning guidelines, with major, minor and patch sections. The training server enforces game quality using the versions output by the client and engine. - Download using git: ```shell -git clone -b release/0.31 --recurse-submodules https://github.com/LeelaChessZero/lc0.git +git clone -b release/0.32 https://github.com/LeelaChessZero/lc0.git ``` If you have cloned already an old version, fetch, view and checkout a new branch: ```shell git fetch --all git branch --all -git checkout -t remotes/origin/release/0.31 +git checkout -t remotes/origin/release/0.32 ``` - -If you prefer to download an archive, you need to also download and place the submodule: - * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.31) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.31) archive is also available) +If you prefer to download an archive: + * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.32) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.32) archive is also available) * Extract - * Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz)) - * Move the second archive into the first archive's `libs/lczero-common/` folder and extract - * The final form should look like `/libs/lczero-common/proto/` Having successfully acquired Lc0 via either of these methods, proceed to the build section below and follow the instructions for your OS. 
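If you are unsure which version you ended up with after the steps above, a quick check with standard git commands can confirm it (a sketch; the expected branch name assumes the `release/0.32` example used above):
```shell
cd lc0
# Print the branch that is currently checked out (expected: release/0.32).
git branch --show-current
# Print the nearest version tag, e.g. v0.32.0.
git describe --tags
```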
@@ -42,13 +37,11 @@ Having successfully acquired Lc0 via either of the bui
Building should be easier now than it was in the past. Please report any problems you have.
-Aside from the git submodule, lc0 requires the Meson build system and at least one backend library for evaluating the neural network, as well as the required `zlib`. (`gtest` is optionally used for the test suite.) If your system already has this library installed, they will be used; otherwise Meson will generate its own copy of the two (a "subproject"), which in turn requires that git is installed (yes, separately from cloning the actual lc0 repository). Meson also requires python and Ninja.
+Building lc0 requires the Meson build system and at least one backend library for evaluating the neural network, as well as a few other libraries. If your system already has these libraries installed, they will be used; otherwise Meson will generate its own copy (a "subproject"), which in turn requires that git is installed (yes, separately from cloning the actual lc0 repository). Meson also requires python and Ninja.
-Backend support includes (in theory) any CBLAS-compatible library for CPU usage, such as OpenBLAS or Intel's DNNL or MKL. For GPUs, OpenCL and CUDA+cudnn are supported, while DX-12 can be used in Windows 10 with latest drivers.
+Backend support includes (in theory) any CBLAS-compatible library for CPU usage, but OpenBLAS or Intel's DNNL are the main ones. For GPUs, the following are supported: CUDA (with optional cuDNN), various flavors of onnxruntime, and Apple's Metal Performance Shaders. There is also experimental SYCL support for AMD and Intel GPUs.
-Finally, lc0 requires a compiler supporting C++17. Minimal versions seem to be g++ v8.0, clang v5.0 (with C++17 stdlib) or Visual Studio 2017.
-
-*Note* that cuda checks the compiler version and stops even with newer compilers, and to work around this we have added the `nvcc_ccbin` build option. This is more of an issue with new Linux versions, but you can get around it by using an earlier version of gcc just for cuda. As an example, adding `-Dnvcc_ccbin=g++-9` to the `build.sh` command line will use g++-9 with cuda instead of the system compiler.
+Finally, lc0 requires a compiler supporting C++20. Minimal versions tested are g++ v10.0, clang v12.0 and Visual Studio 2019 version 16.11.
Given those basics, the OS and backend specific instructions are below.
@@ -56,160 +49,125 @@ Given those basics, the OS and backend specific instructions are below.
#### Generic
-1. Install backend:
-   - If you want to use NVidia graphics cards Install [CUDA](https://developer.nvidia.com/cuda-zone) and [cuDNN](https://developer.nvidia.com/cudnn).
-   - If you want to use AMD graphics cards install OpenCL.
-   - if you want OpenBLAS version Install OpenBLAS (`libopenblas-dev`).
+1. Install backend (also read the detailed instructions in later sections):
+   - If you want to use NVidia graphics cards, install [CUDA](https://developer.nvidia.com/cuda-zone) (and optionally [cuDNN](https://developer.nvidia.com/cudnn)).
+   - If you want to use AMD or Intel graphics cards, you can try SYCL.
+   - If you want BLAS, install either OpenBLAS or DNNL.
2. Install ninja build (`ninja-build`), meson, and (optionally) gtest (`libgtest-dev`).
3. Go to `lc0/`
4. Run `./build.sh`
5. `lc0` will be in `lc0/build/release/` directory
-6. Unzip a [neural network](https://lczero.org/play/networks/bestnets/) in the same directory as the binary.
+6.
Download a [neural network](https://lczero.org/play/networks/bestnets/) to the same directory as the binary (no need to unpack it).
If you want to build with a different compiler, pass the `CC` and `CXX` environment variables:
+```shell
+CC=clang CXX=clang++ ./build.sh
+```
-    CC=clang-6.0 CXX=clang++-6.0 ./build.sh
-
-#### Note on installing CUDA on Ubuntu
-
-Nvidia provides .deb packages. CUDA will be installed in `/usr/local/cuda-10.0` and requires 3GB of diskspace.
-If your `/usr/local` partition doesn't have that much space left you can create a symbolic link before
-doing the install; for example: `sudo ln -s /opt/cuda-10.0 /usr/local/cuda-10.0`
-
-The instructions given on the nvidia website tell you to finish with `apt install cuda`. However, this
-might not work (missing dependencies). In that case use `apt install cuda-10-0`. Afterwards you can
-install the meta package `cuda` which will cause an automatic upgrade to a newer version when that
-comes available (assuming you use `Installer Type deb (network)`, if you'd want that (just cuda-10-0 will
-stay at version 10). If you don't know what to do, only install cuda-10-0.
-
-cuDNN exists of two packages, the Runtime Library and the Developer Library (both a .deb package).
+#### Ubuntu 20.04
-Before you can download the latter you need to create a (free) "developer" account with nvidia for
-which at least a legit email address is required (their website says: The e-mail address is not made public
-and will only be used if you wish to receive a new password or wish to receive certain news or notifications
-by e-mail.). Further they ask for a name, date of birth (not visible later on), country, organisation ("LeelaZero"
-if you have none), primary industry segment ("Other"/none) and which development areas you are interested
-in ("Deep Learning").
+For Ubuntu 20.04 you need meson, ninja and gcc-10 before performing the steps above. The following should work:
+```shell
+apt-get update
+apt-get -y install git python3-pip gcc-10 g++-10 zlib1g zlib1g-dev
+pip3 install meson
+pip3 install ninja
+CC=gcc-10 CXX=g++-10 INSTALL_PREFIX=~/.local ./build.sh
+```
-#### Ubuntu 18.04
+Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
-For Ubuntu 18.04 you need the latest version of meson, libstdc++-8-dev, and clang-6.0 before performing the steps above:
+### Windows
-    sudo apt-get install libstdc++-8-dev clang-6.0 ninja-build pkg-config
-    pip3 install meson --user
-    CC=clang-6.0 CXX=clang++-6.0 INSTALL_PREFIX=~/.local ./build.sh
+Here are the brief instructions for CUDA/cuDNN; for details and other options see `windows-build.md` and the instructions in the following sections.
-Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
+1. Install Microsoft Visual Studio (2019 version 16.11 or later)
+2. Install [CUDA](https://developer.nvidia.com/cuda-zone)
+3. (Optionally install [cuDNN](https://developer.nvidia.com/cudnn)).
+4. Install Python3 if you didn't install it with Visual Studio.
+5. Install Meson: `pip3 install --upgrade meson`
+6. If `CUDA_PATH` is not set (run the `set` command to see the full list of variables), edit `build.cmd` and set `CUDA_PATH` to your CUDA directory
+* If you also want cuDNN, set `CUDNN_PATH` to your cuDNN directory (not needed if it is the same as `CUDA_PATH`).
-#### Ubuntu 16.04
+7. Run `build.cmd`. It will ask permission to delete the build directory, then generate MSVS project and pause.
-For Ubuntu 16.04 you need the latest version of meson, ninja, clang-6.0, and libstdc++-8:
+Then either:
-    wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-    sudo apt-add-repository 'deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-6.0 main'
-    sudo add-apt-repository ppa:ubuntu-toolchain-r/test
-    sudo apt-get update
-    sudo apt-get install clang-6.0 libstdc++-8-dev
-    pip3 install meson ninja --user
-    CC=clang-6.0 CXX=clang++-6.0 INSTALL_PREFIX=~/.local ./build.sh
+8. Hit `Enter` to build it.
+9. Resulting binary will be `build/lc0.exe`
-Make sure that `~/.local/bin` is in your `PATH` environment variable. You can now type `lc0 --help` and start.
+Or:
-#### openSUSE (all versions)
+8. Open generated solution `build/lc0.sln` in Visual Studio and build it yourself.
-Instructions, packages and tools for building on openSUSE are at [openSUSE_install.md](openSUSE_install.md)
+### Mac
-#### Docker
+You will need Xcode and python3 installed. Then you need to install some required packages through Terminal:
-Use https://github.com/vochicong/lc0-docker
-to run latest releases of lc0 and the client inside a Docker container.
+1. Install meson: `pip3 install meson`
+2. Install ninja: `pip3 install ninja`
+Now download the lc0 source, if you haven't already done so, following the instructions earlier in the page.
-### Windows
+3. Go to the lc0 directory.
+4. Run `./build.sh -Dgtest=false`
-Here are the brief instructions for CUDA/CuDNN, for details and other options see `windows-build.md`.
+The compiled Lc0 will be in `build/release`
-0. Install Microsoft Visual Studio (2017 or later)
-1. Install [CUDA](https://developer.nvidia.com/cuda-zone)
-2. Install [cuDNN](https://developer.nvidia.com/cudnn).
-3. Install Python3
-4. Install Meson: `pip3 install --upgrade meson`
-5. Edit `build.cmd`:
+Starting with v0.32.0, we are also offering a pre-compiled version that can be downloaded from the [release page](https://github.com/LeelaChessZero/lc0/releases).
-* Set `CUDA_PATH` with your CUDA directory
-* Set `CUDNN_PATH` with your cuDNN directory (may be the same with CUDA_PATH)
+### CUDA
-6. Run `build.cmd`. It will ask permission to delete the build directory, then generate MSVS project and pause.
+CUDA can be downloaded and installed following the instructions at https://developer.nvidia.com/cuda-zone. The build in most cases will pick it up with no further action. However, if the cuda compiler (`nvcc`) is not found, you can call the build like this: `PATH=/usr/local/cuda/bin:$PATH ./build.sh`, replacing the path with the correct one for `nvcc`.
-Then either:
+*Note* that CUDA uses the system compiler and stops if it doesn't recognize the version, even if newer. This is more of an issue with new Linux versions, but you can get around it with the `nvcc_ccbin` build option to specify a different compiler just for cuda. As an example, adding `-Dnvcc_ccbin=g++-11` to the build command line will use g++-11 with cuda instead of the system compiler.
-7. Hit `Enter` to build it.
-8. Resulting binary will be `build/lc0.exe`
+### ONNX
-Or.
+Lc0 offers several ONNX-based backends, namely onnx-cpu, onnx-cuda, onnx-trt, onnx-rocm and on Windows onnx-dml, utilizing the execution providers offered by onnxruntime.
-7. Open generated solution `build/lc0.sln` in Visual Studio and build yourself.
+Some Linux systems are starting to offer onnxruntime packages, so after installing such a package there is a good chance the Lc0 build will pick it up with no further action required.
Otherwise you can set the `onnx_libdir` and `onnx_include` build options to point to the onnxruntime libraries and include directories respectively. The same options are used if you unpack a package downloaded from the onnxruntime releases page (https://github.com/microsoft/onnxruntime/releases).
-### Mac
+For Windows, we offer pre-compiled packages for onnx-dml and onnx-trt; see the included README for installation instructions.
-First you need to install some required packages through Terminal:
-1. Install brew as per the instructions at https://brew.sh/
-2. Install python3: `brew install python3`
-3. Install meson: `brew install meson`
-4. Install ninja: `brew install ninja`
-5. (For Mac OS 10.14 Mojave, or if the other step 5 fails):
-   * Install developer tools: ``xcode-select --install``
-   * When using Mojave install SDK headers: `installer -pkg /Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg -target /` (if this doesn't work, use `sudo installer` instead of just `installer`.)
+### SYCL
-Or.
+*Note* that SYCL support is new in v0.32.0 and as such is still considered experimental.
-5. (For MacOS 10.15 Catalina, or if the other step 5 fails):
-   * Install Xcode command-line tools: ``xcode-select --install``
-   * Install "XCode Developer Tools" through the app store. (First one on the list of Apps if searched.)
-   * Associate the SDK headers in XCode with a command: export CPATH=\`xcrun --show-sdk-path\`/usr/include
-
-Now download the lc0 source, if you haven't already done so, following the instructions earlier in the page.
+You will need the Intel "oneAPI DPC++/C++ Compiler", "DPC++ Compatibility Tool" and (for an Intel GPU) "oneAPI Math Kernel Library (oneMKL)" or (for an AMD GPU) hipBLAS.
-6. Go to the lc0 directory.
-7. Run `./build.sh -Dgtest=false` (needs step 5)
+The Intel tools can be found in either the "oneAPI Base Toolkit" or "C++ Essentials" packages that can be downloaded from Intel's website, while hipBLAS can be downloaded from AMD as part of ROCm.
-### Raspberry Pi
+The compiler for C code is icx, while for C++ code it is icx on Windows but icpx on Linux.
-You'll need to be running the latest Raspberry Pi OS "buster".
+To build Lc0 with SYCL you need to set the `sycl` build option using `-Dsycl=l0` (that is el zero) for an Intel GPU or `-Dsycl=amd` for (you guessed it) an AMD GPU.
-1. Install OpenBLAS
+You may also have to set the `dpct_include` option to point to the DPC++ Compatibility Tool includes, the `onemkl_include` similarly for the oneMKL includes, or `hip_libdirs` and `hip_include` to the AMD HIP libraries and includes respectively.
+On Linux, a typical session would go like this:
```shell
-git clone https://github.com/xianyi/OpenBLAS.git
-cd OpenBLAS/
-make
-sudo make PREFIX=/usr install
-cd ..
+. /opt/intel/oneapi/setvars.sh --include-intel-llvm
+CC=icx CXX=icpx AR=llvm-ar ./build.sh release -Dgtest=false -Dsycl=l0
```
+The first line is to initialize the build environment and is only needed once per session, while the build line may need modification as described above.
-2. Install Meson
+On Windows you will have to build using `ninja`, which is provided by Visual Studio if you install the CMake component. We provide a `build-sycl.cmd` script that should build just fine for an Intel GPU. This script has not yet been tested with an AMD GPU, so some editing will be required.
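For an AMD GPU on Linux, the invocation is similar; the sketch below is an untested example where the ROCm paths and the gfx identifier are assumptions that must be adjusted to match your installation:
```shell
# Initialize the oneAPI build environment first (once per session), as above.
. /opt/intel/oneapi/setvars.sh --include-intel-llvm
# Hypothetical ROCm location; point hip_include/hip_libdirs at your actual install.
# The amd_gfx identifier (e.g. 90a or 1100) may be detected automatically on some setups.
CC=icx CXX=icpx AR=llvm-ar ./build.sh release -Dgtest=false -Dsycl=amd \
    -Dhip_include=/opt/rocm/include -Dhip_libdirs=/opt/rocm/lib -Damd_gfx=1100
```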
-
```shell
-pip install meson
-pip install ninja
-```
+You can also install the [oneAPI DPC++/C++ Compiler Runtime](https://www.intel.com/content/www/us/en/developer/articles/tool/compilers-redistributable-libraries-by-version.html) so you can run Lc0 without needing to initialize the build environment every time.
-3. Install compiler and standard libraries
+### BLAS
-```shell
-sudo apt install clang-6.0 libstdc++-8-dev
-```
+Lc0 can also run (a bit slowly) on CPU, using matrix multiplication functions from a BLAS library. By default OpenBLAS is used if available, as it seems to offer good performance on a wide range of processors. If your system doesn't offer an OpenBLAS package (e.g. `libopenblas-dev`), or you have a recent processor, you can use DNNL instead. To use DNNL you have to pass `-Ddnnl=true` to the build and specify the directory where it was installed using the `-Ddnnl_dir=` option. For Macs, the Accelerate library will be used.
-4. Clone lc0 and compile
+If the "Intel Implicit SPMD Program Compiler" (`ispc`) is installed, some performance-critical functions will use vectorized code for faster execution.
-```shell
-git clone https://github.com/LeelaChessZero/lc0.git
-cd lc0
-git submodule update --init --recursive
-CC=clang-6.0 CXX=clang++-6.0 ./build.sh -Ddefault_library=static
-```
+*Note* that Lc0 is not able to control the number of threads with all BLAS libraries. Some libraries try to exploit cores aggressively, in which case it may be best to leave the threads set to the default (i.e. automatic) setting.
+
+## Getting help
-5. The resulting binary will be in build/release
+If there is an issue or the above instructions were not clear, you can always ask for help. The fastest way is to ask in the help channel of our [Discord chat](http://lc0.org/chat), but you can also open a [GitHub issue](https://github.com/LeelaChessZero/lc0/issues) (after checking the issue hasn't already been reported).

## Python bindings
@@ -240,8 +198,8 @@ along with Leela Chess. If not, see .
### Additional permission under GNU GPL version 3 section 7 -_The source files of Lc0 with the exception of the BLAS and OpenCL -backends (all files in the `blas` and `opencl` sub-directories) have +_The source files of Lc0 with the exception of the BLAS, OpenCL and SYCL +backends (all files in the `blas`, `opencl` and `sycl` sub-directories) have the following additional permission, as allowed under GNU GPL version 3 section 7:_ diff --git a/appveyor.yml b/appveyor.yml index fa4ea670a3..e68f9f136e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -6,20 +6,24 @@ image: environment: matrix: - NAME: gpu-nvidia-cudnn - - NAME: gpu-nvidia-cuda + - NAME: gpu-nvidia-cuda12 # - NAME: gpu-dx12 # - NAME: gpu-opencl - NAME: cpu-dnnl - NAME: cpu-openblas # - NAME: onednn - - NAME: onnx-dml + - NAME: onnx - NAME: android + - NAME: gpu-nvidia-cuda11 for: - matrix: only: + - NAME: gpu-nvidia-cudnn + - NAME: gpu-nvidia-cuda11 # - NAME: gpu-opencl - NAME: cpu-dnnl + - NAME: cpu-openblas skip_non_tags: true clone_folder: c:\projects\lc0 install: @@ -29,20 +33,21 @@ install: - cmd: set OPENCL=false - cmd: set BLAS=false - cmd: set ONEDNN=false -- cmd: set ONNX_DML=false +- cmd: set ONNX=false - cmd: set GTEST=false - cmd: set ANDROID=false - cmd: IF %NAME%==android set ANDROID=true - cmd: IF %NAME%==gpu-nvidia-cudnn set CUDNN=true - cmd: IF %NAME%==gpu-nvidia-cudnn set CUDA=true -- cmd: IF %NAME%==gpu-nvidia-cuda set CUDA=true +- cmd: IF %NAME%==gpu-nvidia-cuda11 set CUDA=true +- cmd: IF %NAME%==gpu-nvidia-cuda12 set CUDA=true - cmd: IF %NAME%==gpu-dx12 set DX=true - cmd: IF %NAME%==gpu-opencl set OPENCL=true - cmd: IF %NAME%==cpu-dnnl set BLAS=true - cmd: IF %NAME%==cpu-openblas set BLAS=true -- cmd: IF %NAME%==cpu-openblas set GTEST=true - cmd: IF %NAME%==onednn set ONEDNN=true -- cmd: IF %NAME%==onnx-dml set ONNX_DML=true +- cmd: IF %NAME%==onnx set ONNX=true +- cmd: IF %NAME%==onnx set GTEST=true - cmd: set NET=753723 - cmd: set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198 - cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET=791556 @@ -54,9 +59,12 @@ install: - cmd: IF %NAME%==onednn set DNNL_NAME=dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp - cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/borg323/oneDNN/releases/download/v2.7.2/dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip - cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip -oC:\cache -- cmd: IF %NAME%==onnx-dml set ONNX_NAME=onnxruntime-win-x64-dml-1.22 -- cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\%ONNX_NAME% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.0/Microsoft.ML.OnnxRuntime.DirectML.1.22.0.nupkg -- cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\%ONNX_NAME% 7z x Microsoft.ML.OnnxRuntime.DirectML.1.22.0.nupkg -oC:\cache\%ONNX_NAME% +- cmd: IF %NAME%==onnx set ONNX_NAME=onnxruntime-win-x64-dml-1.22.1 +- cmd: IF %NAME%==onnx set ONNX_NAME_TWO=onnxruntime-win-x64-gpu-1.22.1 +- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.1/Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg +- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME% 7z x Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg -oC:\cache\%ONNX_NAME% +- cmd: IF %NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME_TWO% appveyor DownloadFile https://github.com/microsoft/onnxruntime/releases/download/v1.22.1/onnxruntime-win-x64-gpu-1.22.1.zip +- cmd: IF 
%NAME%==onnx IF NOT EXIST C:\cache\%ONNX_NAME_TWO% 7z x onnxruntime-win-x64-gpu-1.22.1.zip -oC:\cache - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS appveyor DownloadFile https://sjeng.org/ftp/OpenBLAS-0.3.3-win-oldthread.zip - cmd: IF %NAME%==cpu-openblas IF NOT EXIST C:\cache\OpenBLAS 7z x OpenBLAS-0.3.3-win-oldthread.zip -oC:\cache\OpenBLAS - cmd: IF %OPENCL%==true nuget install opencl-nug -Version 0.777.77 -OutputDirectory C:\cache @@ -71,11 +79,21 @@ install: - cmd: IF DEFINED CUDNN_INSTALL cuda_10.1.243_win10_network -s nvcc_10.1 cublas_dev_10.1 cublas_10.1 cudart_10.1 - cmd: IF DEFINED CUDNN_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/redist/cudnn/v7.5.1/cudnn-10.1-windows10-x64-v7.5.1.10.zip - cmd: IF DEFINED CUDNN_INSTALL 7z x cudnn-10.1-windows10-x64-v7.5.1.10.zip -o"%CUDA_PATH%" -- cmd: IF %CUDNN%==false set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF %NAME%==gpu-nvidia-cuda11 set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1" +- cmd: IF %NAME%==gpu-nvidia-cuda12 set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" +- cmd: IF %NAME%==onnx set "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" - cmd: IF %CUDA%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 -- cmd: IF DEFINED CUDA_INSTALL appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe -- cmd: IF DEFINED CUDA_INSTALL cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF %ONNX%==true IF NOT EXIST "%CUDA_PATH%" set CUDA_INSTALL=1 +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda11 appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/11.1.0/network_installers/cuda_11.1.0_win10_network.exe +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda11 cuda_11.1.0_win10_network.exe -s nvcc_11.1 cublas_dev_11.1 cublas_11.1 cudart_11.1 documentation_11.1 +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda12 appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==gpu-nvidia-cuda12 cuda_12.9.0_windows_network.exe -s nvcc_12.9 cublas_dev_12.9 cublas_12.9 curand_dev_12.9 cudart_12.9 documentation_12.9 +- cmd: IF %NAME%==gpu-nvidia-cuda12 IF NOT EXIST C:\cache\cutlass-2.11.0 appveyor DownloadFile https://github.com/NVIDIA/cutlass/archive/refs/tags/v2.11.0.zip +- cmd: IF %NAME%==gpu-nvidia-cuda12 IF NOT EXIST C:\cache\cutlass-2.11.0 7z x v2.11.0.zip -oC:\cache\ +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==onnx appveyor DownloadFile https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe +- cmd: IF DEFINED CUDA_INSTALL IF %NAME%==onnx cuda_12.9.0_windows_network.exe -s nvcc_12.9 cudart_12.9 - cmd: IF %CUDA%==true set PATH=%CUDA_PATH%\bin;%PATH% +- cmd: IF %ONNX%==true set PATH=%CUDA_PATH%\bin;%PATH% - cmd: set PATH=C:\Python310;C:\Python310\scripts;%PATH% #- cmd: pip3 install --upgrade meson==0.55.3 - cmd: set MIMALLOC_PATH=C:\cache\mimalloc-1.8.7 @@ -108,10 +126,10 @@ cache: - C:\cache - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1' - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1' + - 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9 -> appveyor.yml' - C:\projects\lc0\subprojects\packagecache - 
C:\ndk\android-ndk-r27c\toolchains\llvm\prebuilt\windows-x86_64 before_build: -- cmd: git submodule update --init --recursive - cmd: IF %BLAS%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h - cmd: IF %ANDROID%==true (echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h - cmd: SET BUILD_BLAS=%BLAS% @@ -126,8 +144,9 @@ before_build: - cmd: IF %CUDA%==true SET F16C=false - cmd: SET EXTRA= - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md -- cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\runtimes\win-x64\native\ -Donnx_include=C:\cache\%ONNX_NAME%\build\native\include -- cmd: IF %ANDROID%==false meson build --backend vs2019 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% +- cmd: IF %ONNX%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\runtimes\win-x64\native\ -Donnx_include=C:\cache\%ONNX_NAME%\build\native\include -Ddefault_backend=onnx-trt -Dplain_cuda=false +- cmd: IF %NAME%==gpu-nvidia-cuda12 SET EXTRA=-Db_vscrt=md -Dcutlass=true -Dcutlass_include=C:\cache\cutlass-2.11.0\include +- cmd: IF %ANDROID%==false meson build --backend vs2019 --buildtype release -Dgtest=false -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dnative_cuda=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false build_script: @@ -139,7 +158,7 @@ after_build: - cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true call scripts\appveyor_android_package.cmd - cmd: cd C:\projects\lc0 artifacts: - - path: build/lc0.exe + - path: /build/lc0*.exe/ name: lc0-$(NAME) - path: arm64-v8a/lc0 name: lc0-android-arm64-v8a @@ -169,6 +188,7 @@ deploy: test_script: 
- cmd: IF %GTEST%==true cd build - cmd: IF %GTEST%==true xcopy /s /i C:\cache\syzygy syzygy +- cmd: IF %GTEST%==true IF %ONNX%==true copy %PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll - cmd: IF %GTEST%==true meson test --print-errorlogs - cmd: cd C:\projects\lc0 on_finish: diff --git a/build.cmd b/build.cmd index 071105b1f8..262b5ee00e 100644 --- a/build.cmd +++ b/build.cmd @@ -2,7 +2,7 @@ setlocal rem 1. Set the following for the options you want to build. -set CUDNN=true +set CUDNN=false set CUDA=true set DX12=false set OPENCL=false @@ -11,6 +11,7 @@ set DNNL=false set OPENBLAS=false set EIGEN=false set TEST=false +set CUTLASS=true if "%CUDA%"=="true" ( if not defined CUDA_PATH ( @@ -71,6 +72,7 @@ meson setup build --backend %backend% --buildtype release -Ddx=%DX12% -Dcudnn=%C -Dmkl_include="%MKL_PATH%\include" -Dmkl_libdirs="%MKL_PATH%\lib\intel64" -Ddnnl_dir="%DNNL_PATH%" ^ -Dopencl_libdirs="%OPENCL_LIB_PATH%" -Dopencl_include="%OPENCL_INCLUDE_PATH%" ^ -Dopenblas_include="%OPENBLAS_PATH%\include" -Dopenblas_libdirs="%OPENBLAS_PATH%\lib" ^ +-Dcutlass="%CUTLASS%" ^ -Ddefault_library=static if errorlevel 1 exit /b @@ -80,4 +82,4 @@ pause cd build msbuild /m /p:Configuration=Release /p:Platform=x64 /p:WholeProgramOptimization=true ^ -/p:PreferredToolArchitecture=x64 lc0.sln /filelogger \ No newline at end of file +/p:PreferredToolArchitecture=x64 lc0.sln /filelogger diff --git a/build.sh b/build.sh index fa30e5c3df..8eb935c926 100755 --- a/build.sh +++ b/build.sh @@ -24,7 +24,7 @@ if [ -f "${BUILDDIR}/build.ninja" ] then "${MESON}" configure "${BUILDDIR}" -Dbuildtype="${BUILDTYPE}" -Dprefix="${INSTALL_PREFIX:-/usr/local}" "$@" else - "${MESON}" "${BUILDDIR}" --buildtype "${BUILDTYPE}" --prefix "${INSTALL_PREFIX:-/usr/local}" "$@" + "${MESON}" setup "${BUILDDIR}" --buildtype "${BUILDTYPE}" --prefix "${INSTALL_PREFIX:-/usr/local}" "$@" fi "${MESON}" compile -C "${BUILDDIR}" diff --git a/changelog.txt b/changelog.txt index 5d208674ac..cdfec68116 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,4 +1,101 @@ -v0.31.0-rc1 (2024-03-25) +v0.32.0 (2025-08-21) +~~~~~~~ +* Support for building with cuda 13. +* README update. +* Build system improvements. + +v0.32.0-rc2 (2025-08-12) +~~~~~~~ +* Fix for onnx-trt bug, where the wrong network could be used from the cache. +* Added code to detect RPE nets and give an error instead of bad results. +* Better instructions in the readme and install script for onnx-trt. +* Made `UCI_ShowWDL` again off by default again as some GUIs have issues. +* Fixed a long standing issue when compiled with `-ffast-math` (or `icx -O3`). +* Several improvements to the sycl backend. +* Several improvements to the metal backend. +* Refactored the rescorer code and training data header to make them usable by + external tools. +* Relaxed cuda/cudnn version checks so that no warnings are shown for mismatched + versions that are supported. +* Several build system updates. +* Assorted small fixes and improvements. + +v0.32.0-rc1 (2025-07-18) +~~~~~~~ +The code has been reorganized and undergone major changes. Therefore this +changelog will be less detailed and describe the changes in major groups. +* We have a new search API that allows search algorithms to co-exist. Currently + available are `classic` (the default), `dag-preview` (more later), + `valuehead` and `policyhead`. The default algorithm can be changed either at + build time by the `default_search` option or by renaming the executable to + include the algorithm name (e.g. lc0-valuehead). 
+* We also have a new backend interface that is chess-oriented and not tied to
+  the network architecture. The existing backends still use the old interface
+  through a wrapper.
+* The source code is reorganized, with a more logical directory structure.
+* The original search was ported to the new search and backend interfaces and
+  is renamed to `classic`. This has allowed some streamlining and
+  simplifications.
+* The `dag-preview` search is the DAG algorithm that lived in a separate branch
+  up to now. It hasn't been as well tested, which is why it has "preview" in its
+  name for now; it lives in the `src/search/dag_classic` directory.
+* The `valuehead` search replaces `ValueOnly` mode and selects the move with the
+  best value head evaluation.
+* The `policyhead` search is equivalent to a single-node search, selecting the
+  best move using just the policy head.
+* The new `default_backend` build option allows overriding the fixed priority
+  for the backend used by default.
+* The new `native_arch` build option can be used to override the `-march=native`
+  compiler default for Linux release builds, to help with distribution package creation.
+* We have a new `sycl` backend that will work with amd, intel and nvidia gpus.
+* There is also a new `onnx-trt` backend, using tensorrt on nvidia gpus.
+* Support for simple/normal/pro mode in the options was cleaned up, using a common
+  mechanism.
+* Added the `wait` uci extension command to allow running simple tests from the
+  command line.
+* Removed the `fen` uci extension command as it was unnecessarily complicating
+  things.
+* Some preliminary fp8 support was added for onnx and xla. This is not
+  functional, just there to make experimentation easier.
+* Several build system changes and improvements.
+* We now generate binaries for cuda 12, onnx-trt and macos.
+* Support for using lc0 with openbench.
+* New `bench` mode for a quicker benchmark.
+* Assorted small fixes and improvements.
+
+v0.31.2 (2024-10-20)
+~~~~~~~
+* Updated the WDL_mu centipawn fallback.
+* Fix for build issues with newer Linux c++ libraries.
+* Fix for an XLA Mish bug.
+* Minor README.md update.
+
+v0.31.1 (2024-08-11)
+~~~~~~~
+* Make WDL_mu score type work as intended.
+* Fix macos CI builds.
+
+v0.31.0 (2024-06-16)
+~~~~~~~
+* No changes from rc3.
+
+v0.31.0-rc3 (2024-05-29)
+~~~~~~~
+* The `WDLDrawRateTarget` option now accepts the value 0 (new default) to retain
+  raw WDL values if `WDLCalibrationElo` is set to 0 (default).
+* Improvements to the verbose move stats if `WDLEvalObjectivity` is used.
+* The centipawn score is displayed by default for old nets without WDL output.
+* Some build system improvements.
+
+v0.31.0-rc2 (2024-04-16)
+~~~~~~~
+* Changed cuda compilation options to use `-arch=native` or `-arch=all-major`
+  if no specific version is requested, with a fallback for older cuda versions
+  that don't support those options.
+* Updated android builds to use openblas 0.3.27.
+* A few small fixes.
+
+v0.31.0-rc1 (2024-03-25)
~~~~~~~
* The blas, cuda, eigen, metal and onnx backends now have support for multihead network architecture and can run BT3/BT4 nets.
@@ -39,6 +136,9 @@ natively higher draw rates.
* Made the WDL Rescale sharpness limit configurable via the `--wdl-max-s` hidden option.
+* The search task workers can be set automatically, to either 0 for cpu backends
+  or up to 4 depending on the number of cpu cores. This is enabled by
+  `--task-workers=-1` (the new default).
* Several assorted fixes and code cleanups.
v0.30.0 (2023-07-21)
diff --git a/dist/README-onnx-trt.txt b/dist/README-onnx-trt.txt new file mode 100644 index 0000000000..8a50b2689e --- /dev/null +++ b/dist/README-onnx-trt.txt @@ -0,0 +1,88 @@
+# Lc0
+
+Lc0 is a UCI-compliant chess engine designed to play chess via
+neural network, specifically those of the LeelaChessZero project
+(https://lczero.org).
+
+# Installation
+
+Summary: run `install.cmd` and follow the instructions.
+
+To run this version you will also need several dll files from NVIDIA's
+CUDA, cuDNN and TensorRT. Those dlls can either be on the system path
+from a separate installation of these libraries, or can be placed
+directly in the Lc0 folder. Either way, you will get an error message
+for any that aren't found.
+
+The dlls needed are the following:
+
+1. CUDA
+* cublas64_12.dll
+* cublasLt64_12.dll
+* cudart64_12.dll
+* cufft64_11.dll
+
+2. cuDNN
+* cudnn64_9.dll
+* cudnn_graph64_9.dll
+
+3. TensorRT
+* nvinfer_10.dll
+* nvinfer_builder_resource_10.dll
+* nvinfer_plugin_10.dll
+* nvonnxparser_10.dll
+
+The install.cmd script included in this package will download the
+CUDA and cuDNN files needed and will open the TensorRT download page
+using your browser. If it fails, you can download the files manually
+using the following addresses; the dlls are in the `bin` directory
+in the CUDA/cuDNN zips and the `lib` directory in the TensorRT zip.
+
+* https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.79-archive.zip
+* https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.1.4-archive.zip
+* https://developer.download.nvidia.com/compute/cuda/redist/libcufft/windows-x86_64/libcufft-windows-x86_64-11.4.1.4-archive.zip
+* https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.11.0.98_cuda12-archive.zip
+* https://developer.nvidia.com/tensorrt/download/10x#trt1012
+
+The TensorRT link will take you to the download page; after
+registering, go to the "TensorRT 10.12 GA for x86_64 Architecture"
+section and get the "TensorRT 10.12 GA for Windows 10, 11,
+Server 2022 and CUDA 12.0 to 12.9 ZIP Package".
+
+Finally, if Lc0 still won't run, get the latest Visual C++
+redistributable from: https://aka.ms/vs/17/release/vc_redist.x64.exe
+
+# Running
+
+When running Lc0 with a new network file, it will take some time to
+create the optimized model to use. This is normal. The model will be
+cached for future runs in the `trt_cache` folder, so next time it will
+be faster. If you want to experiment, you can rename the `trt_cache`
+folder and rerun; sometimes TensorRT will generate a different model
+that may be faster. Moreover, if you are having issues, you can
+delete/rename the cache and rerun.
+
+# License
+
+Leela Chess is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+Leela Chess is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with Leela Chess. If not, see .
+
+Additional permission under GNU GPL version 3 section 7
+
+If you modify this Program, or any covered work, by linking or
+combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+modified version of those libraries), containing parts covered by the
+terms of the respective license agreement, the licensors of this
+Program grant you additional permission to convey the resulting work.
+
diff --git a/dist/install-cuda_12_9.cmd b/dist/install-cuda_12_9.cmd new file mode 100644 index 0000000000..c5a253093b --- /dev/null +++ b/dist/install-cuda_12_9.cmd @@ -0,0 +1,43 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+echo Installing the CUDA dlls required by the Lc0 cuda backend.
+
+echo 1/4. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.37-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/4. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/bin/cudart64_12.dll >cudart64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.37-archive/LICENSE >CUDA.txt
+
+del /q tmp_cudart.zip
+
+echo 3/4. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.0.13-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/4. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublas64_12.dll >cublas64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.0.13-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll
+if errorlevel 1 goto error
+
+del /q tmp_cublas.zip
+
+echo Installation successful.
+pause
+exit /b
+
+:error
+cls
+echo Installation failed - you will have to download cuda 12.9 yourself.
+pause
+
diff --git a/dist/install-trt.cmd b/dist/install-trt.cmd new file mode 100644 index 0000000000..3538c30b66 --- /dev/null +++ b/dist/install-trt.cmd @@ -0,0 +1,99 @@
+@echo off
+where /q tar
+if errorlevel 1 goto error
+
+cd /d %~dp0
+
+cls
+
+echo This script will download and install the CUDA/cuDNN/tensorRT dlls required by the Lc0 onnx-trt backend.
+echo(
+echo If you are using a metered internet connection, be aware that the download will be around 3 GB.
+echo(
+pause
+
+echo Installing the CUDA dlls required by the Lc0 onnx-trt backend.
+
+echo 1/6. Downloading cudart.
+curl -# --ssl-no-revoke -o tmp_cudart.zip https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.9.79-archive.zip"
+if errorlevel 1 goto error
+
+echo 2/6. Extracting files.
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.79-archive/bin/cudart64_12.dll >cudart64_12.dll
+if errorlevel 1 goto error
+
+tar -xzOf tmp_cudart.zip cuda_cudart-windows-x86_64-12.9.79-archive/LICENSE >CUDA.txt
+
+del /q tmp_cudart.zip
+
+echo 3/6. Downloading cublas.
+curl -# --ssl-no-revoke -o tmp_cublas.zip https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.9.1.4-archive.zip"
+if errorlevel 1 goto error
+
+echo 4/6. Extracting files.
+tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.1.4-archive/bin/cublas64_12.dll >cublas64_12.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cublas.zip libcublas-windows-x86_64-12.9.1.4-archive/bin/cublasLt64_12.dll >cublasLt64_12.dll +if errorlevel 1 goto error + +del /q tmp_cublas.zip + +echo 5/6. Downloading cufft. +curl -# --ssl-no-revoke -o tmp_cufft.zip https://developer.download.nvidia.com/compute/cuda/redist/libcufft/windows-x86_64/libcufft-windows-x86_64-11.4.1.4-archive.zip" +if errorlevel 1 goto error + +echo 6/6. Extracting files. +tar -xzOf tmp_cufft.zip libcufft-windows-x86_64-11.4.1.4-archive/bin/cufft64_11.dll >cufft64_11.dll +if errorlevel 1 goto error + +del /q tmp_cufft.zip + +echo Installing the cuDNN dlls required by the Lc0 onnx-trt backend. + +echo 1/2. Downloading cudnn. +curl -# --ssl-no-revoke -o tmp_cudnn.zip https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.11.0.98_cuda12-archive.zip" +if errorlevel 1 goto error + +echo 2/2. Extracting files. +tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/bin/cudnn64_9.dll >cudnn64_9.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/bin/cudnn_graph64_9.dll >cudnn_graph64_9.dll +if errorlevel 1 goto error + +tar -xzOf tmp_cudnn.zip cudnn-windows-x86_64-9.11.0.98_cuda12-archive/LICENSE >CUDNN.txt + +del /q tmp_cudnn.zip + +echo Installing the tensorRT dlls required by the Lc0 onnx-trt backend. + +echo 1/2. Downloading tensorRT. +curl -# --ssl-no-revoke -o tmp_tensorrt.zip https://developer.download.nvidia.com/compute/machine-learning/tensorrt/10.12.0/zip/TensorRT-10.12.0.36.Windows.win10.cuda-12.9.zip" +if errorlevel 1 goto error + +echo 2/2. Extracting files. +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_10.dll >nvinfer_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_builder_resource_10.dll >nvinfer_builder_resource_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvinfer_plugin_10.dll >nvinfer_plugin_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/lib/nvonnxparser_10.dll >nvonnxparser_10.dll +if errorlevel 1 goto error + +tar -xzOf tmp_tensorrt.zip TensorRT-10.12.0.36/doc/Readme.txt >TENSORRT.txt + +del /q tmp_tensorrt.zip + +pause +exit /b + +:error +cls +echo Installation failed - see the README for alternative download instructions. +pause + diff --git a/libs/lczero-common b/libs/lczero-common deleted file mode 160000 index 55e1b382ef..0000000000 --- a/libs/lczero-common +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 55e1b382efadd57903e37f2a2e29caef3ea85799 diff --git a/meson.build b/meson.build index fe10b0e977..63613fb618 100644 --- a/meson.build +++ b/meson.build @@ -16,21 +16,11 @@ project('lc0', 'cpp', default_options : ['cpp_std=c++20', 'b_ndebug=if-release', 'warning_level=3', 'b_lto=true', 'b_vscrt=mt'], - meson_version: '>=0.55') + meson_version: '>=0.57') -cc = meson.get_compiler('cpp') - -if not cc.has_header('optional') or not cc.has_header('string_view') - error('Lc0 requires a compiler supporting C++17, for example g++ v8.0, ' + - 'clang v5.0 or later (with C++17 stdlib) and Visual Studio 2017 or ' + - 'later.') -endif +fs = import('fs') -if not cc.has_header('charconv') - warning('Your compiler or library does not have full C++17 support. ' + - 'See the README for compilers that are known to be working. 
' + - 'This will become an error in the future.') -endif +cc = meson.get_compiler('cpp') if cc.get_id() == 'clang' # Thread safety annotation @@ -38,7 +28,9 @@ if cc.get_id() == 'clang' endif if cc.get_id() != 'msvc' if get_option('buildtype') == 'release' - add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp') + if get_option('native_arch') + add_project_arguments(cc.get_supported_arguments(['-march=native']), language : 'cpp') + endif endif endif if cc.get_id() == 'msvc' @@ -70,37 +62,19 @@ includes += include_directories('third_party', is_system: true) compile_proto = find_program('scripts/compile_proto.py') gen = generator(compile_proto, output: ['@BASENAME@.pb.h'], arguments : [ - '--proto_path=@CURRENT_SOURCE_DIR@/libs/lczero-common', + '--proto_path=@CURRENT_SOURCE_DIR@', '--cpp_out=@BUILD_DIR@', '@INPUT@']) -# Handle submodules. -git = find_program('git', required: false) -if run_command('scripts/checkdir.py', 'libs/lczero-common/proto', check : false).returncode() != 0 - if git.found() - if run_command(git, 'status', check : false).returncode() == 0 - message('updating git submodule libs/lczero-common') - run_command(git, 'submodule', 'update', '--init', '--recursive', check : false) - else - message('cloning lczero-common.git into libs/lczero-common') - run_command(git, 'clone', '--depth=1', - 'https://github.com/LeelaChessZero/lczero-common.git', - 'libs/lczero-common/', check : false) - endif - else - error('Please install git to automatically fetch submodules or download the archives manually from GitHub.') - endif -endif - pb_files = [ 'src/utils/protomessage.cc', - gen.process('libs/lczero-common/proto/net.proto', - preserve_path_from : meson.current_source_dir() + '/libs/lczero-common/') + gen.process('proto/net.proto', preserve_path_from : meson.current_source_dir()) ] common_files += pb_files # Extract git short revision. short_rev = 'unknown' +git = find_program('git', required: false) if git.found() r = run_command(git, 'rev-parse', '--short', 'HEAD', check : false) if r.returncode() == 0 @@ -142,17 +116,11 @@ elif get_option('malloc') != '' endif # ONNX and HLO protobufs. 
-gen_proto_src = generator(compile_proto, output: ['@BASENAME@.pb.h'], - arguments : [ - '--proto_path=@CURRENT_SOURCE_DIR@/src', - '--cpp_out=@BUILD_DIR@', - '@INPUT@']) - -files += gen_proto_src.process('src/neural/onnx/onnx.proto', - preserve_path_from : meson.current_source_dir() + '/src/') +files += gen.process('proto/onnx.proto', + preserve_path_from : meson.current_source_dir()) -files += gen_proto_src.process('src/neural/xla/hlo.proto', - preserve_path_from : meson.current_source_dir() + '/src/') +files += gen.process('proto/hlo.proto', + preserve_path_from : meson.current_source_dir()) ############################################################################# ## Main files @@ -239,6 +207,11 @@ files += [ includes += include_directories('src') +deps += dependency('absl_flat_hash_map', + include_type: 'system', + fallback: ['abseil-cpp', 'absl_container_dep'], + default_options : ['warning_level=0', 'cpp_std=c++20']) + deps += dependency('threads') ############################################################################# @@ -259,9 +232,6 @@ if get_option('dag_classic') 'src/search/dag_classic/search.cc', 'src/search/dag_classic/wrapper.cc', ] - - absl = subproject('abseil-cpp', default_options : ['warning_level=0', 'cpp_std=c++20']) - deps += absl.get_variable('absl_container_dep').as_system() endif ############################################################################# @@ -348,7 +318,13 @@ if get_option('build_backends') endif - deps += dependency('eigen3', fallback: ['eigen', 'eigen_dep']).as_system() + eigen_dep = dependency('eigen3') + # Check for needed header, bad dependency seen in the widl. + if eigen_dep.found() and cc.has_header('Eigen/Core', dependencies: eigen_dep) + deps += eigen_dep.as_system() + else + deps += subproject('eigen').get_variable('eigen_dep').as_system() + endif ispc = find_program('ispc', required: false) ispc_arch = 'x86-64' @@ -476,48 +452,45 @@ if get_option('build_backends') ## cuDNN ## ~~~~~ cudnn_libdirs = get_option('cudnn_libdirs') + nvcc_paths = [] + foreach p : cudnn_libdirs + nvcc_paths += fs.parent(p) + '/bin/nvcc' + endforeach + nvcc_paths += ['nvcc', '/usr/local/cuda/bin/nvcc', '/opt/cuda/bin/nvcc'] + message('Looking for nvcc in: ' + ', '.join(nvcc_paths)) cu_blas = cc.find_library('cublas', dirs: cudnn_libdirs, required: false) cu_dnn = cc.find_library('cudnn', dirs: cudnn_libdirs, required: false) cu_dart = cc.find_library('cudart', dirs: cudnn_libdirs, required: false) - nvcc = find_program('nvcc', '/usr/local/cuda/bin/nvcc', '/opt/cuda/bin/nvcc', + nvcc = find_program(nvcc_paths, required: false) - - if (get_option('cudnn') or get_option('plain_cuda')) and cu_blas.found() and cu_dart.found() and nvcc.found() - deps += [cu_blas, cu_dart] - cuda_files = ['src/neural/backends/cuda/layers.cc'] - if get_option('cudnn') and cu_dnn.found() - deps += cu_dnn - cuda_files += 'src/neural/backends/cuda/network_cudnn.cc' - cuda_files += 'src/neural/backends/cuda/network_cuda.cc' # To support newer nets. 
- add_project_arguments('-DUSE_CUDNN', language : 'cpp') - elif get_option('plain_cuda') - cuda_files += 'src/neural/backends/cuda/network_cuda.cc' - endif + nvcc_ok = false + if get_option('nvcc') and nvcc.found() foreach d : get_option('cudnn_include') if run_command('scripts/checkdir.py', d, check : false).returncode() == 0 includes += include_directories(d, is_system: true) endif endforeach - includes += include_directories('src/neural/backends/cuda/') - - cuda_arguments = ['-c', '@INPUT@', '-o', '@OUTPUT@', + nvcc_arguments = ['-c', '@INPUT@', '-o', '@OUTPUT@', '-I', meson.current_source_dir() + '/src'] nvcc_help = run_command(nvcc, '-h', check : false).stdout() if host_machine.system() == 'windows' if get_option('b_vscrt') == 'mt' - cuda_arguments += ['-Xcompiler', '-MT'] + nvcc_arguments += ['-Xcompiler', '-MT'] elif get_option('b_vscrt') == 'mtd' - cuda_arguments += ['-Xcompiler', '-MTd'] + nvcc_arguments += ['-Xcompiler', '-MTd'] elif get_option('b_vscrt') == 'mdd' or (get_option('b_vscrt') == 'from_buildtype' and get_option('buildtype') == 'debug') - cuda_arguments += ['-Xcompiler', '-MDd'] + nvcc_arguments += ['-Xcompiler', '-MDd'] elif get_option('b_vscrt') != 'none' - cuda_arguments += ['-Xcompiler', '-MD'] + nvcc_arguments += ['-Xcompiler', '-MD'] endif else - cuda_arguments += ['--std=c++14', '-Xcompiler', '-fPIC'] + nvcc_arguments += ['--std=c++17', '-Xcompiler', '-fPIC'] + if get_option('debug') + nvcc_arguments += ['-g'] + endif endif if get_option('nvcc_ccbin') != '' - cuda_arguments += ['-ccbin=' + get_option('nvcc_ccbin')] + nvcc_arguments += ['-ccbin=' + get_option('nvcc_ccbin')] endif cuda_cc = get_option('cc_cuda') # Unfortunately option cuda_cc is reserved. nvcc_extra_args = [] @@ -543,26 +516,68 @@ if get_option('build_backends') endif endif foreach x : get_option('cudnn_include') - cuda_arguments += ['-I', x] + nvcc_arguments += ['-I', x] endforeach if host_machine.system() == 'windows' outputname = '@BASENAME@.obj' else outputname = '@BASENAME@.o' endif + nvcc_ok = true + + max_cuda = 0 + nvcc_dryrun = run_command(nvcc, '--dryrun', nvcc_extra_args, 'foo.cu', check : false).stderr() + foreach x : nvcc_dryrun.split() + if x.contains('-D__CUDA_ARCH__=') + arch = x.substring(16).to_int() + if arch > max_cuda + max_cuda = arch + endif + endif + endforeach + endif + if (get_option('cudnn') or get_option('plain_cuda')) and cu_dart.found() and cu_blas.found() and nvcc_ok + deps += [cu_blas, cu_dart] + cuda_files = ['src/neural/backends/cuda/layers.cc'] + if get_option('cudnn') and cu_dnn.found() + deps += cu_dnn + cuda_files += 'src/neural/backends/cuda/network_cudnn.cc' + cuda_files += 'src/neural/backends/cuda/network_cuda.cc' # To support newer nets. 
+ add_project_arguments('-DUSE_CUDNN', language : 'cpp') + elif get_option('plain_cuda') + cuda_files += 'src/neural/backends/cuda/network_cuda.cc' + endif + includes += include_directories('src/neural/backends/cuda/') files += cuda_files + + if get_option('cutlass') and max_cuda >= 800 + add_project_arguments('-DUSE_CUTLASS', language : 'cpp') + nvcc_arguments += ['-DUSE_CUTLASS'] + if get_option('cutlass_include') != '' + nvcc_arguments += ['-I', get_option('cutlass_include')] + else + nvcc_arguments += ['-I', subproject('cutlass').get_variable('include_directory')] + endif + nvcc_arguments += ['-isystem=@CURRENT_SOURCE_DIR@/third_party'] + files += custom_target('cuda cutlass code', + input : 'src/neural/backends/cuda/cutlass_kernels.cu', + output : outputname, + command : [nvcc, nvcc_extra_args, nvcc_arguments] + ) + endif + files += custom_target('cuda fp32 code', input : 'src/neural/backends/cuda/common_kernels.cu', output : outputname, depend_files: 'src/neural/backends/cuda/winograd_helper.inc', - command : [nvcc, nvcc_extra_args, cuda_arguments] + command : [nvcc, nvcc_extra_args, nvcc_arguments] ) files += custom_target('cuda fp16 code', input : 'src/neural/backends/cuda/fp16_kernels.cu', output : outputname, depend_files: 'src/neural/backends/cuda/winograd_helper.inc', - command : [nvcc, nvcc_extra_args, cuda_arguments] + command : [nvcc, nvcc_extra_args, nvcc_arguments] ) has_backends = true endif @@ -602,24 +617,47 @@ if get_option('build_backends') ## ~~~~~~~~~~ ## ONNX ## ~~~~~~~~~~ - if get_option('onnx_libdir') != '' and get_option('onnx_include') != '' - deps += cc.find_library('onnxruntime', dirs: get_option('onnx_libdir'), - required: true) - includes += include_directories(get_option('onnx_include'), is_system: true) + onnxruntime = cc.find_library('onnxruntime', dirs: get_option('onnx_libdir'), + required: false) + if get_option('onnx') and onnxruntime.found() + deps += onnxruntime + onnx_inc_dir = get_option('onnx_include') + if fs.is_dir(onnx_inc_dir + '/onnxruntime/core/session') + # Top level of source dir. + onnx_inc_dir += '/onnxruntime/core/session' + elif fs.is_dir(onnx_inc_dir + '/onnxruntime') + onnx_inc_dir += '/onnxruntime' + endif + includes += include_directories(onnx_inc_dir, is_system: true) cc.has_header('onnxruntime_cxx_api.h', required: true, - args: '-I' + get_option('onnx_include')) - if not cc.has_header('cpu_provider_factory.h', - args: '-I' + get_option('onnx_include')) - cc.has_header('../providers/cpu/cpu_provider_factory.h', required: true, - args: '-I' + get_option('onnx_include')) - includes += include_directories(get_option('onnx_include') + '/../providers/cpu', - is_system: true) + include_directories: includes) + files += 'src/neural/backends/onnx/network_onnx.cc' + onnx_conf = configuration_data() + if cc.has_header('dml_provider_factory.h', required: false, + include_directories: includes) + # The header is not actually needed, used here to detect DML onnxruntime. 
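Note on the onnx_include handling a few lines above: the option may point directly at the directory containing onnxruntime_cxx_api.h, at an installed prefix with an onnxruntime/ subdirectory, or at the top level of an onnxruntime source checkout. A rough Python sketch of that resolution order (illustration only; the helper name resolve_onnx_include is hypothetical, and the build does this with fs.is_dir in Meson):

import os

def resolve_onnx_include(onnx_include: str) -> str:
    # Prefer the source-tree layout, then the installed-prefix layout,
    # and otherwise trust the directory as given.
    for sub in ("onnxruntime/core/session", "onnxruntime"):
        candidate = os.path.join(onnx_include, sub)
        if os.path.isdir(candidate):
            return candidate
    return onnx_include

# e.g. resolve_onnx_include("/usr/include/onnxruntime") or an onnxruntime source checkout path.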
+ onnx_conf.set('USE_DML', true) endif - files += 'src/neural/backends/network_onnx.cc' if cc.find_library('onnxruntime_providers_rocm', dirs: get_option('onnx_libdir'), required: false).found() - add_project_arguments('-DUSE_ROCM', language : 'cpp') + onnx_conf.set('USE_ROCM', true) + endif + if cc.find_library('onnxruntime_providers_migraphx', + dirs: get_option('onnx_libdir'), required: false).found() + onnx_conf.set('USE_MIGRAPHX', true) endif + if cu_dart.found() and nvcc_ok + onnx_conf.set('USE_ONNX_CUDART', true) + deps += cu_dart + files += custom_target('cuda onnx code', + input : 'src/neural/backends/onnx/onnx_kernels.cu', + output : outputname, + command : [nvcc, nvcc_extra_args, nvcc_arguments] + ) + else + warning('No CUDA support available. Using compatibility implementation for onnx-trt and onnx-cuda.') + endif + configure_file(output : 'onnx_conf.h', configuration : onnx_conf) has_backends = true endif @@ -632,7 +670,7 @@ if get_option('build_backends') modules : ['Foundation', 'Metal', 'MetalPerformanceShaders', 'MetalPerformanceShadersGraph'], required: get_option('metal')) - if (metal_frameworks.found() and add_languages('objc', 'objcpp')) + if metal_frameworks.found() and add_languages('objc', 'objcpp', native: false) deps += metal_frameworks files += [ @@ -644,6 +682,13 @@ if get_option('build_backends') has_backends = true add_project_arguments('-fobjc-arc', language : 'objc') add_project_arguments('-fobjc-arc', language : 'objcpp') + + # Minimum MacOS version = 12.6.1 + macos_min_version = '12.6' + add_project_arguments( + '-mmacosx-version-min=' + macos_min_version, + language: ['c', 'cpp', 'objc', 'objcpp'] + ) endif ## ~~~~~~~~ @@ -682,15 +727,53 @@ if get_option('build_backends') deps += cc.find_library('mkl_core', required: true) deps += cc.find_library('OpenCL', required: true) elif get_option('sycl') == 'amd' - deps += cc.find_library('hipblas', required: true) - deps += cc.find_library('amdhip64', required: true) + hip_libdirs = get_option('hip_libdirs') + hip_args = [] + foreach hip_include : get_option('hip_include') + if run_command('scripts/checkdir.py', hip_include, check : false).returncode() == 0 + includes += include_directories(hip_include, is_system: true) + hip_args += '-I' + hip_include + endif + endforeach + deps += cc.find_library('hipblas', dirs: hip_libdirs, required: true) + cc.has_header('hipblas/hipblas.h', required: true, args: hip_args) + deps += cc.find_library('amdhip64', dirs: hip_libdirs, required: true) + cc.has_header('hip/hip_runtime.h', required: true, args: hip_args) add_project_arguments('-DUSE_HIPBLAS=ON', language : 'cpp') add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp') - if get_option('amd_gfx') == '' - error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. 90a, 1100 or similar)') + amd_gfx = get_option('amd_gfx') + if amd_gfx == '' + amd_gfx = [] + agent_enum = find_program('rocm_agent_enumerator', '/opt/rocm/bin/rocm_agent_enumerator', + required: false) + if not agent_enum.found() + warning( '\'rocm_agent_enumerator\' not found. AMD GPU detection doesn\'t work. 
You can install rocminfo or set -Damd_gfx.') + elif meson.version().version_compare('<1.2.0') + warning( 'Automatic AMD GPU detection requires Meson 1.2.0') + else + agents = run_command(agent_enum, check : false).stdout() + agent_list = agents.splitlines() + foreach agent : agent_list + if agent.startswith('gfx') + amd_gfx += 'amd_gpu_' + agent + else + error( '\'' + agent_enum.full_path() + '\' unexpected output: ' + agent) + endif + endforeach + if amd_gfx.length() == 0 + warning( '\'' + agent_enum.full_path() + '\' failed to detect any AMD GPUs in the system.') + else + message( 'Detected AMD GPU cores: ' + ','.join(amd_gfx)) + endif + endif + else + amd_gfx = ['amd_gpu_' + amd_gfx] + endif + if amd_gfx.length() == 0 + error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. gfx90a, gfx1100 or similar)') endif - add_project_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp') - add_project_link_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp') + add_project_arguments('-fsycl-targets=' + ','.join(amd_gfx), language : 'cpp') + add_project_link_arguments('-fsycl-targets=' + ','.join(amd_gfx), language : 'cpp') else deps += cc.find_library('cublas', required: true) deps += cc.find_library('cudart', required: true) @@ -707,6 +790,7 @@ if get_option('build_backends') # For sycl under windows we need to link using icx to generate the device code. # This script edits build.ninja for this and for an icx dependency issue. meson.add_postconf_script('scripts/sycl_build_hack.py') + add_project_link_arguments('-rtlib=compiler-rt', language : 'cpp') endif endif @@ -737,15 +821,53 @@ endif deps += dependency('zlib', fallback: ['zlib', 'zlib_dep']) endif + trace_lib = get_option('trace_library') + trace_config = configuration_data() + + common_files += 'src/utils/trace.cc' + ## ~~~~~~~~ + ## perfetto + ## ~~~~~~~~ + if trace_lib == 'perfetto' + perfetto_dep = dependency('perfetto', required: true, + fallback: ['perfetto', 'dep_perfetto']) + deps += perfetto_dep + trace_config.set('USE_PERFETTO_TRACE', 1) + endif + + ## ~~~~ + ## nvtx + ## ~~~~ + if trace_lib == 'nvtx' + nvtx_includes = get_option('cudnn_include') + nvtx_header_found = false + foreach d : nvtx_includes + if run_command('scripts/checkdir.py', d, check : false).returncode() == 0 + if cc.has_header('nvtx3/nvtx3.hpp', args: '-I' + d) + includes += include_directories(d) + nvtx_header_found = true + break + endif + endif + endforeach + if not nvtx_header_found + error('nvtx3/nvtx3.hpp header not found in cudnn_include paths') + endif + # This could support other tracing apis like systemtap. 
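Note, returning to the SYCL/AMD changes earlier in this hunk: when -Damd_gfx is not given, the build now shells out to rocm_agent_enumerator (shipped with rocminfo) and turns every reported gfx* agent into an amd_gpu_gfx* entry of the -fsycl-targets= flag. A rough standalone Python equivalent (the function name detect_sycl_amd_targets is illustrative; the /opt/rocm fallback path mirrors the Meson code):

import shutil
import subprocess

def detect_sycl_amd_targets():
    tool = shutil.which("rocm_agent_enumerator") or "/opt/rocm/bin/rocm_agent_enumerator"
    out = subprocess.run([tool], capture_output=True, text=True).stdout
    gfx = ["amd_gpu_" + agent for agent in out.split() if agent.startswith("gfx")]
    # An empty list means no AMD GPU was detected; the build then errors out
    # and asks for an explicit -Damd_gfx value.
    return "-fsycl-targets=" + ",".join(gfx) if gfx else None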
+ trace_config.set('USE_NVTX_TRACE', 1) + endif + configure_file(output : 'trace_config.h', + configuration : trace_config) + ## ~~~~~~~~ ## Profiler ## ~~~~~~~~ if get_option('buildtype') != 'release' - deps += cc.find_library('libprofiler', + deps += cc.find_library('profiler', dirs: ['/usr/local/lib'], required: false) endif - deps += cc.find_library('libatomic', required: false) + deps += cc.find_library('atomic', required: false) ############################################################################# ## Main Executable @@ -759,6 +881,10 @@ if not get_option('f16c') add_project_arguments('-DNO_F16C', language : 'cpp') endif +if cc.has_type('_Float16') + add_project_arguments('-DHAS_FLOAT16', language : 'cpp') +endif + if not get_option('pext') add_project_arguments('-DNO_PEXT', language : 'cpp') endif @@ -767,10 +893,19 @@ if get_option('embed') add_project_arguments('-DEMBED', language : 'cpp') endif +default_search_h = configuration_data() if get_option('default_search') != '' - add_project_arguments('-DDEFAULT_SEARCH=' + - get_option('default_search'), language : 'cpp') + default_search_h.set_quoted('DEFAULT_SEARCH', get_option('default_search')) +endif +configure_file(output : 'default_search.h', + configuration : default_search_h) + +default_backend_h = configuration_data() +if get_option('default_backend') != '' + default_backend_h.set_quoted('DEFAULT_BACKEND', get_option('default_backend')) endif +configure_file(output : 'default_backend.h', + configuration : default_backend_h) if get_option('lc0') files += common_files @@ -783,10 +918,10 @@ endif ############################################################################# if get_option('rescorer') - deps += subproject('gaviotatb').get_variable('gaviotatb_dep') + gaviota_dep = subproject('gaviotatb').get_variable('gaviotatb_dep') executable('rescorer', 'src/rescorer_main.cc', [common_files, 'src/trainingdata/rescorer.cc'], - include_directories: includes, dependencies: deps, install: true) + include_directories: includes, dependencies: [deps, gaviota_dep], install: true) endif ############################################################################# @@ -796,13 +931,18 @@ endif if get_option('gtest') gtest = dependency('gtest', fallback: ['gtest', 'gtest_dep']) gmock = dependency('gmock', fallback: ['gtest', 'gmock_dep']) - lc0_lib = library('lc0_lib', files, include_directories: includes, dependencies: deps) + lc0_lib = library('lc0_lib', common_files, include_directories: includes, dependencies: deps) test('ChessBoard', executable('chessboard_test', 'src/chess/board_test.cc', include_directories: includes, link_with: lc0_lib, dependencies: gtest ), args: '--gtest_output=xml:chessboard.xml', timeout: 90) + test('FP16', + executable('fp16_test', 'src/utils/fp16_utils_test.cc', + include_directories: includes, link_with: lc0_lib, dependencies: gtest + ), args: '--gtest_output=xml:fp16.xml', timeout: 90) + test('HashCat', executable('hashcat_test', 'src/utils/hashcat_test.cc', include_directories: includes, link_with: lc0_lib, dependencies: gtest @@ -830,7 +970,8 @@ if get_option('gtest') ), args: '--gtest_output=xml:encoder.xml', timeout: 90) test('EngineTest', - executable('engine_test', 'src/engine_test.cc', pb_files, + executable('engine_test', 'src/engine_test.cc', 'src/engine.cc', + 'src/neural/memcache.cc', pb_files, include_directories: includes, link_with: lc0_lib, dependencies: [gtest, gmock]), args: '--gtest_output=xml:engine_test.xml', timeout: 90) endif diff --git a/meson_options.txt b/meson_options.txt 
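Note, before the meson_options.txt changes: several compile-time settings that used to be passed as bare -D arguments (the default search, the default backend, the ONNX provider selection, the trace backend) are now emitted into small generated headers (onnx_conf.h, trace_config.h, default_search.h, default_backend.h) via configuration_data() plus configure_file(). A rough Python emulation of what such a template-less configure_file call produces (illustration only; write_config_header is a hypothetical helper and Meson's exact header formatting may differ):

def write_config_header(path, entries):
    lines = ["/* Illustration of a Meson-generated configuration header. */"]
    for name, value in entries.items():
        if value is True:
            lines.append(f"#define {name}")            # conf.set(name, true)
        elif value is False:
            lines.append(f"#undef {name}")
        elif isinstance(value, str):
            lines.append(f'#define {name} "{value}"')  # conf.set_quoted(name, value)
        else:
            lines.append(f"#define {name} {value}")    # conf.set(name, 1)
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")

# e.g. configuring with -Ddefault_backend=onnx-trt roughly corresponds to:
# write_config_header("default_backend.h", {"DEFAULT_BACKEND": "onnx-trt"})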
index 6f941d0c42..ec5c53917a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -43,6 +43,11 @@ option('cudnn_include', value: ['/opt/cuda/include/', '/usr/local/cuda/include/', '/usr/lib/cuda/include/'], description: 'Paths to cudnn include directory') +option('cutlass_include', + type: 'string', + value: '', + description: 'Paths to cutlass include directory') + option('build_backends', type: 'boolean', value: true, @@ -68,6 +73,11 @@ option('native_cuda', value: true, description: 'build cuda code for native arch only (if supported)') +option('native_arch', + type: 'boolean', + value: true, + description: 'build code for native arch only') + option('cudnn', type: 'boolean', value: false, @@ -78,6 +88,11 @@ option('plain_cuda', value: true, description: 'Enable CUDA backend') +option('cutlass', + type: 'boolean', + value: true, + description: 'Enable cutlass lib for cuda backend. Only supports Ampere+ right now') + option('opencl', type: 'boolean', value: false, @@ -181,16 +196,21 @@ option('cc_cuda', option('amd_gfx', type: 'string', value: '', - description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=90a for gfx90a') + description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=gfx90a for gfx90a') + +option('onnx', + type: 'boolean', + value: true, + description: 'Enable ONNX backends') option('onnx_libdir', type: 'string', - value: '', + value: '/usr/lib/', description: 'Paths to ONNX runtime libraries') option('onnx_include', type: 'string', - value: '', + value: '/usr/include/onnxruntime/', description: 'Paths to ONNX runtime includes') option('xla', @@ -204,6 +224,22 @@ option('sycl', value: 'off', description: 'Enable SYCL backend') +option('hip_libdirs', + type: 'array', + value: ['/opt/rocm/lib'], + description: 'Paths to AMD HIP libraries') + +option('hip_include', + type: 'array', + value: ['/opt/rocm/include'], + description: 'Path to AMD HIP includes') + +option('trace_library', + type: 'combo', + choices: ['off', 'perfetto', 'nvtx'], + value: 'off', + description: 'Enable trace library support') + option('lc0', type: 'boolean', value: true, @@ -219,7 +255,17 @@ option('default_search', value: '', description: 'Default search algorithm to use, e.g. -Ddefault_search=classic') +option('default_backend', + type: 'string', + value: '', + description: 'Default backend to use, e.g. -Ddefault_backend=onnx-trt') + option('dag_classic', type: 'boolean', value: true, description: 'Enable dag-classic search algorithm') + +option('nvcc', + type: 'boolean', + value: true, + description: 'Use nvcc: required for cuda, optional for onnx') diff --git a/src/neural/xla/hlo.proto b/proto/hlo.proto similarity index 100% rename from src/neural/xla/hlo.proto rename to proto/hlo.proto diff --git a/proto/net.proto b/proto/net.proto new file mode 100644 index 0000000000..961a73992a --- /dev/null +++ b/proto/net.proto @@ -0,0 +1,411 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2018 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ +syntax = "proto2"; + +package pblczero; + +message EngineVersion { + optional uint32 major = 1; + optional uint32 minor = 2; + optional uint32 patch = 3; +} + +message Weights { + message Layer { + optional float min_val = 1; + optional float max_val = 2; + optional bytes params = 3; + enum Encoding { + UNKNOWN_ENCODING = 0; + LINEAR16 = 1; + FLOAT16 = 2; + BFLOAT16 = 3; + FLOAT32 = 4; + } + optional Encoding encoding = 4; + repeated uint32 dims = 5; + } + + message ConvBlock { + optional Layer weights = 1; + optional Layer biases = 2; + optional Layer bn_means = 3; + optional Layer bn_stddivs = 4; + optional Layer bn_gammas = 5; + optional Layer bn_betas = 6; + } + + message SEunit { + // Squeeze-excitation unit (https://arxiv.org/abs/1709.01507) + // weights and biases of the two fully connected layers. + optional Layer w1 = 1; + optional Layer b1 = 2; + optional Layer w2 = 3; + optional Layer b2 = 4; + } + + message Residual { + optional ConvBlock conv1 = 1; + optional ConvBlock conv2 = 2; + optional SEunit se = 3; + } + + message Smolgen { + // For NETWORK_ATTENTIONBODY_WITH_HEADFORMAT. + optional Layer compress = 1; + optional Layer dense1_w = 2; + optional Layer dense1_b = 3; + optional Layer ln1_gammas = 4; + optional Layer ln1_betas = 5; + optional Layer dense2_w = 6; + optional Layer dense2_b = 7; + optional Layer ln2_gammas = 8; + optional Layer ln2_betas = 9; + } + + message MHA { + optional Layer q_w = 1; + optional Layer q_b = 2; + optional Layer k_w = 3; + optional Layer k_b = 4; + optional Layer v_w = 5; + optional Layer v_b = 6; + optional Layer dense_w = 7; + optional Layer dense_b = 8; + optional Smolgen smolgen = 9; + + optional Layer rpe_q = 10; + optional Layer rpe_k = 11; + optional Layer rpe_v = 12; + + // reserved 13 - 22 for int8 quantization + } + + message FFN { + optional Layer dense1_w = 1; + optional Layer dense1_b = 2; + optional Layer dense2_w = 3; + optional Layer dense2_b = 4; + // reserved 5 - 10 for int8 quantization + } + + message EncoderLayer { + optional MHA mha = 1; + optional Layer ln1_gammas = 2; + optional Layer ln1_betas = 3; + optional FFN ffn = 4; + optional Layer ln2_gammas = 5; + optional Layer ln2_betas = 6; + } + + message PolicyHead { + optional Layer ip_pol_w = 1; + optional Layer ip_pol_b = 2; + optional Layer ip2_pol_w = 3; // "wq" in policy attention + optional Layer ip2_pol_b = 4; + optional Layer ip3_pol_w = 5; // "wk" in policy attention + optional Layer ip3_pol_b = 6; + optional Layer ip4_pol_w = 7; // "ppo" in policy attention + + // Optional policy encoders for policy head. + repeated EncoderLayer pol_encoder = 8; + optional uint32 pol_headcount = 9; + + // Convolutions for legacy policy head. 
+ optional ConvBlock policy1 = 10; + optional ConvBlock policy = 11; + } + + message ValueHead { + optional Layer ip_val_w = 1; // "embedding" for attention body value + optional Layer ip_val_b = 2; + optional Layer ip1_val_w = 3; + optional Layer ip1_val_b = 4; + optional Layer ip2_val_w = 5; + optional Layer ip2_val_b = 6; + optional Layer ip_val_err_w = 7; + optional Layer ip_val_err_b = 8; + optional Layer ip_val_cat_w = 9; + optional Layer ip_val_cat_b = 10; + + // Legacy value head support. + optional ConvBlock value = 11; + } + + message PolicyHeadMap { + required string key = 1; // name of the policy head + required PolicyHead value = 2; + } + + message PolicyHeads { + optional Layer ip_pol_w = 1; // "embedding" in policy attention + optional Layer ip_pol_b = 2; + optional PolicyHead vanilla = 3; + optional PolicyHead optimistic_st = 4; + optional PolicyHead soft = 5; + optional PolicyHead opponent = 6; + // map policy_head_map = 7; + repeated PolicyHeadMap policy_head_map = 7; + } + + message ValueHeadMap { + required string key = 1; // name of the value head + required ValueHead value = 2; + } + + message ValueHeads { + optional ValueHead winner = 1; + optional ValueHead q = 2; + optional ValueHead st = 3; + // map value_head_map = 4; + repeated ValueHeadMap value_head_map = 4; + } + + // Input convnet. + optional ConvBlock input = 1; + + // Residual tower. + repeated Residual residual = 2; + + // Embedding layer for attention body encoders + // (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + + optional Layer ip_emb_preproc_w = 37; + optional Layer ip_emb_preproc_b = 38; + + optional Layer ip_emb_w = 25; + optional Layer ip_emb_b = 26; + + optional Layer ip_emb_ln_gammas = 39; + optional Layer ip_emb_ln_betas = 40; + + + + // Input gating (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + optional Layer ip_mult_gate = 33; + optional Layer ip_add_gate = 34; + + optional FFN ip_emb_ffn = 41; + optional Layer ip_emb_ffn_ln_gammas = 42; + optional Layer ip_emb_ffn_ln_betas = 43; + + // Encoder stack (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + repeated EncoderLayer encoder = 27; + optional uint32 headcount = 28; + + // Policy encoder stack + // The ffn activation up to and including NETWORK_SE_WITH_HEADFORMAT is SELU, + // otherwise it follows the ffn activation setting. + repeated EncoderLayer pol_encoder = 21; + optional uint32 pol_headcount = 24; + + // Policy head + // Extra convolution for AZ-style policy head + optional ConvBlock policy1 = 11; + optional ConvBlock policy = 3; + optional Layer ip_pol_w = 4; // "embedding" in policy attention + optional Layer ip_pol_b = 5; + // For policy attention, up to and including NETWORK_SE_WITH_HEADFORMAT the + // "embedding" activation is SELU, otherwise it is the default activation. 
+ optional Layer ip2_pol_w = 17; // "wq" in policy attention + optional Layer ip2_pol_b = 18; + optional Layer ip3_pol_w = 19; // "wk" in policy attention + optional Layer ip3_pol_b = 20; + optional Layer ip4_pol_w = 22; // "ppo" in policy attention + + // Value head + optional ConvBlock value = 6; + optional Layer ip_val_w = 29; // "embedding" for attention body value + optional Layer ip_val_b = 30; + optional Layer ip1_val_w = 7; + optional Layer ip1_val_b = 8; + optional Layer ip2_val_w = 9; + optional Layer ip2_val_b = 10; + + optional ValueHeads value_heads = 44; + optional PolicyHeads policy_heads = 45; + + // Moves left head + optional ConvBlock moves_left = 12; + optional Layer ip_mov_w = 31; // "embedding" for attention body moves left + optional Layer ip_mov_b = 32; + optional Layer ip1_mov_w = 13; + optional Layer ip1_mov_b = 14; + optional Layer ip2_mov_w = 15; + optional Layer ip2_mov_b = 16; + + // Global smolgen weights (NETWORK_ATTENTIONBODY_WITH_HEADFORMAT). + optional Layer smolgen_w = 35; + optional Layer smolgen_b = 36; +} + +message TrainingParams { + optional uint32 training_steps = 1; + optional float learning_rate = 2; + optional float mse_loss = 3; + optional float policy_loss = 4; + optional float accuracy = 5; + optional string lc0_params = 6; +} + +message NetworkFormat { + // Format to encode the input planes with. Used by position encoder. + enum InputFormat { + INPUT_UNKNOWN = 0; + INPUT_CLASSICAL_112_PLANE = 1; + INPUT_112_WITH_CASTLING_PLANE = 2; + INPUT_112_WITH_CANONICALIZATION = 3; + INPUT_112_WITH_CANONICALIZATION_HECTOPLIES = 4; + INPUT_112_WITH_CANONICALIZATION_HECTOPLIES_ARMAGEDDON = 132; + INPUT_112_WITH_CANONICALIZATION_V2 = 5; + INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON = 133; + } + optional InputFormat input = 1; + + // Output format of the NN. Used by search code to interpret results. + enum OutputFormat { + OUTPUT_UNKNOWN = 0; + OUTPUT_CLASSICAL = 1; + OUTPUT_WDL = 2; + } + optional OutputFormat output = 2; + + // Network architecture. Used by backends to build the network. + enum NetworkStructure { + // Networks without PolicyFormat or ValueFormat specified + NETWORK_UNKNOWN = 0; + NETWORK_CLASSICAL = 1; + NETWORK_SE = 2; + // Networks with PolicyFormat and ValueFormat specified + NETWORK_CLASSICAL_WITH_HEADFORMAT = 3; + NETWORK_SE_WITH_HEADFORMAT = 4; + NETWORK_ONNX = 5; + NETWORK_ATTENTIONBODY_WITH_HEADFORMAT = 6; + NETWORK_ATTENTIONBODY_WITH_MULTIHEADFORMAT = 7; + NETWORK_AB_LEGACY_WITH_MULTIHEADFORMAT = 134; + } + optional NetworkStructure network = 3; + + // Policy head architecture + enum PolicyFormat { + POLICY_UNKNOWN = 0; + POLICY_CLASSICAL = 1; + POLICY_CONVOLUTION = 2; + POLICY_ATTENTION = 3; + } + optional PolicyFormat policy = 4; + + // Value head architecture + enum ValueFormat { + VALUE_UNKNOWN = 0; + VALUE_CLASSICAL = 1; + VALUE_WDL = 2; + VALUE_PARAM = 3; + } + optional ValueFormat value = 5; + + // Moves left head architecture + enum MovesLeftFormat { + MOVES_LEFT_NONE = 0; + MOVES_LEFT_V1 = 1; + } + optional MovesLeftFormat moves_left = 6; + + enum ActivationFunction { + ACTIVATION_DEFAULT = 0; + ACTIVATION_MISH = 1; + ACTIVATION_RELU = 2; + ACTIVATION_NONE = 3; + ACTIVATION_TANH = 4; + ACTIVATION_SIGMOID = 5; + ACTIVATION_SELU = 6; + ACTIVATION_SWISH = 7; + ACTIVATION_RELU_2 = 8; + ACTIVATION_SOFTMAX = 9; + } + + // Activation used everywhere except head outputs or otherwise specified. 
+ enum DefaultActivation { + DEFAULT_ACTIVATION_RELU = 0; + DEFAULT_ACTIVATION_MISH = 1; + } + optional DefaultActivation default_activation = 7; + + optional ActivationFunction smolgen_activation = 8; + optional ActivationFunction ffn_activation = 9; + + enum InputEmbeddingFormat { + INPUT_EMBEDDING_NONE = 0; + INPUT_EMBEDDING_PE_MAP = 1; + INPUT_EMBEDDING_PE_DENSE = 2; + } + optional InputEmbeddingFormat input_embedding = 10; +} + +message Format { + enum Encoding { + UNKNOWN = 0; + LINEAR16 = 1; + } + // Any encoding specified in a Layer overides this. + optional Encoding weights_encoding = 1; + // If network_format is missing, it's assumed to have + // INPUT_CLASSICAL_112_PLANE / OUTPUT_CLASSICAL / NETWORK_CLASSICAL format. + optional NetworkFormat network_format = 2; +} + +message OnnxModel { + enum DataType { + UNKNOWN_DATATYPE = 0; + FLOAT = 1; + FLOAT16 = 10; + BFLOAT16 = 16; + } + + // Serialized OnnxProto model. + optional bytes model = 1; + optional DataType data_type = 2; + // Name of the input tensor to populate. + optional string input_planes = 3; + // Names of the output tensors to get results from. + // If some feature is not present, corresponding values are not set. + optional string output_value = 4; + optional string output_wdl = 5; + optional string output_policy = 6; + optional string output_mlh = 7; +} + +message Net { + optional fixed32 magic = 1; + optional string license = 2; + optional EngineVersion min_version = 3; + optional Format format = 4; + optional TrainingParams training_params = 5; + // Either weights or onnx_model is set, but not both. + optional Weights weights = 10; + optional OnnxModel onnx_model = 11; +} diff --git a/src/neural/onnx/onnx.proto b/proto/onnx.proto similarity index 100% rename from src/neural/onnx/onnx.proto rename to proto/onnx.proto diff --git a/scripts/appveyor_win_build.cmd b/scripts/appveyor_win_build.cmd index 43ab5f211a..00e739d567 100644 --- a/scripts/appveyor_win_build.cmd +++ b/scripts/appveyor_win_build.cmd @@ -1,5 +1,5 @@ SET PGO=false -IF %APPVEYOR_REPO_TAG%==true IF %DX%==false IF %ONNX_DML%==false SET PGO=true +IF %APPVEYOR_REPO_TAG%==true IF %DX%==false IF %ONNX%==false SET PGO=true IF %PGO%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" IF EXIST build\lc0.pdb del build\lc0.pdb IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGInstrument /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" @@ -19,3 +19,12 @@ IF %PGO%==true ( ) cd .. IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /p:DebugInformationFormat=ProgramDatabase /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" +IF %NAME%==onnx ( + ren build\lc0.exe lc0-trt.exe + meson configure build -Ddefault_backend= -Dcudnn_libdirs= -Dgtest=%GTEST% + # This is needed as a separate step. 
+ msbuild "C:\projects\lc0\build\lc0.sln" /target:REGEN + IF %PGO%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /p:DebugInformationFormat=ProgramDatabase /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" + IF %PGO%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" + ren build\lc0.exe lc0-dml.exe +) diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd index eb36adf26c..eaf1ba73b7 100644 --- a/scripts/appveyor_win_package.cmd +++ b/scripts/appveyor_win_package.cmd @@ -1,6 +1,6 @@ 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip %APPVEYOR_BUILD_FOLDER%\build\lc0.exe -IF %NAME%==gpu-nvidia-cuda appveyor DownloadFile "https://github.com/LeelaChessZero/lczero-client/releases/latest/download/lc0-training-client.exe" -IF %NAME%==gpu-nvidia-cuda 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-training-client.exe +IF %NAME%==gpu-nvidia-cuda12 appveyor DownloadFile "https://github.com/LeelaChessZero/lczero-client/releases/latest/download/lc0-training-client.exe" +IF %NAME%==gpu-nvidia-cuda12 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-training-client.exe type COPYING |more /P > dist\COPYING 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\COPYING 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip c:\cache\%NET%.pb.gz @@ -17,24 +17,48 @@ IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\ IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.77\build\native\bin\OpenCL.dll IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_101.dll" "%CUDA_PATH%\bin\cublas64_10.dll" "%CUDA_PATH%\bin\cublasLt64_10.dll" IF %CUDNN%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\cuda\bin\cudnn64_7.dll" -IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" -IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE -IF %NAME%==cpu-dnnl copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS -IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE -IF %NAME%==cpu-dnnl 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS -IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE -IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS -IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE -IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS -IF %ONNX_DML%==true type dist\README-onnx-dml.txt |more /P > dist\README.txt -IF %ONNX_DML%==true type dist\install-dml.cmd |more /P > dist\install.cmd -IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-DML-LICENSE -IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-DML-ThirdPartyNotices.txt -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll" -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt -IF %ONNX_DML%==true 7z a 
lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-LICENSE -IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-ThirdPartyNotices.txt +IF %NAME%==gpu-nvidia-cuda11 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_110.dll" "%CUDA_PATH%\bin\cublas64_11.dll" "%CUDA_PATH%\bin\cublasLt64_11.dll" +IF %NAME%==gpu-nvidia-cuda12 ( + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_12.dll" "%CUDA_PATH%\bin\cublas64_12.dll" "%CUDA_PATH%\bin\cublasLt64_12.dll" + type dist\install-cuda_12_9.cmd |more /P > dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-nodll.zip .\dist\install.cmd +) +IF %NAME%==cpu-dnnl ( + copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE + copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS +) +IF %NAME%==onednn ( + copy "%PKG_FOLDER%\%DNNL_NAME%\LICENSE" dist\DNNL-LICENSE + copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL-THIRD-PARTY-PROGRAMS + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS +) +IF %ONNX%==true ( + copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-LICENSE + copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-ThirdPartyNotices.txt + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime.dll" + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-LICENSE + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-ThirdPartyNotices.txt + copy lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip + ren lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip %APPVEYOR_BUILD_FOLDER%\build\lc0-dml.exe + 7z rn lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip lc0-dml.exe lc0.exe + type dist\README-onnx-dml.txt |more /P > dist\README.txt + type dist\install-dml.cmd |more /P > dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip .\dist\README.txt + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-dml.zip .\dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip %APPVEYOR_BUILD_FOLDER%\build\lc0-trt.exe + 7z rn lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip lc0-trt.exe lc0.exe + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME%\runtimes\win-x64\native\onnxruntime_providers_shared.dll" + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME_TWO%\lib\onnxruntime_providers_cuda.dll" + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip "%PKG_FOLDER%\%ONNX_NAME_TWO%\lib\onnxruntime_providers_tensorrt.dll" + type dist\README-onnx-trt.txt |more /P > dist\README.txt + type dist\install-trt.cmd |more /P > dist\install.cmd + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip .\dist\README.txt + 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%-trt.zip .\dist\install.cmd +) IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat IF %OPENCL%==true 7z a 
lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\check_opencl.bat IF %DX%==true type scripts\check_dx.bat |more /P > dist\check_dx.bat diff --git a/scripts/compile_proto.py b/scripts/compile_proto.py index bbbdfb7342..c6a81996d9 100755 --- a/scripts/compile_proto.py +++ b/scripts/compile_proto.py @@ -29,62 +29,73 @@ import os import re import sys +from typing import Any VARINT_TYPES = { - 'int32': 'std::int32_t', - 'int64': 'std::int64_t', - 'uint32': 'std::uint32_t', - 'uint64': 'std::uint64_t', - 'sint32': 'std::uint32_t', - 'sint64': 'std::uint64_t', - 'bool': 'bool', + "int32": "std::int32_t", + "int64": "std::int64_t", + "uint32": "std::uint32_t", + "uint64": "std::uint64_t", + "sint32": "std::uint32_t", + "sint64": "std::uint64_t", + "bool": "bool", } FIXED64_TYPES = { - 'fixed64': 'std::uint64_t', - 'sfixed64': 'std::int64_t', - 'double': 'double', + "fixed64": "std::uint64_t", + "sfixed64": "std::int64_t", + "double": "double", } FIXED32_TYPES = { - 'fixed32': 'std::uint32_t', - 'sfixed32': 'std::int32_t', - 'float': 'float', + "fixed32": "std::uint32_t", + "sfixed32": "std::int32_t", + "float": "float", } BYTES_TYPES = { - 'string': 'std::string_view', - 'bytes': 'std::string_view', + "string": "std::string_view", + "bytes": "std::string_view", } -ZIGZAG_TYPES = set(['sint32', 'sint64']) -FLOAT_TYPES = set(['float', 'double']) +ZIGZAG_TYPES = set(["sint32", "sint64"]) +FLOAT_TYPES = set(["float", "double"]) TYPES = {**VARINT_TYPES, **FIXED32_TYPES, **FIXED64_TYPES, **BYTES_TYPES} RESERVED_WORDS = [ - 'enum', - 'message', - 'optional', - 'package', - 'repeated', - 'required', - 'reserved', - 'syntax', - 'to', + "enum", + "message", + "optional", + "package", + "repeated", + "required", + "reserved", + "syntax", + "to", ] + list(TYPES.keys()) -GRAMMAR = ([(r'%s\b' % x, x) - for x in RESERVED_WORDS] + [('\\' + x, x) for x in '=;{}.,'] + [ - (r'/\*.*?\*/', None), # /* Comment */ - (r'//.*?$', None), # // Comment - (r'\s+', None), # Whitespace - (r'$', 'EOF'), - (r'"((?:[^"\\]|\\.)*)"', 'string'), - (r'\d+', 'number'), - (r'(\w+)', 'identifier'), - ]) +GRAMMAR = ( + [(r"%s\b" % x, x) for x in RESERVED_WORDS] + + [("\\" + x, x) for x in "=;{}.,[]"] + + [ + (r"/\*.*?\*/", None), # /* Comment */ + (r"//.*?$", None), # // Comment + (r"\s+", None), # Whitespace + (r"$", "EOF"), + (r'"((?:[^"\\]|\\.)*)"', "string"), + ( + r"[-+]?(?:[0-9]*\.[0-9]+(?:[eE][-+]?[0-9]+)?|[0-9]+[eE][-+]?[0-9]+)", + "fnumber", + ), + (r"[-+]?\d+", "number"), + (r"(\w+)", "identifier"), + ] +) + +ALLOWED_ATTRIBUTES = { + "default", +} class Lexer: - def __init__(self, text): self.text = text self.grammar = [(re.compile(x, re.S + re.M), y) for x, y in GRAMMAR] @@ -92,31 +103,31 @@ def __init__(self, text): self.cur_offset = 0 def Pick(self): - '''Picks the last token in queue. Doesn't advance the queue.''' + """Picks the last token in queue. Doesn't advance the queue.""" if self.cur_token is None: self.cur_token = self.NextToken() return self.cur_token def Consume(self, expected_token, value=None, group=0): - '''Gets the token from the queue and advances the queue. + """Gets the token from the queue and advances the queue. If @expected_token if of wrong type, or @value is not equal to regexes @group, throws an error. 
- ''' + """ token, match = self.Pick() if expected_token != token: - self.Error(f'Expected token type [{expected_token}], got [{token}]') + self.Error(f"Expected token type [{expected_token}], got [{token}]") if value is not None and value != match.group(group): - self.Error('Expected value [%s]' % value) + self.Error("Expected value [%s]" % value) self.cur_offset = match.span()[1] self.cur_token = None return match def NextToken(self): - '''Reads the stream and returns the next token. + """Reads the stream and returns the next token. (which is not whitespace or comment) - ''' + """ while True: token, match = self.NextTokenOrWhitespace() if token is None: @@ -125,40 +136,42 @@ def NextToken(self): return token, match def NextTokenOrWhitespace(self): - '''Reads the stream and returns the next token (possibly whitespace).''' + """Reads the stream and returns the next token (possibly whitespace).""" for r, token in self.grammar: m = r.match(self.text, self.cur_offset) if m: return (token, m) - token_snippet = self.text[self.cur_offset:self.cur_offset + 10] - self.Error(f'Unparseable token [{token_snippet}...]') + token_snippet = self.text[self.cur_offset : self.cur_offset + 10] + self.Error(f"Unparseable token [{token_snippet}...]") def Error(self, text): - '''Throws an error with context in the file read.''' - line = self.text[:self.cur_offset].count('\n') + 1 - line_start = self.text.rfind('\n', 0, self.cur_offset) + 1 - line_end = self.text.find('\n', line_start) + """Throws an error with context in the file read.""" + line = self.text[: self.cur_offset].count("\n") + 1 + line_start = self.text.rfind("\n", 0, self.cur_offset) + 1 + line_end = self.text.find("\n", line_start) if line_end == -1: line_end = len(self.text) - sys.stderr.write('%s:\n' % text) - sys.stderr.write(self.text[line_start:line_end] + '\n') - sys.stderr.write(' ' * (self.cur_offset - line_start) + '^^^\n') - raise ValueError("Parse error: %s at line %d column %d." % - (text, line, (self.cur_offset - line_start))) + sys.stderr.write("%s:\n" % text) + sys.stderr.write(self.text[line_start:line_end] + "\n") + sys.stderr.write(" " * (self.cur_offset - line_start) + "^^^\n") + raise ValueError( + "Parse error: %s at line %d column %d." + % (text, line, (self.cur_offset - line_start)) + ) def ReadIdentifierPath(lexer): - '''Reads qualified identifier a.b.d into ['a', 'b', 'd'] list''' + """Reads qualified identifier a.b.d into ['a', 'b', 'd'] list""" path = [] while True: - path.append(lexer.Consume('identifier').group(0)) - if lexer.Pick()[0] != '.': + path.append(lexer.Consume("identifier").group(0)) + if lexer.Pick()[0] != ".": return path - lexer.Consume('.') + lexer.Consume(".") def LookupType(name, stack): - '''Looks up the (possibly qualified) from the innermost scope first.''' + """Looks up the (possibly qualified) from the innermost scope first.""" for y in stack: for x in y: if x.GetName() == name[0]: @@ -166,7 +179,7 @@ def LookupType(name, stack): return x else: return LookupType(name[1:], [x.GetTypes()]) - raise ValueError("Cannot find type: %s." % '.'.join(name)) + raise ValueError("Cannot find type: %s." 
% ".".join(name)) # All *Parser classes have the following semantics: @@ -175,18 +188,17 @@ def LookupType(name, stack): class ProtoTypeParser: - def __init__(self, lexer, object_stack): token, match = lexer.Pick() if token in TYPES: - self.typetype = 'basic' + self.typetype = "basic" self.name = token lexer.Consume(token) - elif token == 'identifier': + elif token == "identifier": self.name = ReadIdentifierPath(lexer) - self.typetype = 'forward' + self.typetype = "forward" else: - lexer.Error('Type expected') + lexer.Error("Type expected") def LookupForwardFieldType(self, object_stack): if self.IsForward(): @@ -195,41 +207,43 @@ def LookupForwardFieldType(self, object_stack): self.name = [typ.GetFullName()] def IsZigzag(self): - if self.typetype == 'basic': + if self.typetype == "basic": return self.name in ZIGZAG_TYPES return False def GetCppType(self): - if self.typetype == 'basic': + if self.typetype == "basic": return TYPES[self.name] else: - return '_'.join(self.name) + return "_".join(self.name) def GetVariableCppType(self): if self.IsBytesType(): - return 'std::string' + return "std::string" else: return self.GetCppType() def IsEnumType(self): - return self.typetype == 'enum' + return self.typetype == "enum" def IsVarintType(self): - return self.typetype == 'enum' or (self.typetype == 'basic' - and self.name in VARINT_TYPES) + return self.typetype == "enum" or ( + self.typetype == "basic" and self.name in VARINT_TYPES + ) def IsFixedType(self): - return self.typetype == 'basic' and (self.name in FIXED64_TYPES - or self.name in FIXED32_TYPES) + return self.typetype == "basic" and ( + self.name in FIXED64_TYPES or self.name in FIXED32_TYPES + ) def IsBytesType(self): - return self.typetype == 'basic' and self.name in BYTES_TYPES + return self.typetype == "basic" and self.name in BYTES_TYPES def IsFloatType(self): - return self.typetype == 'basic' and self.name in FLOAT_TYPES + return self.typetype == "basic" and self.name in FLOAT_TYPES def GetWireType(self): - if self.typetype == 'basic': + if self.typetype == "basic": if self.name in VARINT_TYPES: return 0 if self.name in FIXED64_TYPES: @@ -238,52 +252,84 @@ def GetWireType(self): return 2 if self.name in FIXED32_TYPES: return 5 - raise ValueError('Unknown type %s' % self.name) - elif self.typetype == 'enum': + raise ValueError("Unknown type %s" % self.name) + elif self.typetype == "enum": return 0 - elif self.typetype == 'message': + elif self.typetype == "message": return 2 else: - raise ValueError('Unknown typetype %s' % self.typetype) + raise ValueError("Unknown typetype %s" % self.typetype) def IsMessage(self): - return self.typetype == 'message' + return self.typetype == "message" def IsForward(self): - return self.typetype == 'forward' + return self.typetype == "forward" def IsIntegralType(self): - if self.typetype == 'basic': - if self.name == 'double': + if self.typetype == "basic": + if self.name == "double": return False - if self.name == 'float': + if self.name == "float": return False if self.name in BYTES_TYPES: return False if self.name in TYPES: return True - raise ValueError('Unknown type %s' % self.name) - elif self.typetype == 'enum': + raise ValueError("Unknown type %s" % self.name) + elif self.typetype == "enum": return True - elif self.typetype == 'message': + elif self.typetype == "message": return False else: - raise ValueError('Unknown typetype %s' % self.typetype) + raise ValueError("Unknown typetype %s" % self.typetype) class ProtoFieldParser: - def __init__(self, lexer, object_stack): token, match = 
lexer.Pick() - if token not in ['repeated', 'optional', 'required']: - lexer.Error('repeated, optional or required expected') + if token not in ["repeated", "optional", "required"]: + lexer.Error("repeated, optional or required expected") self.category = token lexer.Consume(token) self.type = ProtoTypeParser(lexer, object_stack) - self.name = lexer.Consume('identifier') - lexer.Consume('=') - self.number = int(lexer.Consume('number').group(0)) - lexer.Consume(';') + self.name = lexer.Consume("identifier") + lexer.Consume("=") + self.number = int(lexer.Consume("number").group(0)) + self.attributes = ProtoFieldParser.ParseAttributes(lexer) + lexer.Consume(";") + + @staticmethod + def ParseAttributes(lexer): + attributes = {} + token, match = lexer.Pick() + if token != "[": + return attributes + lexer.Consume("[") + while True: + name = lexer.Consume("identifier").group(0) + if name not in ALLOWED_ATTRIBUTES: + lexer.Error("Unknown attribute %s" % name) + lexer.Consume("=") + token, match = lexer.Pick() + value = None + if token == "string": + value = lexer.Consume("string").group(0) + elif token == "fnumber": + value = float(lexer.Consume("fnumber").group(0)) + elif token == "number": + value = int(lexer.Consume("number").group(0)) + else: + lexer.Error("Expected string or number as default value") + attributes[name] = value + token, _ = lexer.Pick() + if token == "]": + lexer.Consume("]") + return attributes + elif token == ",": + lexer.Consume(",") + else: + lexer.Error("Expected ']' or ','") def IsType(self): return False @@ -294,96 +340,96 @@ def LookupForwardFieldType(self, object_stack): def GetParser(self): name = self.name.group(0) if self.type.IsMessage(): - if self.category == 'repeated': - return 'add_%s()->MergeFromString(val)' % name + if self.category == "repeated": + return "add_%s()->MergeFromString(val)" % name else: - return 'mutable_%s()->MergeFromString(val)' % name + return "mutable_%s()->MergeFromString(val)" % name cpp_type = self.type.GetCppType() - val = 'NOT IMPLEMENTED!' + val = "NOT IMPLEMENTED!" 
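Note on the UnZigZag(val) branch handled just below: proto2's sint32/sint64 fields (the ZIGZAG_TYPES set above) are zigzag-encoded varints, mapping signed values to unsigned ones so that small negative numbers stay short on the wire. A generic reference sketch of the mapping (not the project's C++ helper, just the standard protobuf convention):

def zigzag_encode(n: int) -> int:
    # -1 -> 1, 1 -> 2, -2 -> 3, 2 -> 4, ... (64-bit arithmetic shift)
    return (n << 1) ^ (n >> 63)

def zigzag_decode(z: int) -> int:
    # Inverse mapping; this is the transformation the generated UnZigZag(val)
    # call is expected to perform.
    return (z >> 1) ^ -(z & 1)

assert [zigzag_encode(v) for v in (-1, 1, -2, 2)] == [1, 2, 3, 4]
assert all(zigzag_decode(zigzag_encode(v)) == v for v in range(-5, 6))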
if self.type.IsVarintType(): - val_val = 'UnZigZag(val)' if self.type.IsZigzag() else 'val' - val = 'static_cast<%s>(%s)' % (cpp_type, val_val) + val_val = "UnZigZag(val)" if self.type.IsZigzag() else "val" + val = "static_cast<%s>(%s)" % (cpp_type, val_val) elif self.type.IsFixedType(): if self.type.IsFloatType(): - val = 'bit_cast<%s>(val)' % cpp_type + val = "bit_cast<%s>(val)" % cpp_type else: - val = 'static_cast<%s>(val)' % cpp_type + val = "static_cast<%s>(val)" % cpp_type elif self.type.IsBytesType(): - val = 'val' + val = "val" - if self.category == 'repeated': - return '%s_.emplace_back(%s)' % (name, val) + if self.category == "repeated": + return "%s_.emplace_back(%s)" % (name, val) else: - return 'set_%s(%s)' % (name, val) + return "set_%s(%s)" % (name, val) def GenerateCaseClause(self, w): - w.Write('case %d: %s; break;' % (self.number, self.GetParser())) + w.Write("case %d: %s; break;" % (self.number, self.GetParser())) def GenerateClear(self, w): - name = self.name.group(0) - if self.category == 'repeated': - w.Write('%s_.clear();' % name) + name = self.name.group(0) + if self.category == "repeated": + w.Write("%s_.clear();" % name) else: - w.Write('has_%s_ = false;' % name) - w.Write('%s_ = {};' % name) + w.Write("has_%s_ = false;" % name) + if "default" in self.attributes: + w.Write("%s_ = %s;" % (name, self.attributes["default"])) + else: + w.Write("%s_ = {};" % name) def GenerateOutput(self, w): fname = { - 0: 'AppendVarInt', - 1: 'AppendInt64', - 2: 'AppendString', - 5: 'AppendInt32' + 0: "AppendVarInt", + 1: "AppendInt64", + 2: "AppendString", + 5: "AppendInt32", } tname = { - 0: 'std::uint64_t', - 1: 'std::uint64_t', - 2: 'std::string_view', - 5: 'std::uint32_t' + 0: "std::uint64_t", + 1: "std::uint64_t", + 2: "std::string_view", + 5: "std::uint32_t", } wire_id = self.type.GetWireType() - if self.category == 'repeated': - prefix = 'for (const auto& x : %s)' % (self.name.group(0) + '_') - name = 'x' + if self.category == "repeated": + prefix = "for (const auto& x : %s)" % (self.name.group(0) + "_") + name = "x" else: - name = self.name.group(0) + '_' - prefix = 'if (has_%s)' % (name) + name = self.name.group(0) + "_" + prefix = "if (has_%s)" % (name) if self.type.IsMessage(): - name += '.OutputAsString()' + name += ".OutputAsString()" elif self.type.IsFloatType(): - name = 'bit_cast<%s>(%s)' % (tname[wire_id], name) + name = "bit_cast<%s>(%s)" % (tname[wire_id], name) - w.Write('%s %s(%d, %s, &out);' % - (prefix, fname[wire_id], self.number, name)) + w.Write("%s %s(%d, %s, &out);" % (prefix, fname[wire_id], self.number, name)) def GenerateJsonOutput(self, w): name = self.name.group(0) - if self.category == 'repeated': - prefix = 'if (!%s_.empty())' % name - funcname = 'AppendJsonRepeatedField' + if self.category == "repeated": + prefix = "if (!%s_.empty())" % name + funcname = "AppendJsonRepeatedField" else: - prefix = 'if (has_%s_)' % name - funcname = 'AppendJsonField' + prefix = "if (has_%s_)" % name + funcname = "AppendJsonField" if self.type.IsEnumType(): - value = '%s_Name(%s_)' % (self.type.GetCppType(), name) + value = "%s_Name(%s_)" % (self.type.GetCppType(), name) else: value = name + "_" - w.Write('%s %s("%s", %s, &first, &out);' % - (prefix, funcname, name, value)) + w.Write('%s %s("%s", %s, &first, &out);' % (prefix, funcname, name, value)) def GenerateFunctionDeclarations(self, w): name = self.name.group(0) cpp_type = self.type.GetCppType() var_cpp_type = self.type.GetVariableCppType() - if self.category == 'repeated': + if self.category == "repeated": if 
self.type.IsMessage(): w.Write("%s* add_%s();" % (cpp_type, name)) else: w.Write("void add_%s(%s val);" % (name, cpp_type)) # Using a vector here breaks API compatibility with the standard # protobuf library, but it is more convenient. - w.Write("const std::vector<%s>& %s() const;" % - (var_cpp_type, name)) + w.Write("const std::vector<%s>& %s() const;" % (var_cpp_type, name)) w.Write("std::vector<%s>* mutable_%s();" % (var_cpp_type, name)) if self.type.IsMessage(): w.Write("const %s& %s(size_t idx) const;" % (cpp_type, name)) @@ -405,54 +451,70 @@ def GenerateFunctionDefinitions(self, w, class_name): name = self.name.group(0) cpp_type = self.type.GetCppType() var_cpp_type = self.type.GetVariableCppType() - if self.category == 'repeated': + if self.category == "repeated": if self.type.IsMessage(): w.Write( - "inline %s* %s::add_%s() { return &%s_.emplace_back(); }" % - (cpp_type, class_name, name, name)) + "inline %s* %s::add_%s() { return &%s_.emplace_back(); }" + % (cpp_type, class_name, name, name) + ) else: w.Write( "inline void %s::add_%s(%s val) { %s_.emplace_back(val); }" - % (class_name, name, cpp_type, name)) + % (class_name, name, cpp_type, name) + ) w.Write( "inline const std::vector<%s>& %s::%s() const { return %s_; }" - % (var_cpp_type, class_name, name, name)) + % (var_cpp_type, class_name, name, name) + ) w.Write( "inline std::vector<%s>* %s::mutable_%s() { return &%s_; }" - % (var_cpp_type, class_name, name, name)) + % (var_cpp_type, class_name, name, name) + ) if self.type.IsMessage(): w.Write( "inline const %s& %s::%s(size_t idx) const { return %s_[idx]; }" - % (cpp_type, class_name, name, name)) + % (cpp_type, class_name, name, name) + ) w.Write( "inline %s* %s::mutable_%s(size_t idx) { return &%s_[idx]; }" - % (cpp_type, class_name, name, name)) + % (cpp_type, class_name, name, name) + ) else: w.Write( - "inline %s %s::%s(size_t idx) const { return %s_[idx]; }" % - (cpp_type, class_name, name, name)) + "inline %s %s::%s(size_t idx) const { return %s_[idx]; }" + % (cpp_type, class_name, name, name) + ) w.Write( - "inline size_t %s::%s_size() const { return %s_.size(); }" % - (class_name, name, name)) + "inline size_t %s::%s_size() const { return %s_.size(); }" + % (class_name, name, name) + ) else: - w.Write("inline bool %s::has_%s() const { return has_%s_; }" % - (class_name, name, name)) + w.Write( + "inline bool %s::has_%s() const { return has_%s_; }" + % (class_name, name, name) + ) if self.type.IsMessage(): - w.Write("inline const %s& %s::%s() const { return %s_; }" % - (cpp_type, class_name, name, name)) + w.Write( + "inline const %s& %s::%s() const { return %s_; }" + % (cpp_type, class_name, name, name) + ) if self.type.IsMessage() or self.type.IsBytesType(): - w.Write("inline %s* %s::mutable_%s() {" % - (var_cpp_type, class_name, name)) + w.Write( + "inline %s* %s::mutable_%s() {" % (var_cpp_type, class_name, name) + ) w.Indent() - w.Write('has_%s_ = true;' % (name)) - w.Write('return &%s_;' % name) + w.Write("has_%s_ = true;" % (name)) + w.Write("return &%s_;" % name) w.Unindent() w.Write("}") if not self.type.IsMessage(): - w.Write("inline %s %s::%s() const { return %s_; }" % - (cpp_type, class_name, name, name)) - w.Write("inline void %s::set_%s(%s val) {" % - (class_name, name, cpp_type)) + w.Write( + "inline %s %s::%s() const { return %s_; }" + % (cpp_type, class_name, name, name) + ) + w.Write( + "inline void %s::set_%s(%s val) {" % (class_name, name, cpp_type) + ) w.Indent() w.Write("has_%s_ = true;" % name) w.Write("%s_ = val;" % name) @@ -462,41 +524,43 
@@ def GenerateFunctionDefinitions(self, w, class_name): def GenerateVariable(self, w): name = self.name.group(0) cpp_type = self.type.GetVariableCppType() - if self.category == 'repeated': + if self.category == "repeated": w.Write("std::vector<%s> %s_;" % (cpp_type, name)) else: w.Write("bool has_%s_{};" % (name)) - w.Write("%s %s_{};" % (cpp_type, name)) + if "default" in self.attributes: + w.Write("%s %s_{%s};" % (cpp_type, name, self.attributes["default"])) + else: + w.Write("%s %s_{};" % (cpp_type, name)) return class ProtoEnumParser: - def __init__(self, lexer, scope): - lexer.Consume('enum') - self.name = lexer.Consume('identifier').group(0) + lexer.Consume("enum") + self.name = lexer.Consume("identifier").group(0) self.values = [] self.scope = scope[:] - lexer.Consume('{') + lexer.Consume("{") while True: token, match = lexer.Pick() - if token == '}': + if token == "}": break - key = lexer.Consume('identifier').group(0) - lexer.Consume('=') - value = int(lexer.Consume('number').group(0)) - lexer.Consume(';') + key = lexer.Consume("identifier").group(0) + lexer.Consume("=") + value = int(lexer.Consume("number").group(0)) + lexer.Consume(";") self.values.append((key, value)) - lexer.Consume('}') + lexer.Consume("}") def GetName(self): return self.name def GetFullName(self): - return '_'.join([x.GetName() for x in self.scope] + [self.name]) + return "_".join([x.GetName() for x in self.scope] + [self.name]) def GetType(self): - return 'enum' + return "enum" def IsType(self): return True @@ -515,110 +579,112 @@ def GenerateFunctionDefinitions(self, w): def GenerateEnumDefinitions(self, w): # Protobuf enum is mapped directly to C++ enum. - w.Write('enum %s : int {' % self.GetFullName()) + w.Write("enum %s : int {" % self.GetFullName()) w.Indent() for key, value in self.values: - w.Write('%s_%s = %d,' % (self.GetFullName(), key, value)) + w.Write("%s_%s = %d," % (self.GetFullName(), key, value)) w.Unindent() - w.Write('};') - w.Write('inline std::string %s_Name(%s val) {' % - (self.GetFullName(), self.GetFullName())) + w.Write("};") + w.Write( + "inline std::string %s_Name(%s val) {" + % (self.GetFullName(), self.GetFullName()) + ) w.Indent() - w.Write('switch (val) {') + w.Write("switch (val) {") w.Indent() for key, _ in self.values: - w.Write('case %s_%s:' % (self.GetFullName(), key)) + w.Write("case %s_%s:" % (self.GetFullName(), key)) w.Write(' return "%s";' % key) w.Unindent() - w.Write('};') + w.Write("};") w.Write('return "%s(" + std::to_string(val) + ")";' % self.name) w.Unindent() - w.Write('}') + w.Write("}") def GenerateUsingDirectives(self, w): - w.Write('using %s = %s;' % (self.name, self.GetFullName())) + w.Write("using %s = %s;" % (self.name, self.GetFullName())) for key, _ in self.values: - w.Write('static constexpr %s %s =' % (self.name, key)) - w.Write(' %s_%s;' % (self.GetFullName(), key)) - w.Write('static constexpr std::array<%s,%d> %s_AllValues = {' % - (self.name, len(self.values), self.name)) + w.Write("static constexpr %s %s =" % (self.name, key)) + w.Write(" %s_%s;" % (self.GetFullName(), key)) + w.Write( + "static constexpr std::array<%s,%d> %s_AllValues = {" + % (self.name, len(self.values), self.name) + ) w.Indent() for key, _ in self.values: - w.Write('%s,' % key) + w.Write("%s," % key) w.Unindent() - w.Write('};') + w.Write("};") # Static function to convert an enum value to its name. 
- w.Write('static std::string %s_Name(%s val) {' % - (self.name, self.name)) + w.Write("static std::string %s_Name(%s val) {" % (self.name, self.name)) w.Indent() - w.Write('return %s_Name(val);' % (self.GetFullName())) + w.Write("return %s_Name(val);" % (self.GetFullName())) w.Unindent() - w.Write('}') + w.Write("}") def ParseReservedFields(lexer): res = set() - lexer.Consume('reserved') + lexer.Consume("reserved") while True: token, match = lexer.Pick() - if token == 'number': - num = int(lexer.Consume('number').group(0)) - if lexer.Pick()[0] == 'to': - lexer.Consume('to') - end = int(lexer.Consume('number').group(0)) + if token == "number": + num = int(lexer.Consume("number").group(0)) + if lexer.Pick()[0] == "to": + lexer.Consume("to") + end = int(lexer.Consume("number").group(0)) res.add(range(num, end + 1)) else: res.add(num) - elif token in ['identifier', 'string']: + elif token in ["identifier", "string"]: res.add(lexer.Consume(token).group(1)) else: - lexer.Error('Expected number or identifier') + lexer.Error("Expected number or identifier") token, _ = lexer.Pick() - if token == ';': - lexer.Consume(';') + if token == ";": + lexer.Consume(";") break - lexer.Consume(',') + lexer.Consume(",") return res class ProtoMessageParser: - def __init__(self, lexer, type_stack, scope): type_stack[0].append(self) self.reserved = set() self.types = [] self.fields = [] self.scope = scope[:] - lexer.Consume('message') - self.name = lexer.Consume('identifier').group(0) - lexer.Consume('{') + lexer.Consume("message") + self.name = lexer.Consume("identifier").group(0) + lexer.Consume("{") while True: token, match = lexer.Pick() - if token == '}': + if token == "}": break - elif token == 'message': - ProtoMessageParser(lexer, [self.types, *type_stack], - self.scope + [self]) - elif token == 'enum': + elif token == "message": + ProtoMessageParser( + lexer, [self.types, *type_stack], self.scope + [self] + ) + elif token == "enum": self.types.append(ProtoEnumParser(lexer, self.scope + [self])) - elif token in ['repeated', 'optional', 'required']: - self.fields.append( - ProtoFieldParser(lexer, [self.types, *type_stack])) - elif token == 'reserved': + elif token in ["repeated", "optional", "required"]: + self.fields.append(ProtoFieldParser(lexer, [self.types, *type_stack])) + elif token == "reserved": self.reserved.update(ParseReservedFields(lexer)) else: - lexer.Error('Expected field or type') - lexer.Consume('}') + lexer.Error("Expected field or type") + lexer.Consume("}") self.CheckReserved() def GetName(self): return self.name def GetFullName(self): - return '_'.join([x.GetName() for x in self.scope] + [self.name]) + return "_".join([x.GetName() for x in self.scope] + [self.name]) def GetType(self): - return 'message' + return "message" def IsType(self): return True @@ -631,19 +697,20 @@ def GetFieldsGruppedByWireType(self): for x in self.fields: type_to_fields.setdefault(x.type.GetWireType(), []).append(x) return type_to_fields - + def CheckReserved(self): for r in self.reserved: if isinstance(r, int): if any(x.number == r for x in self.fields): - raise ValueError(f'Field number [{r}] is reserved.') + raise ValueError(f"Field number [{r}] is reserved.") elif isinstance(r, range): if any(x.number in r for x in self.fields): - raise ValueError(f'Field range [{r.start} to {r.stop-1}] ' - 'is reserved.') + raise ValueError( + f"Field range [{r.start} to {r.stop - 1}] is reserved." 
+ ) else: if any(x.name.group(0) == r for x in self.fields): - raise ValueError(f'Field name [{r}] is reserved.') + raise ValueError(f"Field name [{r}] is reserved.") def ResolveForwardDeclarations(self, type_stack): type_stack.append(self.types) @@ -654,41 +721,44 @@ def ResolveForwardDeclarations(self, type_stack): type_stack.pop() def WriteFieldParserDeclaration(self, w, wire_id, fields): - fname = {0: 'SetVarInt', 1: 'SetInt64', 2: 'SetString', 5: 'SetInt32'} + fname = {0: "SetVarInt", 1: "SetInt64", 2: "SetString", 5: "SetInt32"} tname = { - 0: 'std::uint64_t', - 1: 'std::uint64_t', - 2: 'std::string_view', - 5: 'std::uint32_t' + 0: "std::uint64_t", + 1: "std::uint64_t", + 2: "std::string_view", + 5: "std::uint32_t", } - w.Write('void %s(int field_id, %s val) final;' % - (fname[wire_id], tname[wire_id])) + w.Write( + "void %s(int field_id, %s val) final;" % (fname[wire_id], tname[wire_id]) + ) def WriteFieldParserDefinition(self, w, wire_id, fields): - fname = {0: 'SetVarInt', 1: 'SetInt64', 2: 'SetString', 5: 'SetInt32'} + fname = {0: "SetVarInt", 1: "SetInt64", 2: "SetString", 5: "SetInt32"} tname = { - 0: 'std::uint64_t', - 1: 'std::uint64_t', - 2: 'std::string_view', - 5: 'std::uint32_t' + 0: "std::uint64_t", + 1: "std::uint64_t", + 2: "std::string_view", + 5: "std::uint32_t", } - w.Write('inline void %s::%s(int field_id, %s val) {' % - (self.GetFullName(), fname[wire_id], tname[wire_id])) + w.Write( + "inline void %s::%s(int field_id, %s val) {" + % (self.GetFullName(), fname[wire_id], tname[wire_id]) + ) w.Indent() - w.Write('switch (field_id) {') + w.Write("switch (field_id) {") w.Indent() for field in fields: field.GenerateCaseClause(w) w.Unindent() - w.Write('}') + w.Write("}") w.Unindent() - w.Write('}') + w.Write("}") def GenerateUsingDirectives(self, w): - w.Write('using %s = %s;' % (self.name, self.GetFullName())) + w.Write("using %s = %s;" % (self.name, self.GetFullName())) def GenerateMessageDeclarations(self, w): - w.Write(f'class %s;' % self.GetFullName()) + w.Write(f"class %s;" % self.GetFullName()) for x in self.types: x.GenerateMessageDeclarations(w) @@ -699,42 +769,41 @@ def GenerateEnumDefinitions(self, w): def GenerateMessageDefinitions(self, w): # Writing nested messages. for x in self.types: - if x.GetType() == 'message': + if x.GetType() == "message": x.GenerateMessageDefinitions(w) # Protobuf message is a C++ class. - w.Write('class %s final : public lczero::ProtoMessage {' % - self.GetFullName()) - w.Write(' public:') + w.Write("class %s final : public lczero::ProtoMessage {" % self.GetFullName()) + w.Write(" public:") w.Indent() # Writing using directives. for x in self.types: x.GenerateUsingDirectives(w) # Writing function declarations. for x in self.fields: - w.Write('') + w.Write("") x.GenerateFunctionDeclarations(w) - w.Write('') - w.Write('std::string OutputAsString() const final;') - w.Write('std::string OutputAsJson() const final;') - w.Write('void Clear() final;') + w.Write("") + w.Write("std::string OutputAsString() const final;") + w.Write("std::string OutputAsJson() const final;") + w.Write("void Clear() final;") w.Unindent() - w.Write('') - w.Write(' private:') + w.Write("") + w.Write(" private:") w.Indent() for k, v in self.GetFieldsGruppedByWireType().items(): self.WriteFieldParserDeclaration(w, k, v) - w.Write('') + w.Write("") for x in self.fields: x.GenerateVariable(w) w.Unindent() - w.Write('};') - w.Write('') + w.Write("};") + w.Write("") def GenerateFunctionDefinitions(self, w): # Writing nested messages. 
for x in self.types: - if x.GetType() == 'message': + if x.GetType() == "message": x.GenerateFunctionDefinitions(w) self.GenerateOutputAsStringFunc(w) self.GenerateOutputAsJsonFunc(w) @@ -743,37 +812,35 @@ def GenerateFunctionDefinitions(self, w): self.GenerateFieldAccessorFuncs(w) def GenerateOutputAsStringFunc(self, w): - w.Write('inline std::string %s::OutputAsString() const {' % - self.GetFullName()) + w.Write("inline std::string %s::OutputAsString() const {" % self.GetFullName()) w.Indent() - w.Write('std::string out;') + w.Write("std::string out;") for x in sorted(self.fields, key=lambda x: x.number): x.GenerateOutput(w) - w.Write('return out;') + w.Write("return out;") w.Unindent() - w.Write('}') + w.Write("}") def GenerateOutputAsJsonFunc(self, w): - w.Write('inline std::string %s::OutputAsJson() const {' % - self.GetFullName()) + w.Write("inline std::string %s::OutputAsJson() const {" % self.GetFullName()) w.Indent() if self.fields: - w.Write('bool first = true;') + w.Write("bool first = true;") w.Write('std::string out = "{";') for x in self.fields: x.GenerateJsonOutput(w) w.Write('out += "}";') - w.Write('return out;') + w.Write("return out;") w.Unindent() - w.Write('}') + w.Write("}") def GenerateClearFunc(self, w): - w.Write('inline void %s::Clear() {' % self.GetFullName()) + w.Write("inline void %s::Clear() {" % self.GetFullName()) w.Indent() for x in self.fields: x.GenerateClear(w) w.Unindent() - w.Write('}') + w.Write("}") def GenerateParserFuncs(self, w): for k, v in self.GetFieldsGruppedByWireType().items(): @@ -785,38 +852,38 @@ def GenerateFieldAccessorFuncs(self, w): class ProtoFileParser: - '''Root grammar of .proto file''' + """Root grammar of .proto file""" def __init__(self, lexer): self.package = None self.types = [] while True: token, match = lexer.Pick() - if token == 'EOF': + if token == "EOF": return - elif token == 'syntax': + elif token == "syntax": self.ParseSyntax(lexer) - elif token == 'package': + elif token == "package": self.ParsePackage(lexer) - elif token == 'message': + elif token == "message": self.ParseMessage(lexer) - elif token == 'enum': + elif token == "enum": self.ParseEnum(lexer) else: - lexer.Error('Expected message or something similar') + lexer.Error("Expected message or something similar") def ParseSyntax(self, lexer): - lexer.Consume('syntax') - lexer.Consume('=') - lexer.Consume('string', 'proto2', 1) - lexer.Consume(';') + lexer.Consume("syntax") + lexer.Consume("=") + lexer.Consume("string", "proto2", 1) + lexer.Consume(";") def ParsePackage(self, lexer): - lexer.Consume('package') + lexer.Consume("package") if self.package is not None: - lexer.Error('Package was already defined') + lexer.Error("Package was already defined") self.package = ReadIdentifierPath(lexer) - lexer.Consume(';') + lexer.Consume(";") def ParseMessage(self, lexer): ProtoMessageParser(lexer, [self.types], []) @@ -825,27 +892,27 @@ def ParseEnum(self, lexer): self.types.append(ProtoEnumParser(lexer, [])) def Generate(self, w): - w.Write('// This file is AUTOGENERATED, do not edit.') - w.Write('#pragma once') + w.Write("// This file is AUTOGENERATED, do not edit.") + w.Write("#pragma once") w.Write('#include "utils/protomessage.h"') for x in self.package: - w.Write('namespace %s {' % x) - w.Write('') - w.Write('// Forward declarations.') + w.Write("namespace %s {" % x) + w.Write("") + w.Write("// Forward declarations.") for object in self.types: object.GenerateMessageDeclarations(w) for object in self.types: object.GenerateEnumDefinitions(w) - w.Write('') - 
w.Write('// Class declarations.') + w.Write("") + w.Write("// Class declarations.") for object in self.types: object.GenerateMessageDefinitions(w) - w.Write('') - w.Write('// Function definitions.') + w.Write("") + w.Write("// Function definitions.") for object in self.types: object.GenerateFunctionDefinitions(w) for x in reversed(self.package): - w.Write('} // namespace %s' % x) + w.Write("} // namespace %s" % x) def ResolveForwardDeclarations(self): type_stack = [self.types] @@ -854,7 +921,7 @@ def ResolveForwardDeclarations(self): class Writer: - '''A helper class for writing file line by line with indent.''' + """A helper class for writing file line by line with indent.""" def __init__(self, fo): self.fo = fo @@ -868,26 +935,26 @@ def Unindent(self): def Write(self, text): if text: - self.fo.write(' ' * self.indent + text + '\n') + self.fo.write(" " * self.indent + text + "\n") else: - self.fo.write('\n') + self.fo.write("\n") if __name__ == "__main__": # Have the same flags as protoc has. parser = argparse.ArgumentParser(description="Compile protobuf files.") - parser.add_argument('input', type=str) - parser.add_argument('--proto_path', type=str) - parser.add_argument('--cpp_out', type=str) + parser.add_argument("input", type=str) + parser.add_argument("--proto_path", type=str) + parser.add_argument("--cpp_out", type=str) args = parser.parse_args() rel_path = os.path.relpath(args.input, args.proto_path) - dest_name = os.path.splitext(rel_path)[0] + '.pb.h' + dest_name = os.path.splitext(rel_path)[0] + ".pb.h" dest_path = os.path.join(args.cpp_out, dest_name) dest_dir = os.path.dirname(dest_path) os.makedirs(dest_dir, exist_ok=True) - with open(args.input, 'r') as input, open(dest_path, 'w') as output: + with open(args.input, "r") as input, open(dest_path, "w") as output: proto_file = ProtoFileParser(Lexer(input.read())) proto_file.ResolveForwardDeclarations() writer = Writer(output) diff --git a/scripts/sycl_build_hack.py b/scripts/sycl_build_hack.py index 14edff6ded..e7e3478875 100644 --- a/scripts/sycl_build_hack.py +++ b/scripts/sycl_build_hack.py @@ -12,12 +12,12 @@ link_flag = False for line in lines: - # Replace xilink with icx -fsycl as the linker. + # Replace xilink with icx as the linker. 
if not link_flag: link_flag = 'xilink.exe' in line if link_flag: line = line.replace('xilink.exe', 'icx') - line = line.replace('/MACHINE:x64', '-fsycl') + line = line.replace('/MACHINE:x64', '') line = line.replace('/OUT:', '-o ') line = line.replace('/SUBSYSTEM:CONSOLE', '') line = line.replace('/OPT:REF', '') diff --git a/src/chess/board.cc b/src/chess/board.cc index 8d171141b3..59bc0c39cd 100644 --- a/src/chess/board.cc +++ b/src/chess/board.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include "utils/exception.h" @@ -573,8 +574,36 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const { return result; } // namespace lczero +bool ChessBoard::IsValid() const { + const auto all = ours() | theirs(); + auto check = all | pawns() | bishops() | rooks() | queens() | kings(); + if (check != all || + (pawns() & bishops()).as_int() || + (pawns() & rooks()).as_int() || + (pawns() & queens()).as_int() || + (pawns() & kings()).as_int() || + (bishops() & rooks()).as_int() || + (bishops() & queens()).as_int() || + (bishops() & kings()).as_int() || + (rooks() & queens()).as_int() || + (rooks() & kings()).as_int() || + (queens() & kings()).as_int()) { + return false; + } + return true; +} + bool ChessBoard::ApplyMove(Move move) { assert(our_pieces_.intersects(BitBoard::FromSquare(move.from()))); +#ifndef NDEBUG + absl::Cleanup validate = [&] { + if (!IsValid()) { + CERR << "Move " + move.ToString(true) + + " resulted in invalid board: " + DebugString(); + assert(false); + } + }; +#endif const Square& from = move.from(); const Square& to = move.to(); const Rank from_rank = from.rank(); @@ -1113,7 +1142,9 @@ bool ChessBoard::HasMatingMaterial() const { } std::string ChessBoard::DebugString() const { - return "https://lc0.org/fen/" + BoardToFen(*this); + auto fen = BoardToFen(*this); + std::replace(fen.begin(), fen.end(), ' ', '_'); + return "https://lc0.org/fen/" + fen; } Move ChessBoard::ParseMove(std::string_view move_str) const { @@ -1160,7 +1191,7 @@ Move ChessBoard::ParseMove(std::string_view move_str) const { // Qeenside castling. return Move::WhiteCastling(from.file(), kFileA); } - if (from.file() != to.file() && pawns_.get(from) && !their_pieces_.get(to)) { + if (from.file() != to.file() && pawns().get(from) && !their_pieces_.get(to)) { // En passant. return Move::WhiteEnPassant(from, to); } diff --git a/src/chess/board.h b/src/chess/board.h index 4d2dbe17e5..d455fcb69d 100644 --- a/src/chess/board.h +++ b/src/chess/board.h @@ -231,6 +231,8 @@ class ChessBoard { private: // Sets the piece on the square. void PutPiece(Square square, PieceType piece, bool is_theirs); + // Check internal state is consistent after state transformations. + bool IsValid() const; // All white pieces. BitBoard our_pieces_; diff --git a/src/chess/board_test.cc b/src/chess/board_test.cc index 40621be621..eef6247d1d 100644 --- a/src/chess/board_test.cc +++ b/src/chess/board_test.cc @@ -2236,6 +2236,20 @@ TEST(ChessBoard, InvalidEnPassantFromKnightPromotion) { EXPECT_TRUE(board.en_passant().empty()); } +// Move from an en-passant flag square was mistakenly marked as en-passant. 
+TEST(ChessBoard, QueenMoveFromEnPassantFlagBug) { + ChessBoard board; + board.SetFromFen("1Qnkr3/1p1b4/p2P2p1/P1q5/1NP3pP/1KN5/8/3R4 b - - 0 32"); + board.ApplyMove(board.ParseMove("b7b5")); + board.Mirror(); + auto m = board.ParseMove("b8c7"); + EXPECT_FALSE(m.is_en_passant()); + board.ApplyMove(m); + board.Mirror(); + MoveList legal_moves = {board.ParseMove("c5c7")}; + EXPECT_EQ(board.GenerateLegalMoves(), legal_moves); +} + } // namespace lczero int main(int argc, char** argv) { diff --git a/src/chess/callbacks.h b/src/chess/callbacks.h index 63cd7b88b4..4205e2441a 100644 --- a/src/chess/callbacks.h +++ b/src/chess/callbacks.h @@ -66,6 +66,8 @@ struct ThinkingInfo { int64_t nodes = -1; // Nodes per second. int nps = -1; + // Evaluations per second. + int eps = -1; // Hash fullness * 1000 int hashfull = -1; // Moves to mate. diff --git a/src/chess/pgn.h b/src/chess/pgn.h index 4025398a57..dd50ab9c98 100644 --- a/src/chess/pgn.h +++ b/src/chess/pgn.h @@ -319,7 +319,7 @@ class PgnReader { std::optional enpassant = std::nullopt; if (!board.en_passant().empty()) { auto sq = *board.en_passant().begin(); - enpassant = Square(sq.file(), kRank6); + enpassant = Square(sq.file(), board.flipped() ? kRank3 : kRank6); } Square from(File::FromIdx(c1), Rank::FromIdx(r1)); Square to(File::FromIdx(c2), Rank::FromIdx(r2)); diff --git a/src/chess/uciloop.cc b/src/chess/uciloop.cc index a033e4cc1b..398a8bd7bd 100644 --- a/src/chess/uciloop.cc +++ b/src/chess/uciloop.cc @@ -54,6 +54,11 @@ const OptionId kShowWDL{{.long_flag = "show-wdl", .uci_option = "UCI_ShowWDL", .help_text = "Show win, draw and lose probability.", .visibility = OptionId::kAlwaysVisible}}; +const OptionId kShowEPS{ + {.long_flag = "show-eps", + .uci_option = "UCI_ShowEPS", + .help_text = "Show neural network evaluations per second.", + .visibility = OptionId::kAlwaysVisible}}; const OptionId kShowMovesleft{{.long_flag = "show-movesleft", .uci_option = "UCI_ShowMovesLeft", .help_text = "Show estimated moves left.", @@ -63,7 +68,7 @@ const std::unordered_map> kKnownCommands = { {{"uci"}, {}}, {{"isready"}, {}}, - {{"setoption"}, {"context", "name", "value"}}, + {{"setoption"}, {"name", "value"}}, {{"ucinewgame"}, {}}, {{"position"}, {"fen", "startpos", "moves"}}, {{"go"}, @@ -94,6 +99,26 @@ ParseCommand(const std::string& line) { throw Exception("Unknown command: " + line); } + // Special parsing for setoption to keep strings unmodified. + if (command->first == "setoption") { + iss >> token; + if (token != "name") { + throw Exception("setoption must be followed by name"); + } + int name_pos = iss.eof() ? line.length() : static_cast(iss.tellg()); + std::optional value_pos; + while (iss >> token) { + if (token == "value") { + value_pos = iss.eof() ? line.length() : static_cast(iss.tellg()); + params["value"] = Trim(line.substr(*value_pos)); + break; + } + } + params["name"] = Trim(line.substr( + name_pos, value_pos ? 
*value_pos - name_pos - 5 : std::string::npos)); + return {"setoption", params}; + } + std::string whitespace; while (iss >> token) { auto iter = command->second.find(token); @@ -139,7 +164,7 @@ int GetNumeric(const std::unordered_map& params, bool ContainsKey(const std::unordered_map& params, const std::string& key) { - return params.find(key) != params.end(); + return params.contains(key); } } // namespace @@ -164,9 +189,12 @@ bool UciLoop::DispatchCommand( engine_->EnsureReady(); uci_responder_->SendRawResponse("readyok"); } else if (command == "setoption") { - options_->SetUciOption(GetOrEmpty(params, "name"), - GetOrEmpty(params, "value"), - GetOrEmpty(params, "context")); + if (GetOrEmpty(params, "name").empty()) { + throw Exception("setoption requires name"); + } else { + options_->SetUciOption(GetOrEmpty(params, "name"), + GetOrEmpty(params, "value")); + } } else if (command == "ucinewgame") { engine_->NewGame(); } else if (command == "position") { @@ -235,7 +263,8 @@ bool UciLoop::ProcessLine(const std::string& line) { void StringUciResponder::PopulateParams(OptionsParser* options) { options->Add(kUciChess960) = false; - options->Add(kShowWDL) = true; + options->Add(kShowWDL) = false; + options->Add(kShowEPS) = false; options->Add(kShowMovesleft) = false; options_ = &options->GetOptionsDict(); } @@ -289,6 +318,9 @@ void StringUciResponder::OutputThinkingInfo(std::vector* infos) { } if (info.hashfull >= 0) res += " hashfull " + std::to_string(info.hashfull); if (info.nps >= 0) res += " nps " + std::to_string(info.nps); + if (info.eps >= 0 && options_ && options_->Get(kShowEPS)) { + res += " eps " + std::to_string(info.eps); + } if (info.tb_hits >= 0) res += " tbhits " + std::to_string(info.tb_hits); if (info.multipv >= 0) res += " multipv " + std::to_string(info.multipv); diff --git a/src/engine.cc b/src/engine.cc index d76e630086..c4c487c020 100644 --- a/src/engine.cc +++ b/src/engine.cc @@ -162,6 +162,7 @@ void Engine::EnsureSearchStopped() { } void Engine::UpdateBackendConfig() { + LOGFILE << "Update backend configuration."; const std::string backend_name = options_.Get(SharedBackendParams::kBackendId); if (!backend_ || backend_name != backend_name_ || @@ -182,6 +183,7 @@ void Engine::EnsureSyzygyTablebasesLoaded() { previous_tb_paths_ = tb_paths; if (tb_paths.empty()) { + LOGFILE << "Reset Syzygy tablebases."; syzygy_tb_.reset(); } else { syzygy_tb_ = std::make_unique(); @@ -198,6 +200,7 @@ void Engine::EnsureSyzygyTablebasesLoaded() { // Initializes the search with either the specified position for the normal // search or the position one ply trimmed for the ponder search. 
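// A rough standalone sketch (not the patch's code) of the splitting rule the
// new setoption branch in uciloop.cc above implements: everything between the
// "name" and "value" tokens is the option name, everything after "value" is
// the value, and internal whitespace is preserved. The helper name below is
// invented, and a simple substring search stands in for the token-by-token
// scan the patch uses (same result unless the name itself contains " value ").
#include <iostream>
#include <string>
#include <utility>

std::pair<std::string, std::string> SplitSetOption(const std::string& line) {
  auto trim = [](std::string s) {
    const char* ws = " \t";
    size_t b = s.find_first_not_of(ws);
    if (b == std::string::npos) return std::string();
    size_t e = s.find_last_not_of(ws);
    return s.substr(b, e - b + 1);
  };
  const std::string kName = " name ";
  const std::string kValue = " value ";
  size_t name_pos = line.find(kName);
  if (name_pos == std::string::npos) return {"", ""};
  name_pos += kName.size();
  size_t value_pos = line.find(kValue, name_pos);
  if (value_pos == std::string::npos) return {trim(line.substr(name_pos)), ""};
  return {trim(line.substr(name_pos, value_pos - name_pos)),
          trim(line.substr(value_pos + kValue.size()))};
}

int main() {
  auto [name, value] =
      SplitSetOption("setoption name SyzygyPath value /tb/wdl:/tb/dtz");
  // Prints: name=[SyzygyPath] value=[/tb/wdl:/tb/dtz]
  std::cout << "name=[" << name << "] value=[" << value << "]\n";
}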
void Engine::InitializeSearchPosition(bool for_ponder) { + LOGFILE << "Setting a new search position."; assert(last_position_); if (!for_ponder) { search_->SetPosition(*last_position_); @@ -217,7 +220,8 @@ void Engine::SetPosition(const std::string& fen, EnsureSearchStopped(); ponder_enabled_ = options_.Get(kPonderId); strict_uci_timing_ = options_.Get(kStrictUciTiming); - if (!strict_uci_timing_) search_->StartClock(); + isready_seen_ = false; + search_->StartClock(); UpdateBackendConfig(); EnsureSyzygyTablebasesLoaded(); last_position_ = MakeGameState(fen, moves); @@ -235,13 +239,18 @@ void Engine::Go(const GoParams& params) { throw Exception( "Ponder is not enabled, but the ponder search is requested."); } - if (strict_uci_timing_) search_->StartClock(); + if ((strict_uci_timing_ && isready_seen_) || + !(params.wtime || params.btime)) { + search_->StartClock(); + } if (!last_position_) NewGame(); if (ponder_enabled_) InitializeSearchPosition(params.ponder); last_go_params_ = params; search_->StartSearch(params); } +void Engine::EnsureReady() { isready_seen_ = true; } + void Engine::Wait() { search_->WaitSearch(); } void Engine::Stop() { search_->StopSearch(); } diff --git a/src/engine.h b/src/engine.h index 80b593301c..e50d661393 100644 --- a/src/engine.h +++ b/src/engine.h @@ -44,7 +44,7 @@ class Engine : public EngineControllerBase { static void PopulateOptions(OptionsParser*); - void EnsureReady() override {}; + void EnsureReady() override; void NewGame() override; void SetPosition(const std::string& fen, const std::vector& moves) override; @@ -74,8 +74,10 @@ class Engine : public EngineControllerBase { std::unique_ptr syzygy_tb_; // absl_nullable // UCI parameters cache to be consistent between `position` and `go`. + // Defaults ensure corect operation even if `go` comes first. bool ponder_enabled_ = false; - bool strict_uci_timing_ = false; + bool strict_uci_timing_ = true; + bool isready_seen_ = true; // Last position set for the search. Used to: // 1. Detect whether the position was ever set (to initialize to startpos). // 2. Remember the position for ponder go (removing the last ply). diff --git a/src/main.cc b/src/main.cc index 78415a3a33..dc83a199e8 100644 --- a/src/main.cc +++ b/src/main.cc @@ -26,6 +26,7 @@ */ #include "chess/board.h" +#include "default_search.h" #include "engine.h" #include "search/register.h" #include "selfplay/loop.h" @@ -37,6 +38,7 @@ #include "utils/commandline.h" #include "utils/esc_codes.h" #include "utils/logging.h" +#include "utils/trace.h" #include "version.h" namespace lczero { @@ -52,14 +54,9 @@ void ChooseAndRunEngine() { // Then if DEFAULT_SEARCH is defined, run the engine specified by it. 
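// Note on the change below: because the STRINGIFY helpers are removed and
// DEFAULT_SEARCH is concatenated directly into a string literal, the macro now
// has to be defined as a quoted string by the build system, whereas the old
// code stringified a bare token. A sketch of the difference, using a
// hypothetical "classic" search name purely for illustration:
//
//   Old style (bare token, stringified in code):
//     -DDEFAULT_SEARCH=classic
//   New style (the define itself carries the quotes), e.g. with meson:
//     add_project_arguments('-DDEFAULT_SEARCH="classic"', language: 'cpp')
//
// With a quoted define, the concatenation below is plain C++:
//   throw Exception("Unknown search algorithm: " DEFAULT_SEARCH);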
#ifdef DEFAULT_SEARCH -#define STRINGIFY_INTERNAL(x) #x -#define STRINGIFY(x) STRINGIFY_INTERNAL(x) SearchFactory* factory = - SearchManager::Get()->GetFactoryByName(STRINGIFY(DEFAULT_SEARCH)); - if (!factory) - throw Exception("Unknown search algorithm: " STRINGIFY(DEFAULT_SEARCH)); -#undef STRINGIFY -#undef STRINGIFY_INTERNAL + SearchManager::Get()->GetFactoryByName(DEFAULT_SEARCH); + if (!factory) throw Exception("Unknown search algorithm: " DEFAULT_SEARCH); RunEngine(factory); return; #endif @@ -80,6 +77,7 @@ void ChooseAndRunEngine() { } // namespace lczero int main(int argc, const char** argv) { + LCTRACE_INITIALIZE; using namespace lczero; EscCodes::Init(); LOGFILE << "Lc0 started."; diff --git a/src/neural/backends/blas/blas.h b/src/neural/backends/blas/blas.h index 7001be64d7..a9018c71d2 100644 --- a/src/neural/backends/blas/blas.h +++ b/src/neural/backends/blas/blas.h @@ -18,6 +18,13 @@ #pragma once +// clang-format off +// math.h include is workaround for Eigen trying to use math functions from global +// namespaces. math.h must be included before Eigen/Core. +#include +#include +// clang-format on + // Select the BLAS vendor based on defines #ifdef USE_MKL diff --git a/src/neural/backends/blas/convolution1.cc b/src/neural/backends/blas/convolution1.cc index 8674b06dcf..1da550cb5b 100644 --- a/src/neural/backends/blas/convolution1.cc +++ b/src/neural/backends/blas/convolution1.cc @@ -19,8 +19,6 @@ #include "neural/backends/blas/convolution1.h" #include "neural/backends/blas/blas.h" -#include - namespace lczero { template using EigenMatrixMap = diff --git a/src/neural/backends/blas/fully_connected_layer.cc b/src/neural/backends/blas/fully_connected_layer.cc index 84699a3ec2..d0736c1eb3 100644 --- a/src/neural/backends/blas/fully_connected_layer.cc +++ b/src/neural/backends/blas/fully_connected_layer.cc @@ -23,8 +23,6 @@ #include #include -#include - namespace lczero { namespace { void ApplyBias(size_t batch_size, const size_t output_size, const float* biases, diff --git a/src/neural/backends/blas/network_blas.cc b/src/neural/backends/blas/network_blas.cc index 71c561ca12..c91c5c44f5 100644 --- a/src/neural/backends/blas/network_blas.cc +++ b/src/neural/backends/blas/network_blas.cc @@ -16,7 +16,6 @@ along with Leela Chess. If not, see . 
*/ -#include #include #include #include @@ -70,7 +69,7 @@ class BlasComputation : public NetworkComputation { const ActivationFunction smolgen_activation, const ActivationFunction ffn_activation, const bool attn_policy, const bool attn_body, - bool is_pe_dense_embedding); + bool is_pe_dense_embedding, int threads); virtual ~BlasComputation() {} @@ -157,13 +156,14 @@ template class BlasNetwork : public Network { public: BlasNetwork(const WeightsFile& weights, const OptionsDict& options); - virtual ~BlasNetwork(){}; + virtual ~BlasNetwork() {}; std::unique_ptr NewComputation() override { return std::make_unique>( this, weights_, policy_head_, value_head_, max_batch_size_, wdl_, moves_left_, conv_policy_, default_activation_, smolgen_activation_, - ffn_activation_, attn_policy_, attn_body_, is_pe_dense_embedding_); + ffn_activation_, attn_policy_, attn_body_, is_pe_dense_embedding_, + threads_); } const NetworkCapabilities& GetCapabilities() const override { @@ -199,15 +199,16 @@ class BlasNetwork : public Network { const NetworkCapabilities capabilities_; MultiHeadWeights weights_; size_t max_batch_size_; + int threads_; bool wdl_; bool moves_left_; bool conv_policy_; bool attn_policy_; bool attn_body_; bool is_pe_dense_embedding_; - ActivationFunction default_activation_; - ActivationFunction smolgen_activation_; - ActivationFunction ffn_activation_; + ActivationFunction default_activation_ = ACTIVATION_NONE; + ActivationFunction smolgen_activation_ = ACTIVATION_NONE; + ActivationFunction ffn_activation_ = ACTIVATION_NONE; std::string policy_head_; std::string value_head_; std::mutex buffers_lock_; @@ -222,7 +223,8 @@ BlasComputation::BlasComputation( const bool conv_policy, const ActivationFunction default_activation, const ActivationFunction smolgen_activation, const ActivationFunction ffn_activation, const bool attn_policy, - const bool attn_body, bool is_pe_dense_embedding) + const bool attn_body, bool is_pe_dense_embedding, + [[maybe_unused]] int threads) : weights_(weights), max_batch_size_(max_batch_size), policies_(0), @@ -240,7 +242,7 @@ BlasComputation::BlasComputation( value_head_(value_head), network_(network) { #ifdef USE_DNNL - omp_set_num_threads(1); + omp_set_num_threads(threads); #endif } @@ -989,6 +991,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, max_batch_size_ = static_cast(options.GetOrDefault("batch_size", 256)); + threads_ = options.GetOrDefault("threads", 1); auto nf = file.format().network_format(); using NF = pblczero::NetworkFormat; @@ -1075,7 +1078,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, } else { #ifdef USE_OPENBLAS int num_procs = openblas_get_num_procs(); - openblas_set_num_threads(1); + openblas_set_num_threads(threads_); const char* core_name = openblas_get_corename(); const char* config = openblas_get_config(); CERR << "BLAS vendor: OpenBLAS."; @@ -1084,7 +1087,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, #endif #ifdef USE_MKL - mkl_set_num_threads(1); + mkl_set_num_threads(threads_); CERR << "BLAS vendor: MKL."; constexpr int len = 256; char versionbuf[len]; diff --git a/src/neural/backends/blas/winograd_convolution3.cc b/src/neural/backends/blas/winograd_convolution3.cc index 31f00b50df..c1687aebe6 100644 --- a/src/neural/backends/blas/winograd_convolution3.cc +++ b/src/neural/backends/blas/winograd_convolution3.cc @@ -29,8 +29,6 @@ #include "winograd_transform_ispc.h" #endif -#include - namespace lczero { template using EigenMatrixMap = diff --git a/src/neural/backends/cuda/common_kernels.cu 
b/src/neural/backends/cuda/common_kernels.cu index ea8801ec2f..bab99ce4cf 100644 --- a/src/neural/backends/cuda/common_kernels.cu +++ b/src/neural/backends/cuda/common_kernels.cu @@ -31,6 +31,7 @@ #include "cuda_common.h" #include "neural/tables/activation_function.h" #include "neural/tables/attention_policy_map.h" +#include "utils/exception.h" #include "winograd_helper.inc" namespace lczero { @@ -381,12 +382,13 @@ __global__ void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor, template void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor, - int Nin, int Cin, int Nout, int Cout, int H, int W) { + int Nin, int Cin, int Nout, int Cout, int H, int W, + cudaStream_t stream) { size_t numElements = Nout * Cout * H * W; const int blockSize = 256; int blocks = DivUp(numElements, blockSize); - NCHWtoNHWC_kernel<<>>(output_tensor, input_tensor, Nin, - Cin, Nout, Cout, H, W); + NCHWtoNHWC_kernel<<>>( + output_tensor, input_tensor, Nin, Cin, Nout, Cout, H, W); } template @@ -437,65 +439,20 @@ __global__ void batchNorm_kernel(T* output, const T* input, const T* skipInput, template void batchNorm(T* output, const T* input, const T* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation) { + ActivationFunction activation, cudaStream_t stream) { const int total_elements = N * C * H * W; const int kBlockSize = 256; int blocks = DivUp(total_elements, kBlockSize); - batchNorm_kernel<<>>(output, input, skipInput, N, C, H, W, - means, var_multipliers, activation); + batchNorm_kernel<<>>( + output, input, skipInput, N, C, H, W, means, var_multipliers, activation); ReportCUDAErrors(cudaGetLastError()); } -__global__ void expandPlanes_kernel_Fp32_NCHW(float* output, - const uint64_t* masks, - const float* values, int n) { - // Block size of 256, same mask/val for 64 consecutive threads. - constexpr int kNumShmemElements = 256 / 64; - - __shared__ uint64_t shMasks[kNumShmemElements]; - __shared__ float shVals[kNumShmemElements]; - - int index = threadIdx.x + blockDim.x * blockIdx.x; - - int planeIndex = index >> 6; - - if (planeIndex >= n) return; - - // Load inputs to shared memory. - if (threadIdx.x < kNumShmemElements) { - shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; - shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; - } - __syncthreads(); - - uint64_t mask = shMasks[threadIdx.x >> 6]; - - int sqIndex = index & 0x3F; - float op = 0; - - bool set = !!(mask & (1ull << sqIndex)); - if (set) { - op = shVals[threadIdx.x >> 6]; - } - output[index] = op; -} - -void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream) { - int threads = n * 8 * 8; // Each thread writes a single element. - const int blockSize = 256; - int blocks = DivUp(threads, blockSize); - expandPlanes_kernel_Fp32_NCHW<<>>(output, masks, - values, n); - ReportCUDAErrors(cudaGetLastError()); -} - -// TODO: Can optimize using shared memory if this becomes a bottleneck. 
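// A CPU reference (illustrative only, not code from the patch) for what the
// newly templated expandPlanes kernels below compute: each input plane is a
// 64-bit occupancy mask plus one scalar, and square sq of the expanded plane
// receives that scalar when bit sq is set, otherwise 0. The NCHW variant lays
// planes out as contiguous blocks of 64 values; the function name here is
// invented for the sketch.
#include <cstdint>
#include <vector>

std::vector<float> ExpandPlanesNCHWReference(const std::vector<uint64_t>& masks,
                                             const std::vector<float>& values) {
  std::vector<float> out(masks.size() * 64, 0.0f);
  for (size_t plane = 0; plane < masks.size(); ++plane) {
    for (int sq = 0; sq < 64; ++sq) {
      // Same predicate as the kernels: mask & (1ull << sqIndex).
      if (masks[plane] & (1ULL << sq)) out[plane * 64 + sq] = values[plane];
    }
  }
  return out;
}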
-__global__ void expandPlanes_kernel_Fp16_NHWC(half* output, - const uint64_t* masks, - const float* values, int n) { +template +__global__ void expandPlanes_kernel_NHWC(T* output, const uint64_t* masks, + const T* values, int n) { const int index = threadIdx.x + blockDim.x * blockIdx.x; if (index >= n * 8 * 8) return; @@ -505,66 +462,61 @@ __global__ void expandPlanes_kernel_Fp16_NHWC(half* output, uint64_t mask = masks[boardIndex * kInputPlanes + planeIndex]; - half op = 0; + T op = 0; bool set = !!(mask & (1ull << sqIndex)); if (set) { - float val = values[boardIndex * kInputPlanes + planeIndex]; - op = (half)val; + op = values[boardIndex * kInputPlanes + planeIndex]; } output[index] = op; } -void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream) { +template +void expandPlanes_NHWC(T* output, const uint64_t* masks, const T* values, int n, + cudaStream_t stream) { int threads = n * 8 * 8; // Each thread writes a single element. const int kBlockSize = 256; int blocks = DivUp(threads, kBlockSize); - expandPlanes_kernel_Fp16_NHWC<<>>( - output, masks, values, n); + expandPlanes_kernel_NHWC<<>>(output, masks, + values, n); ReportCUDAErrors(cudaGetLastError()); } -__global__ void expandPlanes_kernel_Fp16_NCHW(half* output, - const uint64_t* masks, - const float* values, int n) { - // block size of 256, same mask/val for 64 consecutive threads - constexpr int kNumShmemElements = 256 / 64; - - __shared__ uint64_t shMasks[kNumShmemElements]; - __shared__ half shVals[kNumShmemElements]; - - int index = threadIdx.x + blockDim.x * blockIdx.x; +template +__global__ void expandPlanes_kernel_NCHW(T* output, const uint64_t* masks, + const T* values, unsigned n) { + unsigned index = threadIdx.x + blockDim.x * blockIdx.x; - int planeIndex = index >> 6; + index *= 2; + unsigned planeIndex = index >> 6; if (planeIndex >= n) return; - // load inputs to shared memory - if (threadIdx.x < kNumShmemElements) { - shMasks[threadIdx.x] = masks[planeIndex + threadIdx.x]; - shVals[threadIdx.x] = values[planeIndex + threadIdx.x]; - } - __syncthreads(); - - uint64_t mask = shMasks[threadIdx.x >> 6]; + uint64_t mask = masks[planeIndex]; int sqIndex = index & 0x3F; - half op = 0; + T op[2] = {0, 0}; bool set = !!(mask & (1ull << sqIndex)); if (set) { - op = (half)shVals[threadIdx.x >> 6]; + op[0] = values[planeIndex]; } - output[index] = op; + sqIndex++; + set = !!(mask & (1ull << sqIndex)); + if (set) { + op[1] = values[planeIndex]; + } + output[index + 0] = op[0]; + output[index + 1] = op[1]; } -void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream) { - int threads = n * 8 * 8; // each thread writes a single element +template +void expandPlanes_NCHW(T* output, const uint64_t* masks, const T* values, + int n, cudaStream_t stream) { + unsigned threads = n * 8 * 8 / 2; // each thread writes two elements. 
const int blockSize = 256; - int blocks = DivUp(threads, blockSize); - expandPlanes_kernel_Fp16_NCHW<<>>(output, masks, - values, n); + unsigned blocks = DivUp(threads, blockSize); + expandPlanes_kernel_NCHW<<>>(output, masks, + values, n); ReportCUDAErrors(cudaGetLastError()); } @@ -704,14 +656,14 @@ __global__ void globalAvgPool_kernel(T* output, const T* input, template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias, bool nhwc) { + const T* prevLayerBias, bool nhwc, cudaStream_t stream) { const int kPlaneSize = 64; if (nhwc) { assert((std::is_same::value)); // For NHWC fp16, simply launch N blocks, each with C threads. - globalAvgPool_kernel_NHWC_fp16<<>>((half*)output, (half*)input, - (half*)prevLayerBias, - N * C * kPlaneSize, N * C); + globalAvgPool_kernel_NHWC_fp16<<>>( + (half*)output, (half*)input, (half*)prevLayerBias, N * C * kPlaneSize, + N * C); } else { // For NCHW layout (used with fp32), // each warp processes a full plane (64 elements), and writes a single @@ -722,8 +674,8 @@ void globalAvgPool(int N, int C, T* output, const T* input, const int kBlockSize = kWarpsPerBlock * 32; int blocks = DivUp(kTotalWarps, kWarpsPerBlock); - globalAvgPool_kernel<<>>(output, input, prevLayerBias, - N * C * kPlaneSize, N * C, C); + globalAvgPool_kernel<<>>( + output, input, prevLayerBias, N * C * kPlaneSize, N * C, C); } ReportCUDAErrors(cudaGetLastError()); } @@ -731,18 +683,18 @@ void globalAvgPool(int N, int C, T* output, const T* input, template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, const T* prevLayerBias, bool nhwc, - ActivationFunction activation) { + ActivationFunction activation, cudaStream_t stream) { // Each thread writes one output. const int kBlockSize = 256; const int kBlocks = DivUp(N * 8 * 8 * C, kBlockSize); if (nhwc) { assert((std::is_same::value)); - globalScale_kernel_fp16_nhwc<<>>( + globalScale_kernel_fp16_nhwc<<>>( (half*)output, (half*)input, (half*)scaleBias, (half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C, activation); } else { - globalScale_kernel<<>>( + globalScale_kernel<<>>( output, input, scaleBias, prevLayerBias, N * C * 8 * 8, C, activation); } ReportCUDAErrors(cudaGetLastError()); @@ -808,6 +760,15 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, ReportCUDAErrors(cudaGetLastError()); } +__device__ __forceinline__ float clamp(float val, float low, float high) { + if (__builtin_expect(isnan(val), 0)) return val; + return fminf(fmaxf(val, low), high); +} + +namespace { +constexpr float kTwiceHalfMax = 131008.0f; // Twice the max finite fp16 value. +} // namespace + // softmax along C dimension which is assumed to be 64 // each thread processes two elements. Each warp computes a sum (over 64 // elements) @@ -843,6 +804,11 @@ __global__ void softmax_opt_64_kernel(T* output, const T* input, x[0] += x[2]; x[1] += x[3]; } + if (fp16) { + // Guard against Inf from fp16 overflow. + x[0] = clamp(x[0], -kTwiceHalfMax, kTwiceHalfMax); + x[1] = clamp(x[1], -kTwiceHalfMax, kTwiceHalfMax); + } float threadMax = max(x[0], x[1]); float maxval = warpMax(threadMax); maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); @@ -884,6 +850,10 @@ __global__ void softmax_kernel(T* output, const T* input, const T* input2) { float x = (float)input[index]; if (input2 != nullptr) x += (float)input2[index]; + if (std::is_same::value) { + // Guard against Inf from fp16 overflow. 
+ x = clamp(x, -kTwiceHalfMax, kTwiceHalfMax); + } __shared__ float sum, maxval; if (c == 0) { @@ -1242,7 +1212,8 @@ __global__ void preprocess_for_attention_body_kernel( if (c >= input_size) { // concatenate from position encoding array if (is_pe_dense_embedding) { - op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + (c - input_size)]); + op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + + (c - input_size)]); } else { op = (T)(encoding[64 * hw + (c - input_size)]); } @@ -1309,6 +1280,64 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add, ReportCUDAErrors(cudaGetLastError()); } +template +__global__ void genOffsetPointers_kernel(T** offsets, int heads, int block_size, + int depth, int d_model, T* k, T* q, + T* b1, T* v, T* b2) { + const int i = (blockIdx.x * blockDim.x + threadIdx.x) * kWorkPerThread; + if (i >= block_size) return; + const int h = i % heads; + const int n = i / heads; + int w; + T* res[kWorkPerThread]; + for (w = 0; w < kWorkPerThread; w++) { + res[w] = k + h * depth + 64 * d_model * n + w * depth; + offsets[i + w] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = q + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b1 + i * 64 * 64 + w * 64 * 64; + offsets[i + w + 2 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = v + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 3 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b2 + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 4 * block_size] = res[w]; + } +} + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, T* v, T* b2, + cudaStream_t stream) { + const int block_size = heads * max_batch; + // Process two elements per thread to use 128 bit store instructions. + constexpr int kWorkPerThread = 2; + constexpr int kWorkGroupSize = 128; + if (block_size % kWorkPerThread != 0) { + // Handle odd block sizes. + int grid = DivUp(block_size, kWorkGroupSize); + genOffsetPointers_kernel<<>>( + offsets, heads, block_size, depth, d_model, k, q, b1, v, b2); + } else { + // Handle even block size + int grid = DivUp(block_size, kWorkGroupSize * kWorkPerThread); + genOffsetPointers_kernel + <<>>(offsets, heads, block_size, depth, + d_model, k, q, b1, v, b2); + } +} + // Template instantiation. 
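// Host-side illustration (not the device code above) of the clamp the patch
// adds to the softmax kernels above: NaN is deliberately passed through
// unchanged, and finite values are limited to +/-kTwiceHalfMax = 131008,
// i.e. twice 65504 (the largest finite half), which allows for the sum of two
// half-precision inputs computed just before the clamp.
#include <algorithm>
#include <cmath>

inline float ClampPreserveNan(float v, float lo, float hi) {
  if (std::isnan(v)) return v;  // Keep NaN so it stays detectable downstream.
  return std::min(std::max(v, lo), hi);
}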
template void copyTypeConverted(half* op, float* ip, int N, cudaStream_t stream); @@ -1322,11 +1351,13 @@ template void copyTypeConverted(half* op, half* ip, int N, template void batchNorm(float* output, const float* input, const float* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void batchNorm(half* output, const half* input, const half* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void addVectors(float* c, float* a, float* b, int size, int asize, int bsize, ActivationFunction act, @@ -1368,18 +1399,36 @@ template void addBias_NCHW(half* c, half* a, half* b, int N, int C, int H, template void globalAvgPool(int N, int C, float* output, const float* input, - const float* prevLayerBias, bool nhwc); + const float* prevLayerBias, bool nhwc, + cudaStream_t stream); template void globalAvgPool(int N, int C, half* output, const half* input, - const half* prevLayerBias, bool nhwc); + const half* prevLayerBias, bool nhwc, + cudaStream_t stream); + +template void expandPlanes_NHWC(float* output, const uint64_t* masks, + const float* values, int n, + cudaStream_t stream); +template void expandPlanes_NHWC(half* output, const uint64_t* masks, + const half* values, int n, + cudaStream_t stream); + +template void expandPlanes_NCHW(float* output, const uint64_t* masks, + const float* values, int n, + cudaStream_t stream); +template void expandPlanes_NCHW(half* output, const uint64_t* masks, + const half* values, int n, + cudaStream_t stream); template void globalScale(int N, int C, float* output, const float* input, const float* scaleBias, const float* prevLayerBias, bool nhwc, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void globalScale(int N, int C, half* output, const half* input, const half* scaleBias, const half* prevLayerBias, bool nhwc, - ActivationFunction activation); + ActivationFunction activation, + cudaStream_t stream); template void PolicyMap(int N, float* output, const float* input, const short* indices, int inputSize, @@ -1391,7 +1440,7 @@ template void PolicyMap(int N, half* output, const half* input, int outputSize, cudaStream_t stream); template void FilterTransform(int N, int C, float* transformedFilter, - const float* filter); + const float* filter, cudaStream_t stream); template void InputTransform(int N, int C, float* transformed_input, @@ -1566,15 +1615,16 @@ template void ComputePromotionLogits(int N, int C, float* output, template void convertNCHWtoNHWC(half* output_tensor, const float* input_tensor, int Nin, int Cin, int Nout, int Cout, int H, - int W); + int W, cudaStream_t stream); template void convertNCHWtoNHWC(float* output_tensor, const float* input_tensor, int Nin, int Cin, int Nout, - int Cout, int H, int W); + int Cout, int H, int W, + cudaStream_t stream); template void convertNCHWtoNHWC(half* output_tensor, const half* input_tensor, int Nin, int Cin, int Nout, int Cout, int H, - int W); + int W, cudaStream_t stream); template void inputPreprocessForAttentionBody( half* output, const half* input, const half* encoding, int N, @@ -1595,5 +1645,14 @@ template void applyInputGating(float* output, const float* input, const float* mult, const float* add, int N, int C, int output_size, cudaStream_t stream); + +template void genOffsetPointers(float** offsets, 
int heads, + int max_batch, int depth, int d_model, + float* k, float* q, float* b1, float* v, + float* b2, cudaStream_t stream); +template void genOffsetPointers(half** offsets, int heads, int max_batch, + int depth, int d_model, half* k, half* q, + half* b1, half* v, half* b2, + cudaStream_t stream); } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/cuda_common.h b/src/neural/backends/cuda/cuda_common.h index ca91f0e91b..1babb7e003 100644 --- a/src/neural/backends/cuda/cuda_common.h +++ b/src/neural/backends/cuda/cuda_common.h @@ -30,7 +30,7 @@ #include #include -#include "utils/exception.h" +#include "utils/fp16_utils.h" #ifdef USE_CUDNN #include diff --git a/src/neural/backends/cuda/cutlass_kernels.cu b/src/neural/backends/cuda/cutlass_kernels.cu new file mode 100644 index 0000000000..619c839f90 --- /dev/null +++ b/src/neural/backends/cuda/cutlass_kernels.cu @@ -0,0 +1,124 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2018 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. 
+*/ + +#include "neural/backends/cuda/cuda_common.h" + +// Fused MHA implementation from cutlass example #41 +#include "fused_multi_head_attention/kernel_forward.h" +#include "utils/exception.h" + +namespace lczero { +namespace cudnn_backend { + +template +void fusedMHACutlass(void* output, void* q, void* k, void* v, void* skip, + int batch_size, int num_heads, int depth, + cudaStream_t stream) { + cutlass::half_t* mha_q = (cutlass::half_t*)q; + cutlass::half_t* mha_k = (cutlass::half_t*)k; + cutlass::half_t* mha_v = (cutlass::half_t*)v; + + constexpr int kQueriesPerBlock = 64; + constexpr int kKeysPerBlock = 64; + constexpr bool kSingleValueIteration = true; + + using Attention = + AttentionKernel; + static_assert( + !Attention::kNeedsOutputAccumulatorBuffer, + "Unhandled case in cutlass MHA: needs output accumulator buffer"); + + typename Attention::Params p; + { // set parameters + p.query_ptr = mha_q; + p.key_ptr = mha_k; + p.value_ptr = mha_v; + p.logsumexp_ptr = nullptr; // Only needed for bw + p.output_accum_ptr = nullptr; + p.output_ptr = (cutlass::half_t*)output; + p.attn_bias_ptr = (cutlass::half_t*)skip; + + p.scale = 1.0f / sqrt((float)depth); + + p.num_heads = num_heads; + p.num_batches = batch_size; + p.head_dim = depth; + p.head_dim_value = depth; + p.num_queries = 64; + p.num_keys = 64; + + // All tensors are in BMHK shapes + p.q_strideH = depth; + p.k_strideH = depth; + p.v_strideH = depth; + p.q_strideM = depth * num_heads; + p.k_strideM = depth * num_heads; + p.v_strideM = depth * num_heads; + p.q_strideB = p.q_strideM * 64; + p.k_strideB = p.k_strideM * 64; + p.v_strideB = p.v_strideM * 64; + p.o_strideM = p.head_dim_value * p.num_heads; + + p.bias_strideH = 64 * 64; + p.bias_strideM = 64; + p.bias_strideB = num_heads * p.bias_strideH; + } + + constexpr auto kernel_fn = attention_kernel_batched_impl; + int smem_bytes = sizeof(typename Attention::SharedStorage); + if (smem_bytes > 0xc000) { + ReportCUDAErrors(cudaFuncSetAttribute( + kernel_fn, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes)); + } + if (!Attention::check_supported(p)) { + throw Exception("Unhandled case in cutlass MHA: check_supported failed."); + } + + kernel_fn<<>>(p); + + ReportCUDAErrors(cudaGetLastError()); +} + +void fusedMHA(void* output, void* mha_q, void* mha_k, void* mha_v, void* skip, + int batch_size, int num_heads, int depth, cudaStream_t stream) { + if (skip == nullptr) { + fusedMHACutlass(output, mha_q, mha_k, mha_v, skip, batch_size, + num_heads, depth, stream); + } else { + fusedMHACutlass(output, mha_q, mha_k, mha_v, skip, batch_size, + num_heads, depth, stream); + } +} + +} // namespace cudnn_backend +} // namespace lczero diff --git a/src/neural/backends/cuda/fp16_kernels.cu b/src/neural/backends/cuda/fp16_kernels.cu index 0d93ca6459..37827ba0eb 100644 --- a/src/neural/backends/cuda/fp16_kernels.cu +++ b/src/neural/backends/cuda/fp16_kernels.cu @@ -27,6 +27,7 @@ #include "cuda_common.h" #include "neural/tables/activation_function.h" +#include "utils/exception.h" // Allow building on an old architecture. 
#if __CUDA_ARCH__ < 530 @@ -137,61 +138,61 @@ __global__ void SE_Layer_NHWC(half* output, const half* skip, const half* input, bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip, const half* input, const half* w1, const half* b1, const half* w2, const half* b2, const half* bPrev, - ActivationFunction activation) { + ActivationFunction activation, cudaStream_t stream) { // TODO: Think of more elegant way to avoid this hardcoding :-/ if (numFc1Out == 16) { if (C == 64) { - SE_Layer_NHWC<64, 16> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<64, 16><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else { // TODO: support other channel counts. throw Exception("channel count unsupported by SE layer"); } } else if (numFc1Out == 32) { if (C == 64) { - SE_Layer_NHWC<64, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<64, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 128) { - SE_Layer_NHWC<128, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<128, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 192) { - SE_Layer_NHWC<192, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<192, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 256) { - SE_Layer_NHWC<256, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<256, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 320) { - SE_Layer_NHWC<320, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<320, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 352) { - SE_Layer_NHWC<352, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<352, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 384) { - SE_Layer_NHWC<384, 32> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<384, 32><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else { // TODO: support other channel counts. return false; } } else if (numFc1Out == 64) { if (C == 64) { - SE_Layer_NHWC<64, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<64, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 128) { - SE_Layer_NHWC<128, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<128, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 192) { - SE_Layer_NHWC<192, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<192, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 256) { - SE_Layer_NHWC<256, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<256, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 320) { - SE_Layer_NHWC<320, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<320, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else if (C == 384) { - SE_Layer_NHWC<384, 64> - <<>>(output, skip, input, w1, b1, w2, b2, bPrev, activation); + SE_Layer_NHWC<384, 64><<>>(output, skip, input, w1, b1, + w2, b2, bPrev, activation); } else { // TODO: support other channel counts. 
return false; @@ -474,7 +475,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, } template void FilterTransform(int N, int C, half* transformedFilter, - const half* filter); + const half* filter, cudaStream_t stream); template void InputTransform(int N, int C, half* transformed_input, const half* input, diff --git a/src/neural/backends/cuda/inputs_outputs.h b/src/neural/backends/cuda/inputs_outputs.h index 4c356994a8..89e728da84 100644 --- a/src/neural/backends/cuda/inputs_outputs.h +++ b/src/neural/backends/cuda/inputs_outputs.h @@ -27,75 +27,158 @@ #pragma once +#include +#include + +#include "cuda_common.h" #include "neural/network.h" +#include "utils/bit.h" namespace lczero { namespace cudnn_backend { +inline void ToType(float& dst, float src) { dst = src; } +inline void ToType(half& dst, float src) { + auto temp = FP32toFP16(src); + dst = bit_cast(temp); +} + +inline float FromType(float src) { return src; } +inline float FromType(half src) { + uint16_t temp = bit_cast(src); + return FP16toFP32(temp); +} + +template +struct CudaGraphCapture; + +template +struct CudaGraphExec { + ~CudaGraphExec() { + if (graph_exec_ != nullptr) { + ReportCUDAErrors(cudaGraphExecDestroy(graph_exec_)); + } + } + + CudaGraphExec& operator=(const CudaGraphCapture&); + explicit operator bool() const { return graph_exec_ != nullptr; } + + void Launch(cudaStream_t stream) { + ReportCUDAErrors(cudaGraphLaunch(graph_exec_, stream)); + } + cudaGraphExec_t graph_exec_ = nullptr; +}; + +template struct InputsOutputs { - InputsOutputs(int maxBatchSize, bool wdl, bool moves_left, + InputsOutputs(unsigned maxBatchSize, bool wdl, bool moves_left, size_t tensor_mem_size = 0, size_t scratch_size = 0, bool cublasDisableTensorCores = false) { ReportCUDAErrors(cudaHostAlloc( &input_masks_mem_, maxBatchSize * kInputPlanes * sizeof(uint64_t), cudaHostAllocMapped)); - ReportCUDAErrors( - cudaHostGetDevicePointer(&input_masks_mem_gpu_, input_masks_mem_, 0)); + ReportCUDAErrors(cudaMalloc( + &input_masks_mem_gpu_, maxBatchSize * kInputPlanes * sizeof(uint64_t))); - ReportCUDAErrors(cudaHostAlloc(&input_val_mem_, - maxBatchSize * kInputPlanes * sizeof(float), - cudaHostAllocMapped)); ReportCUDAErrors( - cudaHostGetDevicePointer(&input_val_mem_gpu_, input_val_mem_, 0)); + cudaHostAlloc(&input_val_mem_, + maxBatchSize * kInputPlanes * sizeof(input_val_mem_[0]), + cudaHostAllocMapped)); + ReportCUDAErrors(cudaMalloc( + &input_val_mem_gpu_, + maxBatchSize * kInputPlanes * sizeof(input_val_mem_gpu_[0]))); ReportCUDAErrors(cudaHostAlloc( - &op_policy_mem_, maxBatchSize * kNumOutputPolicy * sizeof(float), 0)); + &op_policy_mem_, + maxBatchSize * kNumOutputPolicy * sizeof(op_policy_mem_[0]), 0)); // Seperate device memory copy for policy output. // It's faster to write to device memory and then copy to host memory // than having the kernel write directly to it. ReportCUDAErrors(cudaMalloc( - &op_policy_mem_gpu_, maxBatchSize * kNumOutputPolicy * sizeof(float))); - - ReportCUDAErrors(cudaHostAlloc(&op_value_mem_, - maxBatchSize * (wdl ? 3 : 1) * sizeof(float), - cudaHostAllocMapped)); + &op_policy_mem_gpu_, + maxBatchSize * kNumOutputPolicy * sizeof(op_policy_mem_[0]))); + ReportCUDAErrors(cudaHostAlloc( + &op_value_mem_, maxBatchSize * (wdl ? 3 : 1) * sizeof(op_value_mem_[0]), + cudaHostAllocMapped)); + ReportCUDAErrors(cudaMalloc( + &op_value_mem_gpu_, + maxBatchSize * (wdl ? 
3 : 1) * sizeof(op_value_mem_gpu_[0]))); + if (wdl && sizeof(DataType) != sizeof(float)) { + wdl_cpu_softmax_ = std::make_unique(maxBatchSize * 2); + } + ReportCUDAErrors( + cudaEventCreateWithFlags(&upload_done_event_, cudaEventDisableTiming)); ReportCUDAErrors( - cudaHostGetDevicePointer(&op_value_mem_gpu_, op_value_mem_, 0)); + cudaEventCreateWithFlags(&policy_done_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreateWithFlags(&value_done_event_, cudaEventDisableTiming)); + ReportCUDAErrors(cudaEventCreateWithFlags(&wdl_download_done_event_, + cudaEventDisableTiming)); + ReportCUDAErrors(cudaEventCreateWithFlags(&download_done_event_, + cudaEventDisableTiming)); if (moves_left) { - ReportCUDAErrors(cudaHostAlloc(&op_moves_left_mem_, - maxBatchSize * sizeof(float), - cudaHostAllocMapped)); - ReportCUDAErrors(cudaHostGetDevicePointer(&op_moves_left_mem_gpu_, - op_moves_left_mem_, 0)); + ReportCUDAErrors(cudaHostAlloc( + &op_moves_left_mem_, maxBatchSize * sizeof(op_moves_left_mem_[0]), + cudaHostAllocMapped)); + ReportCUDAErrors( + cudaMalloc(&op_moves_left_mem_gpu_, + maxBatchSize * sizeof(op_moves_left_mem_gpu_[0]))); + ReportCUDAErrors(cudaEventCreateWithFlags(&moves_left_done_event_, + cudaEventDisableTiming)); } + ReportCUDAErrors( + cudaStreamCreateWithFlags(&exec_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaEventCreateWithFlags(&join_capture_event_, cudaEventDisableTiming)); + cuda_graphs_ = std::make_unique[]>(maxBatchSize); + // memory for network execution managed inside this structure if (tensor_mem_size) { multi_stream_ = true; - ReportCUDAErrors(cudaStreamCreate(&stream_)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking)); ReportCUDAErrors(cudaMalloc(&scratch_mem_, scratch_size)); for (auto& mem : tensor_mem_) { ReportCUDAErrors(cudaMalloc(&mem, tensor_mem_size)); - ReportCUDAErrors(cudaMemsetAsync(mem, 0, tensor_mem_size, stream_)); + ReportCUDAErrors( + cudaMemsetAsync(mem, 0, tensor_mem_size, compute_stream_)); } ReportCUBLASErrors(cublasCreate(&cublas_)); ReportCUBLASErrors(cublasSetMathMode( cublas_, cublasDisableTensorCores ? 
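// Illustrative sketch, standalone: the timing-disabled events created above
// order work across the upload/compute/download streams without blocking the
// host. The basic record/wait chain (all handles assumed already created):
#include <cuda_runtime.h>
#include <cstddef>

void PipelineStep(cudaStream_t upload, cudaStream_t compute,
                  cudaStream_t download, cudaEvent_t upload_done,
                  cudaEvent_t compute_done, void* dst_dev, const void* src_host,
                  size_t bytes) {
  // 1. Host-to-device copy on the upload stream.
  cudaMemcpyAsync(dst_dev, src_host, bytes, cudaMemcpyHostToDevice, upload);
  cudaEventRecord(upload_done, upload);

  // 2. Compute stream waits for the upload, then runs its kernels.
  cudaStreamWaitEvent(compute, upload_done, 0);
  // ... kernel launches on `compute` would go here ...
  cudaEventRecord(compute_done, compute);

  // 3. Download stream waits for compute before device-to-host copies.
  cudaStreamWaitEvent(download, compute_done, 0);
}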
CUBLAS_PEDANTIC_MATH : CUBLAS_TENSOR_OP_MATH)); - ReportCUBLASErrors(cublasSetStream(cublas_, stream_)); + ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_)); } else { multi_stream_ = false; } } ~InputsOutputs() { ReportCUDAErrors(cudaFreeHost(input_masks_mem_)); + ReportCUDAErrors(cudaFree(input_masks_mem_gpu_)); ReportCUDAErrors(cudaFreeHost(input_val_mem_)); + ReportCUDAErrors(cudaFree(input_val_mem_gpu_)); ReportCUDAErrors(cudaFreeHost(op_policy_mem_)); ReportCUDAErrors(cudaFree(op_policy_mem_gpu_)); ReportCUDAErrors(cudaFreeHost(op_value_mem_)); - if (op_moves_left_mem_ != nullptr) + ReportCUDAErrors(cudaFree(op_value_mem_gpu_)); + ReportCUDAErrors(cudaEventDestroy(upload_done_event_)); + ReportCUDAErrors(cudaEventDestroy(policy_done_event_)); + ReportCUDAErrors(cudaEventDestroy(value_done_event_)); + ReportCUDAErrors(cudaEventDestroy(wdl_download_done_event_)); + ReportCUDAErrors(cudaEventDestroy(download_done_event_)); + if (op_moves_left_mem_ != nullptr) { ReportCUDAErrors(cudaFreeHost(op_moves_left_mem_)); + ReportCUDAErrors(cudaFree(op_moves_left_mem_gpu_)); + ReportCUDAErrors(cudaEventDestroy(moves_left_done_event_)); + } + ReportCUDAErrors(cudaEventDestroy(join_capture_event_)); + ReportCUDAErrors(cudaStreamDestroy(exec_stream_)); if (multi_stream_) { for (auto mem : tensor_mem_) { @@ -106,24 +189,26 @@ struct InputsOutputs { if (head_offset_pointers_) { ReportCUDAErrors(cudaFree(head_offset_pointers_)); } - cudaStreamDestroy(stream_); - cublasDestroy(cublas_); + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); + ReportCUBLASErrors(cublasDestroy(cublas_)); } } uint64_t* input_masks_mem_; - float* input_val_mem_; - float* op_policy_mem_; - float* op_value_mem_; - float* op_moves_left_mem_ = nullptr; + DataType* input_val_mem_; + DataType* op_policy_mem_; + DataType* op_value_mem_; + DataType* op_moves_left_mem_ = nullptr; - // GPU pointers for the above allocations. + // Copies in VRAM. uint64_t* input_masks_mem_gpu_; - float* input_val_mem_gpu_; - float* op_value_mem_gpu_; - float* op_moves_left_mem_gpu_; + DataType* input_val_mem_gpu_; + DataType* op_policy_mem_gpu_; + DataType* op_value_mem_gpu_; + DataType* op_moves_left_mem_gpu_ = nullptr; - // This is a seperate copy. 
- float* op_policy_mem_gpu_; + std::unique_ptr wdl_cpu_softmax_; // memory needed to run the network owned by InputsOutputs when multi_stream // is enabled @@ -134,11 +219,82 @@ struct InputsOutputs { void** head_offset_pointers_ = nullptr; // cuda stream used to run the network - cudaStream_t stream_; + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; + + // cuda events to synchronize between streams + cudaEvent_t upload_done_event_ = nullptr; + cudaEvent_t policy_done_event_ = nullptr; + cudaEvent_t value_done_event_ = nullptr; + cudaEvent_t moves_left_done_event_ = nullptr; + cudaEvent_t wdl_download_done_event_ = nullptr; + cudaEvent_t download_done_event_ = nullptr; + + // cuda graph support + cudaStream_t exec_stream_ = nullptr; + std::unique_ptr[]> cuda_graphs_; + cudaEvent_t join_capture_event_ = nullptr; // cublas handle used to run the network - cublasHandle_t cublas_; + cublasHandle_t cublas_ = nullptr; +}; + +template +struct CudaGraphCapture { + static constexpr int kMinimumFreeMemory = 100 * 1024 * 1024; + + CudaGraphCapture(InputsOutputs& io, cudaStream_t upload_stream, + cudaStream_t download_stream) + : io_(io), + upload_stream_(upload_stream), + download_stream_(download_stream) { + ReportCUDAErrors(cudaStreamBeginCapture(upload_stream_, + cudaStreamCaptureModeThreadLocal)); + } + + ~CudaGraphCapture() { + if (graph_ != nullptr) { + ReportCUDAErrors(cudaGraphDestroy(graph_)); + } + } + + static bool EnsureEnoughFreeMemory() { + size_t free_mem = 0; + size_t total_mem = 0; + ReportCUDAErrors(cudaMemGetInfo(&free_mem, &total_mem)); + return free_mem > kMinimumFreeMemory; + } + + void EndCapture() { + ReportCUDAErrors( + cudaEventRecord(io_.join_capture_event_, download_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(upload_stream_, io_.join_capture_event_, 0)); + ReportCUDAErrors(cudaStreamEndCapture(upload_stream_, &graph_)); + } + + InputsOutputs& io_; + cudaStream_t upload_stream_; + cudaStream_t download_stream_; + + cudaGraph_t graph_ = nullptr; }; +template +inline CudaGraphExec& CudaGraphExec::operator=( + const CudaGraphCapture& graph) { + assert(graph_exec_ == nullptr); + if (graph.graph_ == nullptr) { + throw Exception("Trying to instantiate an nullptr cuda graph"); + } + ReportCUDAErrors( + cudaGraphInstantiate(&graph_exec_, graph.graph_, nullptr, nullptr, 0)); +#if CUDART_VERSION >= 11010 + ReportCUDAErrors(cudaGraphUpload(graph_exec_, graph.io_.exec_stream_)); +#endif + return *this; +} + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/kernels.h b/src/neural/backends/cuda/kernels.h index 06ad15c657..91ee87abe0 100644 --- a/src/neural/backends/cuda/kernels.h +++ b/src/neural/backends/cuda/kernels.h @@ -67,7 +67,8 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W, // params, also pad/un-pad elements from Batch or Channel dimensions template void convertNCHWtoNHWC(DstType* output_tensor, const SrcType* input_tensor, - int Nin, int Cin, int Nout, int Cout, int H, int W); + int Nin, int Cin, int Nout, int Cout, int H, int W, + cudaStream_t stream); // Plain data-type conversion (no layout conversion). 
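// Illustrative sketch, standalone: CudaGraphCapture/CudaGraphExec above wrap
// the usual capture -> instantiate -> launch lifecycle. A self-contained
// version with a made-up kernel (dev_buf is assumed to hold at least 32
// floats):
#include <cuda_runtime.h>

__global__ void bump_kernel(float* p) { p[threadIdx.x] += 1.0f; }

void CaptureAndReplay(cudaStream_t stream, float* dev_buf, int reps) {
  cudaGraph_t graph = nullptr;
  cudaGraphExec_t exec = nullptr;

  // 1. Record the work once while the stream is in capture mode.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  bump_kernel<<<1, 32, 0, stream>>>(dev_buf);
  cudaStreamEndCapture(stream, &graph);

  // 2. Instantiate the captured graph into an executable graph.
#if CUDART_VERSION >= 12000
  cudaGraphInstantiate(&exec, graph, 0);
#else
  cudaGraphInstantiate(&exec, graph, nullptr, nullptr, 0);
#endif
  cudaGraphDestroy(graph);

  // 3. Replay with one launch per iteration; this is the CPU-overhead saving
  //    that motivates caching one graph per batch size.
  for (int i = 0; i < reps; ++i) cudaGraphLaunch(exec, stream);
  cudaStreamSynchronize(stream);
  cudaGraphExecDestroy(exec);
}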
template @@ -77,35 +78,34 @@ void copyTypeConverted(DstType* op, SrcType* ip, int N, cudaStream_t stream); template void batchNorm(T* output, const T* input, const T* skipInput, int N, int C, int H, int W, float* means, float* var_multipliers, - ActivationFunction activation); + ActivationFunction activation, cudaStream_t stream); // Unpack planes (input to network). -void expandPlanes_Fp32_NCHW(float* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream); - -void expandPlanes_Fp16_NHWC(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream); +template +void expandPlanes_NHWC(T* output, const uint64_t* masks, const T* values, int n, + cudaStream_t stream); -void expandPlanes_Fp16_NCHW(half* output, const uint64_t* masks, - const float* values, int n, cudaStream_t stream); +template +void expandPlanes_NCHW(T* output, const uint64_t* masks, const T* values, int n, + cudaStream_t stream); // Perform global avg pool. template void globalAvgPool(int N, int C, T* output, const T* input, - const T* prevLayerBias, bool nhwc); + const T* prevLayerBias, bool nhwc, cudaStream_t steam); // Perform global scale. template void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, const T* prevLayerBias, bool nhwc, - ActivationFunction activation); + ActivationFunction activation, cudaStream_t steam); // Perform Squeeze-and-Excitation (SE) in a single fused kernel. // Returns false if the fused kernel can't handle the sizes. bool Se_Fp16_NHWC(int N, int C, int numFc1Out, half* output, const half* skip, const half* input, const half* w1, const half* b1, const half* w2, const half* b2, const half* bPrev, - ActivationFunction activation); + ActivationFunction activation, cudaStream_t stream); template void PolicyMap(int N, T* output, const T* input, const short* indices, @@ -114,7 +114,8 @@ void PolicyMap(int N, T* output, const T* input, const short* indices, // Custom winograd helper functions template -void FilterTransform(int N, int C, T* transformedFilter, const T* filter); +void FilterTransform(int N, int C, T* transformedFilter, const T* filter, + cudaStream_t stream); template void InputTransform(int N, int C, T* transformedInput, const T* input, @@ -157,5 +158,14 @@ void inputPreprocessForAttentionBody(T* output, const T* input, template void applyInputGating(T* output, const T* input, const T* mult, const T* add, int N, int HW, int C, cudaStream_t stream); + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, T* v, T* b2, + cudaStream_t stream); + +void fusedMHA(void* output, void* mha_q, void* mha_k, void* mha_v, void* skip, + int batch_size, int num_heads, int depth, cudaStream_t stream); + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/layers.cc b/src/neural/backends/cuda/layers.cc index 81a0b01b8b..5ae5b7f7dc 100644 --- a/src/neural/backends/cuda/layers.cc +++ b/src/neural/backends/cuda/layers.cc @@ -219,7 +219,7 @@ void ConvLayer::LoadWeights(float* pfilter, float* pBias, void* scratch) { if (nhwc_) { convertNCHWtoNHWC((half*)weights, (float*)scratch, C, c_input_, C, c_input_, - filter_size_, filter_size_); + filter_size_, filter_size_, 0); } else { copyTypeConverted((half*)weights, (float*)scratch, C * c_input_ * filter_size_ * filter_size_, 0); @@ -495,7 +495,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 1. 
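// Illustrative sketch, not the real kernel: the templated expandPlanes_NCHW /
// expandPlanes_NHWC declarations above unpack a 64-bit occupancy mask plus one
// value per plane into a dense 8x8 plane. A simplified float-only NCHW version
// (the names and exact layout here are assumptions):
#include <cuda_runtime.h>
#include <cstdint>

__global__ void expand_planes_sketch(float* output, const uint64_t* masks,
                                     const float* values, int n_planes) {
  int plane = blockIdx.x;
  int sq = threadIdx.x;  // square index 0..63
  if (plane >= n_planes) return;
  bool set = (masks[plane] >> sq) & 1ull;
  output[plane * 64 + sq] = set ? values[plane] : 0.0f;
}

void ExpandPlanesSketch(float* output, const uint64_t* masks,
                        const float* values, int n_planes,
                        cudaStream_t stream) {
  expand_planes_sketch<<<n_planes, 64, 0, stream>>>(output, masks, values,
                                                    n_planes);
}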
Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_, false); + globalAvgPool(N, C, op2, input, bPrev_, false, stream); // 2. First fully connected layer. float alpha = 1.0f, beta = 0.0f; @@ -514,7 +514,7 @@ void SELayer::Eval(int N, float* output, const float* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_, false, act_); + globalScale(N, C, output, input, op2, bPrev_, false, act_, stream); } template <> @@ -525,7 +525,7 @@ void SELayer::Eval(int N, half* output, const half* input, bool se_done = false; if (kUseFusedSELayer && nhwc_) { se_done = Se_Fp16_NHWC(N, C, numFc1Out_, output, input2, input, w1_t_, b1_, - w2_t_, b2_, bPrev_, act_); + w2_t_, b2_, bPrev_, act_, stream); } if (!se_done) { assert(output == input2); @@ -535,7 +535,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 1. Global avg pooling (also adds previous layer bias before computing // averages). - globalAvgPool(N, C, op2, input, bPrev_, nhwc_); + globalAvgPool(N, C, op2, input, bPrev_, nhwc_, stream); // 2. First fully connected layer. __half_raw one_h{0x3C00}; @@ -557,7 +557,7 @@ void SELayer::Eval(int N, half* output, const half* input, // 4. (Optional prev layer bias add), Global scale, residual add, relu and // bias. - globalScale(N, C, output, input, op2, bPrev_, nhwc_, act_); + globalScale(N, C, output, input, op2, bPrev_, nhwc_, act_, stream); } } @@ -593,7 +593,7 @@ void FCLayer::LoadWeights(float* cpuWeight, float* cpuBias, if (nhwc_) { convertNCHWtoNHWC((half*)weights_, (float*)scratch, (int)num_biases, input_->GetC(), (int)num_biases, input_->GetC(), - input_->GetH(), input_->GetW()); + input_->GetH(), input_->GetW(), 0); } else { copyTypeConverted((half*)weights_, (float*)scratch, (int)num_weights, 0); } @@ -851,7 +851,7 @@ void FusedWinogradConvSELayer::LoadWeights(float* pfilter, } // run winograd transform kernel for the filter - FilterTransform(C, c_input_, transformed_weights_, weights); + FilterTransform(C, c_input_, transformed_weights_, weights, 0); } // TODO: Do this on the GPU to improve network load time! @@ -1200,7 +1200,7 @@ void ResidualBlock::LoadWeights0(float* pfilter, float* pBias, } // run winograd transform kernel for the filter - FilterTransform(C, c_input_, transformed_weights0_, weights); + FilterTransform(C, c_input_, transformed_weights0_, weights, 0); } template @@ -1226,7 +1226,7 @@ void ResidualBlock::LoadWeights1(float* pfilter, float* pBias, } // run winograd transform kernel for the filter - FilterTransform(C, C, transformed_weights1_, weights); + FilterTransform(C, C, transformed_weights1_, weights, 0); } template @@ -1422,7 +1422,7 @@ template AttentionPolicyHead::AttentionPolicyHead( BaseLayer* ip, const MultiHeadWeights::PolicyHead& weights, void* scratch, bool attention_body, ActivationFunction act, - int max_batch_size) + int max_batch_size, bool use_gemm_ex) : BaseLayer(64 * 64 + 24 * 8, 1, 1, ip), attention_body_(attention_body), // Old networks without attention body (e.g. T79) use hardcoded SELU @@ -1474,8 +1474,9 @@ AttentionPolicyHead::AttentionPolicyHead( nullptr, 0, // smolgen weights not implemented in // policy encoder heads yet. max_batch_size, ACTIVATION_SWISH, act_, - 1e-6); // attentionbody nets don't have policy encoders, so using old - // epsilon for backward compatibility with T78. 
+ 1e-6, // attentionbody nets don't have policy encoders, so + use_gemm_ex, // using old epsilon for backward compatibility with T78. + false); encoder_weights_.emplace_back(pW); } } @@ -1485,7 +1486,8 @@ EncoderBlock::EncoderBlock( const MultiHeadWeights::EncoderLayer& cpu_weights, void* scratch, int heads, int size, float alpha, DataType* smolgen_global_scratch, int smolgen_global_size, int max_batch_size, ActivationFunction smolgen_act, - ActivationFunction ffn_act, float default_eps) + ActivationFunction ffn_act, float default_eps, bool use_gemm_ex, + bool fused_mha) : embedding_op_size_(size), encoder_heads_(heads), alpha_(alpha), @@ -1493,7 +1495,9 @@ EncoderBlock::EncoderBlock( has_smolgen_(cpu_weights.mha.has_smolgen), smolgen_activation_(smolgen_act), ffn_activation_(ffn_act), - max_batch_size_(max_batch_size) { + max_batch_size_(max_batch_size), + use_fused_mha_(fused_mha), + use_gemm_ex_(use_gemm_ex) { mha_q_size_ = cpu_weights.mha.q_b.size(); mha_k_size_ = cpu_weights.mha.k_b.size(); mha_v_size_ = cpu_weights.mha.v_b.size(); @@ -1605,7 +1609,8 @@ static void cublasXGemmStridedBatched( cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float alpha, const void* A, int lda, long long int strideA, const void* B, int ldb, long long int strideB, - float beta, void* C, int ldc, long long int strideC, int batchCount) { + float beta, void* C, int ldc, long long int strideC, int batchCount, + bool use_gemm_ex) { const bool fp16 = std::is_same::value; if (fp16) { unsigned short alpha_h = FP32toFP16(alpha); @@ -1615,10 +1620,17 @@ static void cublasXGemmStridedBatched( B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC, batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); } else { - ReportCUBLASErrors(cublasGemmStridedBatchedEx( - handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B, - CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, - batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); + if (use_gemm_ex) { + ReportCUBLASErrors(cublasGemmStridedBatchedEx( + handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, + B, CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, + batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); + } else { + ReportCUBLASErrors(cublasSgemmStridedBatched( + handle, transa, transb, m, n, k, &alpha, (const float*)A, lda, + strideA, (const float*)B, ldb, strideB, &beta, (float*)C, ldc, + strideC, batchCount)); + } } } @@ -1736,7 +1748,8 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, cublasXGemmStridedBatched( cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, mha_qkv_w, num_inputs, num_inputs * num_outputs, in_out_tensor, - num_inputs, 0, 0.0f, mha_q, num_outputs, num_outputs * max_batch, 3); + num_inputs, 0, 0.0f, mha_q, num_outputs, num_outputs * max_batch, 3, + use_gemm_ex_); addBiasBatched(mha_q, mha_q, mha_qkv_b, 3, batch, num_outputs, max_batch, ACTIVATION_NONE, stream); } @@ -1760,31 +1773,33 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // shape(k)[-1] = depth float factor = 1.0f / sqrt((float)depth); +#ifdef USE_CUTLASS + if (use_fused_mha_) { + // TODO: check if we need skip in a different tensor than same tensor as + // output! + fusedMHA(buffer2, mha_q, mha_k, mha_v, has_smolgen_ ? 
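// Illustrative sketch, standalone: the use_gemm_ex flag above selects between
// the generic cublasGemmStridedBatchedEx entry point and the plain FP32
// cublasSgemmStridedBatched fallback. Minimal FP32-only version (computeType
// is passed as CUDA_R_32F to match the call above; on cuBLAS 11+
// CUBLAS_COMPUTE_32F is the equivalent modern enum):
#include <cublas_v2.h>

void BatchedGemmF32(cublasHandle_t handle, int m, int n, int k, const float* A,
                    const float* B, float* C, int batch, bool use_gemm_ex) {
  const float alpha = 1.0f, beta = 0.0f;
  long long strideA = 1LL * m * k;
  long long strideB = 1LL * k * n;
  long long strideC = 1LL * m * n;
  if (use_gemm_ex) {
    cublasGemmStridedBatchedEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                               &alpha, A, CUDA_R_32F, m, strideA, B, CUDA_R_32F,
                               k, strideB, &beta, C, CUDA_R_32F, m, strideC,
                               batch, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
  } else {
    cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha,
                              A, m, strideA, B, k, strideB, &beta, C, m,
                              strideC, batch);
  }
}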
buffer2 : nullptr, N, + encoder_heads_, depth, stream); + } else +#endif // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { - std::vector offsets(encoder_heads_ * max_batch_size_ * 5); - for (int i = 0; i < encoder_heads_ * max_batch_size_; i++) { - int h = i % encoder_heads_; - int n = i / encoder_heads_; - offsets[i] = mha_k + h * depth + 64 * d_model * n; - offsets[i + encoder_heads_ * max_batch_size_] = - mha_q + h * depth + 64 * d_model * n; - offsets[i + 2 * encoder_heads_ * max_batch_size_] = - buffer1 + i * 64 * 64; - offsets[i + 3 * encoder_heads_ * max_batch_size_] = - mha_v + h * depth + 64 * d_model * n; - offsets[i + 4 * encoder_heads_ * max_batch_size_] = - buffer2 + h * depth + 64 * d_model * n; - } +#ifndef NDEBUG + cudaStreamCaptureStatus capture; + ReportCUDAErrors(cudaStreamIsCapturing(stream, &capture)); + assert(capture != + cudaStreamCaptureStatus::cudaStreamCaptureStatusActive && + "Stream capture is active, cannot allocate memory for offset " + "pointers"); +#endif ReportCUDAErrors( cudaMalloc((void**)offset_pointers, encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*))); - ReportCUDAErrors( - cudaMemcpy(*offset_pointers, offsets.data(), - encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*), - cudaMemcpyHostToDevice)); + genOffsetPointers((DataType**)*offset_pointers, encoder_heads_, + max_batch_size_, depth, d_model, mha_k, mha_q, buffer1, + mha_v, buffer2, stream); } + cublasXGemmBatched( cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/, depth /*K*/, // A/B, and M/N are swapped for row-major to col-major @@ -1805,20 +1820,18 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, 64 /*LDC*/, // 64 * 64 /*strideC*/, N * encoder_heads_); - } - // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) - // attention_weights -> buffer1 - if (has_smolgen_) { - // Add smolgen weights to the scaled matmul_qk attention logits before - // softmax. - Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, buffer2, stream); - } else { - Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, - (const DataType*)nullptr, stream); - } + // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) + // attention_weights -> buffer1 + if (has_smolgen_) { + // Add smolgen weights to the scaled matmul_qk attention logits before + // softmax. + Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, buffer2, stream); + } else { + Softmax(encoder_heads_ * N * 64, 64, buffer1, buffer1, + (const DataType*)nullptr, stream); + } - { cublasXGemmBatched( cublas, CUBLAS_OP_N, CUBLAS_OP_N, depth /*M*/, 64 /*N*/, 64 /*K*/, 1.0f, *offset_pointers + encoder_heads_ * max_batch_size_ * @@ -1892,8 +1905,10 @@ void AttentionPolicyHead::Eval( DataType* buffer2 = input2_tensor + scratch_size / (2 * sizeof(DataType)); int inputC = this->input_->GetC(); - if (!attention_body_) - convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8); + bool input_nhwc = attention_body_ || this->input_->isNHWC(); + if (!input_nhwc) + convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8, + stream); // 1. Policy embedding (fully connected layer) // Input data in NHWC layout N*(64)*C, output is N*(64)*embedding_op_size_ @@ -1905,7 +1920,7 @@ void AttentionPolicyHead::Eval( cublasXgemm(cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, (const DataType*)ip_pol_w_, num_inputs, - attention_body_ ? input : (DataType*)scratch, + input_nhwc ? 
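// Illustrative sketch: the NDEBUG-guarded assert above exists because
// cudaMalloc/cudaMemcpy are not allowed while a stream is being captured into
// a graph, so the lazy offset-pointer allocation must happen outside capture.
// A small capture-aware guard (helper name is made up):
#include <cuda_runtime.h>

bool SafeToAllocateNow(cudaStream_t stream) {
  cudaStreamCaptureStatus status = cudaStreamCaptureStatusNone;
  if (cudaStreamIsCapturing(stream, &status) != cudaSuccess) return false;
  return status == cudaStreamCaptureStatusNone;
}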
input : (DataType*)scratch, num_inputs, 0.0f, pol_embedding, num_outputs); addBiasBatched(pol_embedding, pol_embedding, ip_pol_b_, 1, batch, num_outputs, act_, stream); @@ -1929,7 +1944,7 @@ void AttentionPolicyHead::Eval( cublasXGemmStridedBatched( cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, wqk_w_, num_inputs, num_inputs * num_outputs, input2_tensor, num_inputs, - 0, 0.0f, wq, num_outputs, num_outputs * batch, 2); + 0, 0.0f, wq, num_outputs, num_outputs * batch, 2, use_gemm_ex_); addBiasBatched(wq, wq, wqk_b_, 2, batch, num_outputs, ACTIVATION_NONE, stream); @@ -1952,7 +1967,7 @@ void AttentionPolicyHead::Eval( wk /*A*/, policy_d_model_ /*LDA*/, 64 * policy_d_model_, /*strideA*/ wq /*B*/, policy_d_model_ /*LDB*/, 64 * policy_d_model_, /*strideB*/ 0.0f, output /*C*/, // output (policy_attn_logits) - 64 /*LDC*/, 64 * 64 + 8 * 24 /*strideC*/, N); + 64 /*LDC*/, 64 * 64 + 8 * 24 /*strideC*/, N, use_gemm_ex_); } // Compute promotion_logits in a single kernel (and put the result just after @@ -2045,8 +2060,10 @@ AttentionBody::AttentionBody(const MultiHeadWeights& weights, void* scratch, Activations activations, int num_res_blocks, int input_c, int max_batch_size, - bool is_pe_dense_embedding) - : BaseLayer(weights.ip_emb_b.size(), 8, 8, nullptr), + bool is_pe_dense_embedding, + bool use_gemm_ex, bool fused_mha) + : BaseLayer(weights.ip_emb_b.size(), 8, 8, nullptr, false, + use_gemm_ex), embedding_op_size_(weights.ip_emb_b.size()), encoder_head_count_(weights.encoder_head_count), activations_(activations), @@ -2055,7 +2072,8 @@ AttentionBody::AttentionBody(const MultiHeadWeights& weights, has_gating_(weights.ip_mult_gate.size() > 0 && weights.ip_add_gate.size() > 0), has_smolgen_(weights.has_smolgen), - is_pe_dense_embedding_(is_pe_dense_embedding) { + is_pe_dense_embedding_(is_pe_dense_embedding), + use_fused_mha_(fused_mha) { allocAndUpload(&ip_emb_w_, weights.ip_emb_w, scratch); allocAndUpload(&ip_emb_b_, weights.ip_emb_b, scratch); @@ -2110,7 +2128,7 @@ AttentionBody::AttentionBody(const MultiHeadWeights& weights, enc, scratch, encoder_head_count_, embedding_op_size_, alpha, smolgen_global_, smolgen_global_size_, max_batch_size, activations_.smolgen_activation, activations_.ffn_activation, - is_pe_dense_embedding_ ? 1e-3 : 1e-6); + is_pe_dense_embedding_ ? 
1e-3 : 1e-6, use_gemm_ex, use_fused_mha_); encoder_weights_.emplace_back(pW); } } @@ -2172,7 +2190,8 @@ void AttentionBody::Eval(int N, DataType* output, const int num_inputs = 64 * 12; const int batch = N; - convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, 12, 8, 8); + convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, 12, 8, 8, + stream); cublasXgemm( cublas, CUBLAS_OP_T, CUBLAS_OP_N, num_outputs, batch, num_inputs, 1.0f, (const DataType*)ip_emb_pre_w_, num_inputs, @@ -2207,7 +2226,8 @@ void AttentionBody::Eval(int N, DataType* output, // #redirect flow through encoder blocks // flow = tf.transpose(flow, perm = [ 0, 2, 3, 1 ]) // flow = tf.reshape(flow, [ -1, 64, self.RESIDUAL_FILTERS ]) - convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8); + convertNCHWtoNHWC((DataType*)scratch, input, N, inputC, N, inputC, 8, 8, + stream); } if (is_pe_dense_embedding_) { @@ -2439,6 +2459,7 @@ void CudnnError(cudnnStatus_t status, const char* file, const int& line) { char message[128]; sprintf(message, "CUDNN error: %s (%s:%d) ", cudnnGetErrorString(status), file, line); + CERR << message; throw Exception(message); } } @@ -2475,6 +2496,7 @@ void CublasError(cublasStatus_t status, const char* file, const int& line) { char message[128]; sprintf(message, "CUBLAS error: %s (%s:%d) ", CublasGetErrorString(status), file, line); + CERR << message; throw Exception(message); } } @@ -2484,6 +2506,7 @@ void CudaError(cudaError_t status, const char* file, const int& line) { char message[128]; sprintf(message, "CUDA error: %s (%s:%d) ", cudaGetErrorString(status), file, line); + CERR << message; throw Exception(message); } } diff --git a/src/neural/backends/cuda/layers.h b/src/neural/backends/cuda/layers.h index 9ba5bd286e..5c5ec871c1 100644 --- a/src/neural/backends/cuda/layers.h +++ b/src/neural/backends/cuda/layers.h @@ -29,6 +29,7 @@ #include #include +#include #include "cuda_common.h" #include "neural/network_legacy.h" @@ -340,7 +341,8 @@ class EncoderBlock { int heads, int size, float alpha, DataType* smolgen_global_scratch, int smolgen_global_size, int max_batch_size, ActivationFunction smolgen_act, - ActivationFunction ffn_act, float default_eps); + ActivationFunction ffn_act, float default_eps, bool use_gemm_ex, + bool fused_mha); ~EncoderBlock(); void Eval(int N, DataType* inpop, DataType* scratch0, DataType* scratch1, @@ -393,6 +395,8 @@ class EncoderBlock { int smol_global_size_; const int max_batch_size_; + const bool use_fused_mha_; + const bool use_gemm_ex_; }; // The Attention policy head implementation @@ -406,12 +410,14 @@ class AttentionPolicyHead : public BaseLayer { using BaseLayer::GetC; using BaseLayer::GetH; using BaseLayer::GetW; + using BaseLayer::use_gemm_ex_; public: AttentionPolicyHead(BaseLayer* ip, const MultiHeadWeights::PolicyHead& weights, void* scratch, bool attention_body, - ActivationFunction act, int max_batch_size); + ActivationFunction act, int max_batch_size, + bool use_gemm_ex); ~AttentionPolicyHead(); void Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, @@ -476,7 +482,8 @@ class AttentionBody : public BaseLayer { public: AttentionBody(const MultiHeadWeights& weights, void* scratch, Activations activations, int num_res_blocks, int input_c, - int max_batch_size, bool is_pe_dense_embedding); + int max_batch_size, bool is_pe_dense_embedding, + bool use_gemm_ex, bool fused_mha); ~AttentionBody(); void Eval(int N, DataType* output, const DataType* input, const DataType* 
input2, void* scratch, size_t scratch_size, @@ -507,6 +514,7 @@ class AttentionBody : public BaseLayer { const bool has_gating_; const bool has_smolgen_; bool is_pe_dense_embedding_; // flag for dense position encoding + const bool use_fused_mha_; }; // The value head implementation @@ -523,8 +531,8 @@ class ValueHead : public BaseLayer { public: ValueHead(BaseLayer* ip, const MultiHeadWeights::ValueHead& weights, - void* scratch, bool attention_body, bool wdl, ActivationFunction act, - int max_batch_size, bool use_gemm_ex); + void* scratch, bool attention_body, bool wdl, + ActivationFunction act, int max_batch_size, bool use_gemm_ex); ~ValueHead(); void Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, @@ -548,6 +556,5 @@ class ValueHead : public BaseLayer { ActivationFunction act_; }; - } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/backends/cuda/network_cuda.cc b/src/neural/backends/cuda/network_cuda.cc index 43187a4316..85c80ce2e8 100644 --- a/src/neural/backends/cuda/network_cuda.cc +++ b/src/neural/backends/cuda/network_cuda.cc @@ -26,10 +26,10 @@ */ #include #include -#include #include #include #include +#include #include "cuda_common.h" #include "inputs_outputs.h" @@ -39,8 +39,17 @@ #include "neural/network_legacy.h" #include "neural/tables/attention_policy_map.h" #include "neural/tables/policy_map.h" -#include "utils/bititer.h" #include "utils/exception.h" +#include "utils/fp16_utils.h" +#include "utils/trace.h" + +#if CUDART_VERSION >= 11010 +#define CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS 1 +#else +#define CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS 0 +#undef cudaEventWaitExternal +#undef cudaEventRecordExternal +#endif namespace lczero { using namespace cudnn_backend; @@ -120,8 +129,8 @@ static size_t getMaxAttentionBodySize(const MultiHeadWeights& weights, int N) { template class CudaNetworkComputation : public NetworkComputation { public: - CudaNetworkComputation(CudaNetwork* network, - bool wdl, bool moves_left); + CudaNetworkComputation(CudaNetwork* network, bool wdl, + bool moves_left); ~CudaNetworkComputation(); void AddInput(InputPlanes&& input) override { @@ -130,11 +139,11 @@ class CudaNetworkComputation : public NetworkComputation { const auto iter_val = &inputs_outputs_->input_val_mem_[batch_size_ * kInputPlanes]; - int i = 0; - for (const auto& plane : input) { + assert(input.size() == kInputPlanes); + for (int i = 0; i < kInputPlanes; i++) { + const auto& plane = input[i]; iter_mask[i] = plane.mask; - iter_val[i] = plane.value; - i++; + ToType(iter_val[i], plane.value); } batch_size_++; @@ -142,38 +151,47 @@ class CudaNetworkComputation : public NetworkComputation { void ComputeBlocking() override; + void CaptureGraph(std::unique_lock&& lock = {}); + int GetBatchSize() const override { return batch_size_; } float GetQVal(int sample) const override { if (wdl_) { - auto w = inputs_outputs_->op_value_mem_[3 * sample + 0]; - auto l = inputs_outputs_->op_value_mem_[3 * sample + 2]; - return w - l; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? 
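// Illustrative sketch: CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS above gates the use
// of cudaEventRecordExternal/cudaEventWaitExternal (CUDA 11.1+). During stream
// capture an ordinary event record becomes a graph node; the "external" flag
// keeps it a real event the host can still synchronize on after graph replay.
// Helper name is made up:
#include <cuda_runtime.h>

void RecordDoneEvent(cudaEvent_t done, cudaStream_t stream, bool capturing) {
#if CUDART_VERSION >= 11010
  cudaEventRecordWithFlags(done, stream,
                           capturing ? cudaEventRecordExternal : 0);
#else
  (void)capturing;
  cudaEventRecord(done, stream);  // older toolkits: no external events
#endif
}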
(float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample]; } - return inputs_outputs_->op_value_mem_[sample]; + return FromType(inputs_outputs_->op_value_mem_[sample]); } float GetDVal(int sample) const override { if (wdl_) { - return inputs_outputs_->op_value_mem_[3 * sample + 1]; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? (float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample + 1]; } return 0.0f; } float GetPVal(int sample, int move_id) const override { - return inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]; + return FromType( + inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]); } float GetMVal(int sample) const override { if (moves_left_) { - return inputs_outputs_->op_moves_left_mem_[sample]; + return FromType(inputs_outputs_->op_moves_left_mem_[sample]); } return 0.0f; } private: // Memory holding inputs, outputs. - std::unique_ptr inputs_outputs_; + std::unique_ptr> inputs_outputs_; int batch_size_; bool wdl_; bool moves_left_; @@ -190,6 +208,7 @@ class CudaNetwork : public Network { file.format().network_format().moves_left()} { MultiHeadWeights weights(file.weights()); gpu_id_ = options.GetOrDefault("gpu", 0); + enable_graph_capture_ = options.GetOrDefault("graph_capture", true); const auto nf = file.format().network_format(); using NF = pblczero::NetworkFormat; @@ -210,6 +229,10 @@ class CudaNetwork : public Network { showInfo(); +#ifdef USE_CUTLASS + CERR << "Compiled with CUTLASS enabled"; +#endif + int total_gpus; ReportCUDAErrors(cudaGetDeviceCount(&total_gpus)); @@ -218,7 +241,7 @@ class CudaNetwork : public Network { cudaDeviceProp deviceProp = {}; cudaGetDeviceProperties(&deviceProp, gpu_id_); - showDeviceInfo(deviceProp); + showDeviceInfo(deviceProp, gpu_id_); l2_cache_size_ = deviceProp.l2CacheSize; sm_count_ = deviceProp.multiProcessorCount; @@ -255,7 +278,16 @@ class CudaNetwork : public Network { } if (!multi_stream_) { + ReportCUDAErrors( + cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors(cudaEventCreateWithFlags(&compute_ordering_event_, + cudaEventDisableTiming)); ReportCUBLASErrors(cublasCreate(&cublas_)); + ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_)); if (has_tensor_cores_) ReportCUBLASErrors(cublasSetMathMode( cublas_, @@ -310,6 +342,11 @@ class CudaNetwork : public Network { use_res_block_winograd_fuse_opt_ = options.Get("res_block_fusing"); } + bool use_fused_mha = false; + if (deviceProp.major >= 8 && fp16) { + use_fused_mha = options.GetOrDefault("fused_mha", true); + } + const bool use_gemm_ex = deviceProp.major >= 5; // 0. Check for SE. @@ -342,14 +379,14 @@ class CudaNetwork : public Network { std::string policy_head = options.GetOrDefault("policy_head", "vanilla"); // Check that selected policy head exists. - if (weights.policy_heads.count(policy_head) == 0) { + if (!weights.policy_heads.contains(policy_head)) { throw Exception("The policy head you specified '" + policy_head + "' does not exist in this net."); } std::string value_head = options.GetOrDefault("value_head", "winner"); // Check that selected value head exists. 
- if (weights.value_heads.count(value_head) == 0) { + if (!weights.value_heads.contains(value_head)) { throw Exception("The value head you specified '" + value_head + "' does not exist in this net."); } @@ -457,7 +494,8 @@ class CudaNetwork : public Network { numBlocks_ > 0 ? kNumFilters : kInputPlanes, max_batch_size_, static_cast( file.format().network_format().input_embedding()) == - InputEmbedding::INPUT_EMBEDDING_PE_DENSE); + InputEmbedding::INPUT_EMBEDDING_PE_DENSE, + use_gemm_ex, use_fused_mha); network_.emplace_back(std::move(attention_body)); encoder_last_ = getLastLayer(); @@ -469,7 +507,7 @@ class CudaNetwork : public Network { if (attn_policy_) { auto AttentionPolicy = std::make_unique>( getLastLayer(), head, scratch_mem_, attn_body_, act, - max_batch_size_); + max_batch_size_, use_gemm_ex); network_.emplace_back(std::move(AttentionPolicy)); auto policymap = std::make_unique>( @@ -529,8 +567,8 @@ class CudaNetwork : public Network { pblczero::NetworkFormat::VALUE_WDL; BaseLayer* lastlayer = attn_body_ ? encoder_last_ : resi_last_; auto value_main = std::make_unique>( - lastlayer, head, scratch_mem_, attn_body_, wdl_, act, - max_batch_size_, use_gemm_ex); + lastlayer, head, scratch_mem_, attn_body_, wdl_, act, max_batch_size_, + use_gemm_ex); network_.emplace_back(std::move(value_main)); } @@ -591,18 +629,86 @@ class CudaNetwork : public Network { tensor_mem_size_ = multi_stream_ ? maxSize : 0; - // pre-allocate one InputsOutputs object - // The first call to allocate memory, create cublas, - // strem, etc takes really long (600 ms) - std::unique_ptr io = GetInputsOutputs(); + // pre-allocate cuda graphs for search threads + auto allocateCudaGraphs = [&] { + ReportCUDAErrors(cudaSetDevice(gpu_id_)); + CudaNetworkComputation comp(this, wdl_, moves_left_); + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + // Make sure cublas is initialized in this thread. + comp.ComputeBlocking(); + for (int i = 0; i < GetMiniBatchSize(); i++) { + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + auto lock = LockEval(); + comp.CaptureGraph(std::move(lock)); + } + }; + std::thread t2(allocateCudaGraphs); + allocateCudaGraphs(); + t2.join(); + } + + std::unique_lock LockEval() { + if (multi_stream_) { + return {}; + } else { + return std::unique_lock{lock_}; + } + } + + bool GetGraphCaptureEnabled() const { return enable_graph_capture_; } + + CudaGraphCapture BeginCapture(InputsOutputs& io) { + if (!multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + return {io, upload_stream_, download_stream_}; +#else + return {io, compute_stream_, download_stream_}; +#endif + } else { + return {io, io.upload_stream_, io.download_stream_}; + } } - void forwardEval(InputsOutputs* io, int batchSize) { + void UploadInputs(InputsOutputs* io, int batchSize) { + // Multu-stream can capture uploads without external events. 
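// Illustrative sketch, plain C++: LockEval above returns an owning lock only on
// the single-stream path; with multi_stream_ each InputsOutputs has its own
// streams and no global lock is needed. The conditional-lock idiom:
#include <mutex>

std::unique_lock<std::mutex> MaybeLock(std::mutex& m, bool need_lock) {
  return need_lock ? std::unique_lock<std::mutex>(m)
                   : std::unique_lock<std::mutex>();
}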
+ if (multi_stream_) return; + ReportCUDAErrors( + cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_, + batchSize * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaMemcpyAsync( + io->input_val_mem_gpu_, io->input_val_mem_, + batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream_, io->upload_done_event_, 0)); + } + + void GraphLaunch(InputsOutputs* io, int batchSize) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + io->cuda_graphs_[batchSize - 1].Launch(io->exec_stream_); +#else + if (!multi_stream_) { + UploadInputs(io, batchSize); + + io->cuda_graphs_[batchSize - 1].Launch(compute_stream_); + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, compute_stream_)); + } else { + io->cuda_graphs_[batchSize - 1].Launch(io->exec_stream_); + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, io->exec_stream_)); + } +#endif + } + + void forwardEval(InputsOutputs* io, int batchSize, + [[maybe_unused]] bool capture = false) { // It is safe to evaluate larger than the batchSize // as all buffers are designed to handle max_batch_size // and the extra invalid results are never read. if (batchSize < min_batch_size_) batchSize = min_batch_size_; - if (!multi_stream_) lock_.lock(); #ifdef DEBUG_RAW_NPS auto t_start = std::chrono::high_resolution_clock::now(); @@ -610,13 +716,13 @@ class CudaNetwork : public Network { // Expand packed planes to full planes. uint64_t* ipDataMasks = io->input_masks_mem_gpu_; - float* ipDataValues = io->input_val_mem_gpu_; + auto* ipDataValues = io->input_val_mem_gpu_; DataType* tensor_mem[3]; void* scratch_mem; DataType*** offset_pointers; DataType*** head_offset_pointers; - cudaStream_t stream; + cudaStream_t compute_stream, upload_stream, download_stream; cublasHandle_t cublas; if (multi_stream_) { // We use tensor and scratch memory from InputOutputs (so that multiple @@ -625,29 +731,49 @@ class CudaNetwork : public Network { scratch_mem = io->scratch_mem_; offset_pointers = (DataType***)&io->offset_pointers_; head_offset_pointers = (DataType***)&io->head_offset_pointers_; - stream = io->stream_; + compute_stream = io->compute_stream_; + upload_stream = io->upload_stream_; + download_stream = io->download_stream_; cublas = io->cublas_; } else { for (int i = 0; i < 3; i++) tensor_mem[i] = tensor_mem_[i]; scratch_mem = scratch_mem_; offset_pointers = (DataType***)&offset_pointers_; head_offset_pointers = (DataType***)&head_offset_pointers_; - stream = 0; // default stream + compute_stream = compute_stream_; + upload_stream = upload_stream_; + download_stream = download_stream_; cublas = cublas_; } - bool fp16 = std::is_same::value; - if (fp16) { - expandPlanes_Fp16_NCHW((half*)(tensor_mem[0]), ipDataMasks, ipDataValues, - batchSize * kInputPlanes, stream); - } else { - expandPlanes_Fp32_NCHW((float*)(tensor_mem[0]), ipDataMasks, ipDataValues, - batchSize * kInputPlanes, stream); + if (multi_stream_ || CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS) { + ReportCUDAErrors( + cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_, + batchSize * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, upload_stream)); + ReportCUDAErrors(cudaMemcpyAsync( + io->input_val_mem_gpu_, io->input_val_mem_, + batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]), + cudaMemcpyHostToDevice, upload_stream)); + 
ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream, io->upload_done_event_, 0)); } - float* opPol = io->op_policy_mem_gpu_; - float* opVal = io->op_value_mem_gpu_; - float* opMov = io->op_moves_left_mem_gpu_; + if (!multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream, compute_ordering_event_, + capture ? cudaEventWaitExternal : 0)); +#endif + } + + expandPlanes_NCHW(tensor_mem[0], ipDataMasks, ipDataValues, + batchSize * kInputPlanes, compute_stream); + + auto* opPol = io->op_policy_mem_gpu_; + auto* opVal = io->op_value_mem_gpu_; + auto* opMov = io->op_moves_left_mem_gpu_; // Figure out if the memory requirment for running the res block would fit // in the L2 cache. @@ -675,7 +801,8 @@ class CudaNetwork : public Network { // we can use a single alloc to hold all the required tensors, and enable // persistent L2 caching on it ReportCUDAErrors(cudaStreamSetAttribute( - stream, cudaStreamAttributeAccessPolicyWindow, &stream_attribute)); + compute_stream, cudaStreamAttributeAccessPolicyWindow, + &stream_attribute)); enableCacheOpt = true; skip_connection = @@ -693,7 +820,7 @@ class CudaNetwork : public Network { // Input. network_[l++]->Eval(batchSize, skip_connection, tensor_mem[0], nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // input conv + compute_stream); // input conv // Residual block. for (int block = 0; block < numBlocks_; block++) { @@ -701,15 +828,15 @@ class CudaNetwork : public Network { network_[l++]->Eval(batchSize, tensor_mem[2], skip_connection, nullptr, enableCacheOpt ? nullptr : scratch_mem, scratch_size_, nullptr, cublas, - stream); // block + compute_stream); // block } else { network_[l++]->Eval(batchSize, tensor_mem[0], tensor_mem[2], nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // conv1 + compute_stream); // conv1 network_[l++]->Eval(batchSize, tensor_mem[2], tensor_mem[0], tensor_mem[2], scratch_mem, scratch_size_, - nullptr, cublas, stream); // conv2 + nullptr, cublas, compute_stream); // conv2 } } @@ -723,7 +850,7 @@ class CudaNetwork : public Network { batchSize, tensor_mem[1], (numBlocks_ > 0) ? tensor_mem[2] : tensor_mem[0], (numBlocks_ > 0) ? 
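// Illustrative sketch, standalone: the access-policy-window calls above try to
// keep the residual-block tensors resident in persisting L2. Basic enable and
// reset sequence (the window size must respect the device's
// accessPolicyMaxWindowSize; function names are made up):
#include <cuda_runtime.h>
#include <cstddef>

void EnableL2Persistence(cudaStream_t stream, void* ptr, size_t bytes) {
  cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, bytes);
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.base_ptr = ptr;
  attr.accessPolicyWindow.num_bytes = bytes;
  attr.accessPolicyWindow.hitRatio = 1.0f;
  attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
  attr.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
}

void ResetL2Persistence(cudaStream_t stream) {
  cudaStreamAttrValue attr = {};
  attr.accessPolicyWindow.num_bytes = 0;  // clear the window
  cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
  cudaCtxResetPersistingL2Cache();        // evict persisting cache lines
}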
tensor_mem[0] : tensor_mem[2], scratch_mem, - scratch_size_, nullptr, cublas, stream, + scratch_size_, nullptr, cublas, compute_stream, offset_pointers); // Entire attention body of the network flow = tensor_mem[1]; @@ -735,7 +862,8 @@ class CudaNetwork : public Network { if (enableCacheOpt) { // reset the cache settings stream_attribute.accessPolicyWindow.num_bytes = 0; - cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, + cudaStreamSetAttribute(compute_stream, + cudaStreamAttributeAccessPolicyWindow, &stream_attribute); cudaCtxResetPersistingL2Cache(); } @@ -745,116 +873,131 @@ class CudaNetwork : public Network { if (attn_policy_) { network_[l++]->Eval( batchSize, spare1, flow, spare2, scratch_mem, scratch_size_, nullptr, - cublas, stream, + cublas, compute_stream, head_offset_pointers); // Entire Attention policy head except for the // policy map - if (fp16) { - network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // policy map layer - copyTypeConverted(opPol, (half*)spare2, batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy map layer // POLICY output - } + network_[l++]->Eval( + batchSize, (DataType*)opPol, spare1, nullptr, scratch_mem, + scratch_size_, nullptr, cublas, + compute_stream); // policy map layer // POLICY output } else if (conv_policy_) { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy conv1 + compute_stream); // policy conv1 network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy conv2 - - if (fp16) { - network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // policy map layer - copyTypeConverted(opPol, (half*)(spare1), batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, spare2, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // policy map layer // POLICY output - } + compute_stream); // policy conv2 + + network_[l++]->Eval( + batchSize, (DataType*)opPol, spare2, nullptr, scratch_mem, + scratch_size_, nullptr, cublas, + compute_stream); // policy map layer // POLICY output } else { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // pol conv - - if (fp16) { - network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // pol FC + compute_stream); // pol conv - copyTypeConverted(opPol, (half*)(spare2), batchSize * kNumOutputPolicy, - stream); // POLICY - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // pol FC // POLICY - } + network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, + scratch_mem, scratch_size_, nullptr, cublas, + compute_stream); // pol FC // POLICY } + ReportCUDAErrors(cudaEventRecord(io->policy_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->policy_done_event_, 0)); // Copy policy output from device memory to host memory. 
- ReportCUDAErrors( - cudaMemcpyAsync(io->op_policy_mem_, io->op_policy_mem_gpu_, - sizeof(float) * kNumOutputPolicy * batchSize, - cudaMemcpyDeviceToHost, stream)); + ReportCUDAErrors(cudaMemcpyAsync( + io->op_policy_mem_, io->op_policy_mem_gpu_, + sizeof(io->op_policy_mem_[0]) * kNumOutputPolicy * batchSize, + cudaMemcpyDeviceToHost, download_stream)); // value head - if (fp16) { - network_[l++]->Eval(batchSize, spare1, flow, spare2, scratch_mem, - scratch_size_, nullptr, cublas, - stream); // value head - copyTypeConverted(opVal, (half*)spare1, wdl_ ? 3 * batchSize : batchSize, - stream); - } else { - network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2, - scratch_mem, scratch_size_, nullptr, cublas, - stream); // value head + network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2, scratch_mem, + scratch_size_, nullptr, cublas, + compute_stream); // value head + if (!moves_left_ && !multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaEventRecordWithFlags(compute_ordering_event_, compute_stream, + capture ? cudaEventRecordExternal : 0)); +#endif + } + ReportCUDAErrors(cudaEventRecord(io->value_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->value_done_event_, 0)); + ReportCUDAErrors(cudaMemcpyAsync( + io->op_value_mem_, io->op_value_mem_gpu_, + sizeof(io->op_value_mem_[0]) * (wdl_ ? 3 : 1) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); + + if (wdl_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventRecordWithFlags( + io->wdl_download_done_event_, download_stream, + capture ? cudaEventRecordExternal : 0)); +#endif } if (moves_left_) { // Moves left head network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // moves conv or embedding + compute_stream); // moves conv or embedding network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, nullptr, cublas, - stream); // moves FC1 + compute_stream); // moves FC1 // Moves left FC2 - if (fp16) { - // TODO: consider fusing the bias-add of FC2 with format conversion. - network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, - scratch_size_, nullptr, cublas, stream); - copyTypeConverted(opMov, (half*)(spare1), batchSize, stream); - } else { - network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr, - scratch_mem, scratch_size_, nullptr, cublas, - stream); + network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr, + scratch_mem, scratch_size_, nullptr, cublas, + compute_stream); + if (!multi_stream_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaEventRecordWithFlags(compute_ordering_event_, compute_stream, + capture ? cudaEventRecordExternal : 0)); +#endif } + ReportCUDAErrors( + cudaEventRecord(io->moves_left_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->moves_left_done_event_, 0)); + ReportCUDAErrors( + cudaMemcpyAsync(io->op_moves_left_mem_, io->op_moves_left_mem_gpu_, + sizeof(io->op_moves_left_mem_[0]) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); } - - if (multi_stream_) { - ReportCUDAErrors(cudaStreamSynchronize(stream)); - } else { - ReportCUDAErrors(cudaDeviceSynchronize()); - // The next thread can start using the GPU now. - lock_.unlock(); +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors( + cudaEventRecordWithFlags(io->download_done_event_, download_stream, + capture ? 
cudaEventRecordExternal : 0)); +#else + if (!capture) { + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, download_stream)); } +#endif + } + void finishEval(InputsOutputs* io, int batchSize) { +#if !CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_)); +#endif if (wdl_) { +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventSynchronize(io->wdl_download_done_event_)); +#endif // Value softmax done cpu side. for (int i = 0; i < batchSize; i++) { - float w = io->op_value_mem_[3 * i + 0]; - float d = io->op_value_mem_[3 * i + 1]; - float l = io->op_value_mem_[3 * i + 2]; + float* wdl = sizeof(io->op_value_mem_[0]) == sizeof(float) + ? (float*)io->op_value_mem_ + : io->wdl_cpu_softmax_.get(); + float w = FromType(io->op_value_mem_[3 * i + 0]); + float d = FromType(io->op_value_mem_[3 * i + 1]); + float l = FromType(io->op_value_mem_[3 * i + 2]); float m = std::max({w, d, l}); w = std::exp(w - m); d = std::exp(d - m); @@ -862,12 +1005,14 @@ class CudaNetwork : public Network { float sum = w + d + l; w /= sum; l /= sum; - d = 1.0f - w - l; - io->op_value_mem_[3 * i + 0] = w; - io->op_value_mem_[3 * i + 1] = d; - io->op_value_mem_[3 * i + 2] = l; + d /= sum; + wdl[2 * i + 0] = w - l; + wdl[2 * i + 1] = d; } } +#if CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_)); +#endif } ~CudaNetwork() { @@ -879,7 +1024,11 @@ class CudaNetwork : public Network { if (offset_pointers_) ReportCUDAErrors(cudaFree(offset_pointers_)); if (head_offset_pointers_) ReportCUDAErrors(cudaFree(head_offset_pointers_)); - cublasDestroy(cublas_); + ReportCUBLASErrors(cublasDestroy(cublas_)); + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); + ReportCUDAErrors(cudaEventDestroy(compute_ordering_event_)); } } @@ -892,31 +1041,41 @@ class CudaNetwork : public Network { return 2 * sm_count_; } + int GetPreferredBatchStep() const override { + int preferred_split = 7; + while (sm_count_ % preferred_split != 0) preferred_split++; + return preferred_split; + } + int GetThreads() const override { return 1 + multi_stream_; } std::unique_ptr NewComputation() override { // Set correct gpu id for this computation (as it might have been called // from a different thread). - ReportCUDAErrors(cudaSetDevice(gpu_id_)); + int device = -1; + ReportCUDAErrors(cudaGetDevice(&device)); + if (device != gpu_id_) { + ReportCUDAErrors(cudaSetDevice(gpu_id_)); + } return std::make_unique>(this, wdl_, moves_left_); } - std::unique_ptr GetInputsOutputs() { + std::unique_ptr> GetInputsOutputs() { std::lock_guard lock(inputs_outputs_lock_); if (free_inputs_outputs_.empty()) { - return std::make_unique( + return std::make_unique>( max_batch_size_, wdl_, moves_left_, tensor_mem_size_, scratch_size_, !has_tensor_cores_ && std::is_same::value); } else { - std::unique_ptr resource = + std::unique_ptr> resource = std::move(free_inputs_outputs_.front()); free_inputs_outputs_.pop_front(); return resource; } } - void ReleaseInputsOutputs(std::unique_ptr resource) { + void ReleaseInputsOutputs(std::unique_ptr> resource) { std::lock_guard lock(inputs_outputs_lock_); free_inputs_outputs_.push_back(std::move(resource)); } @@ -925,7 +1084,7 @@ class CudaNetwork : public Network { // This function invokes constructor just to please complier and silence // warning. Is never called (but compiler thinks that it could). 
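// Illustrative sketch, plain C++: finishEval above runs the value-head softmax
// on the CPU and stores only (w - l, d) per position, which is exactly what
// GetQVal/GetDVal read back. Standalone version of that math:
#include <algorithm>
#include <cmath>

void WdlSoftmax(const float raw[3], float* q_out, float* d_out) {
  float m = std::max({raw[0], raw[1], raw[2]});
  float w = std::exp(raw[0] - m);
  float d = std::exp(raw[1] - m);
  float l = std::exp(raw[2] - m);
  float sum = w + d + l;
  *q_out = (w - l) / sum;  // Q = W - L
  *d_out = d / sum;        // draw probability
}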
void UglyFunctionToSilenceNvccWarning() { - InputsOutputs io(0, false, false, false); + InputsOutputs io(0, false, false, false); } private: @@ -935,6 +1094,7 @@ class CudaNetwork : public Network { int sm_count_; int max_batch_size_; int min_batch_size_; + bool enable_graph_capture_; bool wdl_; bool moves_left_; bool use_res_block_winograd_fuse_opt_; // fuse operations inside the residual @@ -971,11 +1131,15 @@ class CudaNetwork : public Network { bool has_tensor_cores_; // not used when multi-steam is enabled + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; + cudaEvent_t compute_ordering_event_ = nullptr; cublasHandle_t cublas_; DataType* tensor_mem_[3]; mutable std::mutex inputs_outputs_lock_; - std::list> free_inputs_outputs_; + std::list>> free_inputs_outputs_; void showInfo() const { int version; @@ -996,9 +1160,12 @@ class CudaNetwork : public Network { major = CUDART_VERSION / 1000; minor = (CUDART_VERSION - major * 1000) / 10; pl = CUDART_VERSION - major * 1000 - minor * 10; - CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " - "version " - << major << "." << minor << "." << pl; + // After cuda 11, newer version with same major is OK. + if (major < 11 || (major != version / 1000) || version < CUDART_VERSION) { + CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " + "version " + << major << "." << minor << "." << pl; + } } cudaDriverGetVersion(&version); major = version / 1000; @@ -1011,11 +1178,27 @@ class CudaNetwork : public Network { } } - void showDeviceInfo(const cudaDeviceProp& deviceProp) const { + void showDeviceInfo(const cudaDeviceProp& deviceProp, + [[maybe_unused]] int deviceId) const { CERR << "GPU: " << deviceProp.name; CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30) << " Gb"; - CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz"; + // Get clock rate + float clockRateMHz; +#if CUDART_VERSION >= 13000 + int clockRatekHz; + cudaError_t err = + cudaDeviceGetAttribute(&clockRatekHz, cudaDevAttrClockRate, deviceId); + if (err != cudaSuccess) { + CERR << "Error getting clock rate: " << cudaGetErrorString(err); + clockRateMHz = 0.0f; // Fallback value + } else { + clockRateMHz = clockRatekHz / 1e3f; + } +#else + clockRateMHz = deviceProp.clockRate / 1e3f; +#endif + CERR << "GPU clock frequency: " << clockRateMHz << " MHz"; CERR << "GPU compute capability: " << deviceProp.major << "." 
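// Illustrative sketch, standalone: the CaptureGraph/ComputeBlocking pair below
// keeps one instantiated graph per batch size and falls back to a normal eval
// (which also performs the capture) the first time a batch size is seen.
// Simplified cache (struct name is made up):
#include <cuda_runtime.h>
#include <vector>

struct GraphCache {
  std::vector<cudaGraphExec_t> per_batch;  // index = batch_size - 1
  explicit GraphCache(int max_batch) : per_batch(max_batch, nullptr) {}

  // Returns false when no graph has been captured for this batch size yet.
  bool Launch(int batch_size, cudaStream_t stream) {
    cudaGraphExec_t exec = per_batch[batch_size - 1];
    if (exec == nullptr) return false;
    cudaGraphLaunch(exec, stream);
    return true;
  }
};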
<< deviceProp.minor; CERR << "L2 cache capacity: " << deviceProp.l2CacheSize; @@ -1039,9 +1222,40 @@ CudaNetworkComputation::~CudaNetworkComputation() { network_->ReleaseInputsOutputs(std::move(inputs_outputs_)); } +template +void CudaNetworkComputation::CaptureGraph( + std::unique_lock&& lock) { + if (!network_->GetGraphCaptureEnabled()) return; + if (!CudaGraphCapture::EnsureEnoughFreeMemory()) { + static std::once_flag flag; + std::call_once(flag, []() { + CERR << "WARNING: Not enough GPU memory to capture CUDA graphs."; + }); + return; + } + auto capture = network_->BeginCapture(*inputs_outputs_); + network_->forwardEval(inputs_outputs_.get(), GetBatchSize(), true); + capture.EndCapture(); + if (lock.owns_lock()) lock.unlock(); + inputs_outputs_->cuda_graphs_[GetBatchSize() - 1] = capture; +} + template void CudaNetworkComputation::ComputeBlocking() { - network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + LCTRACE_FUNCTION_SCOPE; + assert(GetBatchSize() >= 1); + if (inputs_outputs_->cuda_graphs_[GetBatchSize() - 1]) { + std::unique_lock lock = network_->LockEval(); + network_->GraphLaunch(inputs_outputs_.get(), GetBatchSize()); + } else { + std::unique_lock lock = network_->LockEval(); +#if !CUDA_GRAPH_SUPPORTS_EXTERNAL_EVENTS + network_->UploadInputs(inputs_outputs_.get(), GetBatchSize()); +#endif + network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + CaptureGraph(std::move(lock)); + } + network_->finishEval(inputs_outputs_.get(), GetBatchSize()); } template diff --git a/src/neural/backends/cuda/network_cudnn.cc b/src/neural/backends/cuda/network_cudnn.cc index d7b15147a2..edf7b592e6 100644 --- a/src/neural/backends/cuda/network_cudnn.cc +++ b/src/neural/backends/cuda/network_cudnn.cc @@ -26,7 +26,6 @@ */ #include #include -#include #include #include #include @@ -39,8 +38,8 @@ #include "neural/network_legacy.h" #include "neural/tables/attention_policy_map.h" #include "neural/tables/policy_map.h" -#include "utils/bititer.h" #include "utils/exception.h" +#include "utils/fp16_utils.h" // #define DEBUG_RAW_NPS @@ -99,11 +98,10 @@ class CudnnNetworkComputation : public NetworkComputation { const auto iter_val = &inputs_outputs_->input_val_mem_[batch_size_ * kInputPlanes]; - int i = 0; - for (const auto& plane : input) { + for (int i = 0; i < kInputPlanes; i++) { + const auto& plane = input[i]; iter_mask[i] = plane.mask; - iter_val[i] = plane.value; - i++; + ToType(iter_val[i], plane.value); } batch_size_++; @@ -111,41 +109,47 @@ class CudnnNetworkComputation : public NetworkComputation { void ComputeBlocking() override; + void CaptureGraph(std::unique_lock&& lock = {}); + int GetBatchSize() const override { return batch_size_; } float GetQVal(int sample) const override { if (wdl_) { - auto w = inputs_outputs_->op_value_mem_[3 * sample + 0]; - auto l = inputs_outputs_->op_value_mem_[3 * sample + 2]; - return w - l; - } else { - return inputs_outputs_->op_value_mem_[sample]; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? (float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample]; } + return FromType(inputs_outputs_->op_value_mem_[sample]); } float GetDVal(int sample) const override { if (wdl_) { - auto d = inputs_outputs_->op_value_mem_[3 * sample + 1]; - return d; - } else { - return 0.0f; + const float* wdl = + sizeof(inputs_outputs_->op_value_mem_[0]) == sizeof(float) + ? 
(float*)inputs_outputs_->op_value_mem_ + : inputs_outputs_->wdl_cpu_softmax_.get(); + return wdl[2 * sample + 1]; } + return 0.0f; } float GetPVal(int sample, int move_id) const override { - return inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]; + return FromType( + inputs_outputs_->op_policy_mem_[sample * kNumOutputPolicy + move_id]); } float GetMVal(int sample) const override { if (moves_left_) { - return inputs_outputs_->op_moves_left_mem_[sample]; + return FromType(inputs_outputs_->op_moves_left_mem_[sample]); } return 0.0f; } private: // Memory holding inputs, outputs. - std::unique_ptr inputs_outputs_; + std::unique_ptr> inputs_outputs_; int batch_size_; bool wdl_; bool moves_left_; @@ -162,6 +166,7 @@ class CudnnNetwork : public Network { file.format().network_format().moves_left()} { MultiHeadWeights weights(file.weights()); gpu_id_ = options.GetOrDefault("gpu", 0); + enable_graph_capture_ = options.GetOrDefault("graph_capture", true); conv_policy_ = file.format().network_format().policy() == pblczero::NetworkFormat::POLICY_CONVOLUTION; @@ -189,7 +194,7 @@ class CudnnNetwork : public Network { cudaDeviceProp deviceProp = {}; cudaGetDeviceProperties(&deviceProp, gpu_id_); - showDeviceInfo(deviceProp); + showDeviceInfo(deviceProp, gpu_id_); // Select GPU to run on (for *the current* thread). ReportCUDAErrors(cudaSetDevice(gpu_id_)); @@ -229,6 +234,17 @@ class CudnnNetwork : public Network { // Override if forced from backend option if (options.Exists("nhwc")) nhwc_ = options.Get("nhwc"); } + ReportCUDAErrors( + cudaStreamCreateWithFlags(&compute_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&upload_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors( + cudaStreamCreateWithFlags(&download_stream_, cudaStreamNonBlocking)); + ReportCUDAErrors(cudaEventCreateWithFlags(&compute_ordering_event_, + cudaEventDisableTiming)); + + ReportCUBLASErrors(cublasSetStream(cublas_, compute_stream_)); + ReportCUDNNErrors(cudnnSetStream(cudnn_, compute_stream_)); if (hasTensorCores) ReportCUBLASErrors(cublasSetMathMode( @@ -527,7 +543,7 @@ class CudnnNetwork : public Network { if (attn_policy_) { auto AttentionPolicy = std::make_unique>( getLastLayer(), head, scratch_mem_, false, ACTIVATION_SELU, - max_batch_size_); + max_batch_size_, use_gemm_ex); network_.emplace_back(std::move(AttentionPolicy)); auto policymap = std::make_unique>( @@ -586,8 +602,7 @@ class CudnnNetwork : public Network { auto FCVal1 = std::make_unique>( getLastLayer(), head.ip1_val_b.size(), 1, 1, true, mish_net ? ACTIVATION_MISH : ACTIVATION_RELU); - FCVal1->LoadWeights(&head.ip1_val_w[0], &head.ip1_val_b[0], - scratch_mem_); + FCVal1->LoadWeights(&head.ip1_val_w[0], &head.ip1_val_b[0], scratch_mem_); network_.emplace_back(std::move(FCVal1)); wdl_ = file.format().network_format().value() == @@ -597,8 +612,7 @@ class CudnnNetwork : public Network { auto FCVal2 = std::make_unique>( getLastLayer(), head.ip2_val_b.size(), 1, 1, true, fc2_tanh ? 
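GetQVal()/GetDVal() above read a float (Q, D) pair that finishEval() prepares: when DataType is already float the raw value output buffer is reused in place, otherwise the pair comes from a separate CPU-side array. A small sketch of that selection (WdlBuffer is an illustrative name, not part of the backend):

    // Pick the buffer holding the post-softmax (Q, D) pairs.
    template <typename DataType>
    const float* WdlBuffer(const DataType* op_value_mem,
                           const float* cpu_softmax) {
      return sizeof(DataType) == sizeof(float)
                 ? reinterpret_cast<const float*>(op_value_mem)
                 : cpu_softmax;
    }
    // GetQVal(sample) then reads WdlBuffer(...)[2 * sample] and
    // GetDVal(sample) reads WdlBuffer(...)[2 * sample + 1].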
ACTIVATION_TANH : ACTIVATION_NONE); - FCVal2->LoadWeights(&head.ip2_val_w[0], &head.ip2_val_b[0], - scratch_mem_); + FCVal2->LoadWeights(&head.ip2_val_w[0], &head.ip2_val_b[0], scratch_mem_); network_.emplace_back(std::move(FCVal2)); } value_out_ = getLastLayer(); @@ -664,45 +678,94 @@ class CudnnNetwork : public Network { CERR << "allocated " << 3 * maxSize << " bytes of GPU memory to run the network"; #endif + + // pre-allocate cuda graphs for search threads + auto allocateCudaGraphs = [&] { + CudnnNetworkComputation comp(this, wdl_, moves_left_); + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + // Make sure cublas is initialized in this thread. + comp.ComputeBlocking(); + for (int i = 0; i < GetMiniBatchSize(); i++) { + comp.AddInput(InputPlanes{(size_t)kNumInputPlanes}); + auto lock = LockEval(); + comp.CaptureGraph(std::move(lock)); + } + }; + std::thread t2(allocateCudaGraphs); + allocateCudaGraphs(); + t2.join(); } - void forwardEval(InputsOutputs* io, int batchSize) { + std::unique_lock LockEval() { + return std::unique_lock{lock_}; + } + + bool GetGraphCaptureEnabled() const { return enable_graph_capture_; } + + CudaGraphCapture BeginCapture(InputsOutputs& io) { + return {io, compute_stream_, download_stream_}; + } + + void UploadInputs(InputsOutputs* io, int batchSize) { + ReportCUDAErrors( + cudaMemcpyAsync(io->input_masks_mem_gpu_, io->input_masks_mem_, + batchSize * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaMemcpyAsync( + io->input_val_mem_gpu_, io->input_val_mem_, + batchSize * kInputPlanes * sizeof(io->input_val_mem_[0]), + cudaMemcpyHostToDevice, upload_stream_)); + ReportCUDAErrors(cudaEventRecord(io->upload_done_event_, upload_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(compute_stream_, io->upload_done_event_, 0)); + } + + void GraphLaunch(InputsOutputs* io, int batchSize) { + UploadInputs(io, batchSize); + + // cudaGraphUpload was added in CUDA 11.1 +#if CUDART_VERSION >= 11010 + // Make sure graph has completed upload before launching it. + ReportCUDAErrors(cudaStreamSynchronize(io->exec_stream_)); +#endif + + io->cuda_graphs_[batchSize - 1].Launch(compute_stream_); + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, compute_stream_)); + } + + void forwardEval(InputsOutputs* io, int batchSize, + bool capture = false) { // It is safe to evaluate larger than the batchSize // as all buffers are designed to handle max_batch_size // and the extra invalid results are never read. if (batchSize < min_batch_size_) batchSize = min_batch_size_; - std::unique_lock lock(lock_); #ifdef DEBUG_RAW_NPS auto t_start = std::chrono::high_resolution_clock::now(); #endif // TODO: consider supporting multi-stream path for cudnn backend too. - cudaStream_t stream = 0; // default stream + cudaStream_t compute_stream = compute_stream_; + cudaStream_t download_stream = download_stream_; // Expand packed planes to full planes. 
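UploadInputs() above keeps host-to-device copies off the compute stream: the copies run on a dedicated upload stream and an event makes the compute stream wait for them without blocking the host. The pattern in isolation (names are illustrative):

    #include <cuda_runtime.h>

    void AsyncUpload(const void* host, void* dev, size_t bytes,
                     cudaStream_t upload, cudaStream_t compute,
                     cudaEvent_t done) {
      cudaMemcpyAsync(dev, host, bytes, cudaMemcpyHostToDevice, upload);
      cudaEventRecord(done, upload);          // marks completion of the copies
      cudaStreamWaitEvent(compute, done, 0);  // compute waits; the host does not
    }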
- uint64_t* ipDataMasks = io->input_masks_mem_gpu_; - float* ipDataValues = io->input_val_mem_gpu_; + const uint64_t* ipDataMasks = io->input_masks_mem_gpu_; + const auto* ipDataValues = io->input_val_mem_gpu_; - bool fp16 = std::is_same::value; - if (fp16) { - if (nhwc_) - expandPlanes_Fp16_NHWC((half*)(tensor_mem_[0]), ipDataMasks, - ipDataValues, batchSize * kInputPlanes, stream); - else - expandPlanes_Fp16_NCHW((half*)(tensor_mem_[0]), ipDataMasks, - ipDataValues, batchSize * kInputPlanes, stream); - } else { - expandPlanes_Fp32_NCHW((float*)(tensor_mem_[0]), ipDataMasks, - ipDataValues, batchSize * kInputPlanes, stream); - } + if (nhwc_) + expandPlanes_NHWC(tensor_mem_[0], ipDataMasks, ipDataValues, + batchSize * kInputPlanes, compute_stream); + else + expandPlanes_NCHW(tensor_mem_[0], ipDataMasks, ipDataValues, + batchSize * kInputPlanes, compute_stream); // debug code example // dumpTensor(tensor_mem_[0], 1024, "After expand Planes", fp16); - float* opPol = io->op_policy_mem_gpu_; - float* opVal = io->op_value_mem_gpu_; - float* opMov = io->op_moves_left_mem_gpu_; + auto* opPol = io->op_policy_mem_gpu_; + auto* opVal = io->op_value_mem_gpu_; + auto* opMov = io->op_moves_left_mem_gpu_; int l = 0; // Input. @@ -710,40 +773,40 @@ class CudnnNetwork : public Network { batchSize, use_res_block_winograd_fuse_opt_ ? tensor_mem_[1] : tensor_mem_[2], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // input conv + compute_stream); // input conv // Residual block. for (int block = 0; block < numBlocks_; block++) { if (use_res_block_winograd_fuse_opt_) { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // block + compute_stream); // block } else { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // conv1 + compute_stream); // conv1 if (use_custom_winograd_) { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0], tensor_mem_[2], scratch_mem_, scratch_size_, - cudnn_, cublas_, stream); // conv2 + cudnn_, cublas_, compute_stream); // conv2 } else { // For SE Resnet, skip connection is added after SE (and bias is added // as part of SE). 
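The expandPlanes_* kernels now used for both fp16 and fp32 perform the same expansion the Metal backend previously did on the CPU: each (mask, value) pair becomes 64 floats, one per square, with the value written only where the bitboard has a bit set. A CPU reference, for orientation only:

    #include <cstdint>
    #include <vector>

    // Expand packed (mask, value) planes into dense 64-float planes.
    std::vector<float> ExpandPlanes(const std::vector<uint64_t>& masks,
                                    const std::vector<float>& values) {
      std::vector<float> out(masks.size() * 64, 0.0f);
      for (size_t p = 0; p < masks.size(); ++p) {
        for (int sq = 0; sq < 64; ++sq) {
          if (masks[p] & (1ULL << sq)) out[p * 64 + sq] = values[p];
        }
      }
      return out;
    }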
if (has_se_) { network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, stream); // conv2 + cublas_, compute_stream); // conv2 } else { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0], tensor_mem_[2], scratch_mem_, scratch_size_, - cudnn_, cublas_, stream); // conv2 + cudnn_, cublas_, compute_stream); // conv2 } if (has_se_) { network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], tensor_mem_[2], scratch_mem_, scratch_size_, - cudnn_, cublas_, stream); // SE layer + cudnn_, cublas_, compute_stream); // SE layer } } } @@ -753,125 +816,110 @@ class CudnnNetwork : public Network { if (attn_policy_) { network_[l++]->Eval( batchSize, tensor_mem_[0], tensor_mem_[2], tensor_mem_[1], - scratch_mem_, scratch_size_, nullptr, cublas_, stream, + scratch_mem_, scratch_size_, nullptr, cublas_, compute_stream, &head_offset_pointers_); // Entire Attention policy head except for // the policy map - if (fp16) { - network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, - scratch_mem_, scratch_size_, nullptr, cublas_, - stream); // policy map layer - copyTypeConverted(opPol, (half*)(tensor_mem_[1]), - batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], - nullptr, scratch_mem_, scratch_size_, nullptr, - cublas_, stream); // policy map layer - // POLICY output - } + network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], nullptr, + scratch_mem_, scratch_size_, nullptr, cublas_, + compute_stream); // policy map layer + // POLICY output } else if (conv_policy_) { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // policy conv1 + compute_stream); // policy conv1 network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // policy conv2 + compute_stream); // policy conv2 - if (fp16) { - network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // policy map layer - copyTypeConverted(opPol, (half*)(tensor_mem_[0]), - batchSize * kNumOutputPolicy, - stream); // POLICY output - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[1], - nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, - stream); // policy map layer // POLICY output - } + network_[l++]->Eval( + batchSize, (DataType*)opPol, tensor_mem_[1], nullptr, scratch_mem_, + scratch_size_, cudnn_, cublas_, + compute_stream); // policy map layer // POLICY output } else { network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // pol conv - - if (fp16) { - network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // pol FC + compute_stream); // pol conv - copyTypeConverted(opPol, (half*)(tensor_mem_[1]), - batchSize * kNumOutputPolicy, stream); // POLICY - } else { - network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], - nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, stream); // pol FC // POLICY - } + network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], nullptr, + scratch_mem_, scratch_size_, cudnn_, cublas_, + compute_stream); // pol FC // POLICY } + ReportCUDAErrors(cudaEventRecord(io->policy_done_event_, compute_stream)); + ReportCUDAErrors( + 
cudaStreamWaitEvent(download_stream, io->policy_done_event_, 0)); + // Copy policy output from device memory to host memory. ReportCUDAErrors(cudaMemcpyAsync( io->op_policy_mem_, io->op_policy_mem_gpu_, - sizeof(float) * kNumOutputPolicy * batchSize, cudaMemcpyDeviceToHost)); + sizeof(io->op_policy_mem_[0]) * kNumOutputPolicy * batchSize, + cudaMemcpyDeviceToHost, download_stream)); // value head network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value conv + compute_stream); // value conv network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value FC1 + compute_stream); // value FC1 - if (fp16) { - // TODO: consider fusing the bias-add of FC2 with format conversion. - network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value FC2 - copyTypeConverted(opVal, (half*)(tensor_mem_[0]), - wdl_ ? 3 * batchSize : batchSize, stream); // VALUE - } else { - network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // value FC2 // VALUE - } + network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[1], nullptr, + scratch_mem_, scratch_size_, cudnn_, cublas_, + compute_stream); // value FC2 // VALUE + + ReportCUDAErrors(cudaEventRecord(io->value_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->value_done_event_, 0)); + ReportCUDAErrors(cudaMemcpyAsync( + io->op_value_mem_, io->op_value_mem_gpu_, + sizeof(io->op_value_mem_[0]) * (wdl_ ? 3 : 1) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); if (moves_left_) { // Moves left head network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // moves conv + compute_stream); // moves conv network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr, scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); // moves FC1 + compute_stream); // moves FC1 // Moves left FC2 - if (fp16) { - // TODO: consider fusing the bias-add of FC2 with format conversion. - network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr, - scratch_mem_, scratch_size_, cudnn_, cublas_, - stream); - copyTypeConverted(opMov, (half*)(tensor_mem_[0]), batchSize, stream); - } else { - network_[l++]->Eval(batchSize, (DataType*)opMov, tensor_mem_[1], - nullptr, scratch_mem_, scratch_size_, cudnn_, - cublas_, stream); - } + network_[l++]->Eval(batchSize, (DataType*)opMov, tensor_mem_[1], nullptr, + scratch_mem_, scratch_size_, cudnn_, cublas_, + compute_stream); + + ReportCUDAErrors( + cudaEventRecord(io->moves_left_done_event_, compute_stream)); + ReportCUDAErrors( + cudaStreamWaitEvent(download_stream, io->moves_left_done_event_, 0)); + ReportCUDAErrors( + cudaMemcpyAsync(io->op_moves_left_mem_, io->op_moves_left_mem_gpu_, + sizeof(io->op_moves_left_mem_[0]) * batchSize, + cudaMemcpyDeviceToHost, download_stream)); } - ReportCUDAErrors(cudaDeviceSynchronize()); - // The next thread can start using the GPU now. - lock.unlock(); + if (!capture) { + ReportCUDAErrors( + cudaEventRecord(io->download_done_event_, download_stream)); + } + } + void finishEval(InputsOutputs* io, int batchSize) { + ReportCUDAErrors(cudaEventSynchronize(io->download_done_event_)); if (wdl_) { // Value softmax done cpu side. 
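Each output head now records its own event on the compute stream so the download stream can start copying that head's results while later heads are still being evaluated; the host only blocks once, in finishEval(), on the final download event. The recurring step looks roughly like this (names are illustrative):

    #include <cuda_runtime.h>

    void DownloadWhenReady(cudaStream_t compute, cudaStream_t download,
                           cudaEvent_t head_done, void* host, const void* dev,
                           size_t bytes) {
      cudaEventRecord(head_done, compute);          // head finished on compute
      cudaStreamWaitEvent(download, head_done, 0);  // download waits for it
      cudaMemcpyAsync(host, dev, bytes, cudaMemcpyDeviceToHost, download);
    }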
for (int i = 0; i < batchSize; i++) { - float w = io->op_value_mem_[3 * i + 0]; - float d = io->op_value_mem_[3 * i + 1]; - float l = io->op_value_mem_[3 * i + 2]; + float* wdl = sizeof(io->op_value_mem_[0]) == sizeof(float) + ? (float*)io->op_value_mem_ + : io->wdl_cpu_softmax_.get(); + float w = FromType(io->op_value_mem_[3 * i + 0]); + float d = FromType(io->op_value_mem_[3 * i + 1]); + float l = FromType(io->op_value_mem_[3 * i + 2]); float m = std::max({w, d, l}); w = std::exp(w - m); d = std::exp(d - m); @@ -879,10 +927,9 @@ class CudnnNetwork : public Network { float sum = w + d + l; w /= sum; l /= sum; - d = 1.0f - w - l; - io->op_value_mem_[3 * i + 0] = w; - io->op_value_mem_[3 * i + 1] = d; - io->op_value_mem_[3 * i + 2] = l; + d /= sum; + wdl[2 * i + 0] = w - l; + wdl[2 * i + 1] = d; } } @@ -921,6 +968,9 @@ class CudnnNetwork : public Network { ReportCUDAErrors(cudaFree(head_offset_pointers_)); cudnnDestroy(cudnn_); cublasDestroy(cublas_); + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); } const NetworkCapabilities& GetCapabilities() const override { @@ -930,25 +980,29 @@ class CudnnNetwork : public Network { std::unique_ptr NewComputation() override { // Set correct gpu id for this computation (as it might have been called // from a different thread). - ReportCUDAErrors(cudaSetDevice(gpu_id_)); + int device = -1; + ReportCUDAErrors(cudaGetDevice(&device)); + if (device != gpu_id_) { + ReportCUDAErrors(cudaSetDevice(gpu_id_)); + } return std::make_unique>(this, wdl_, moves_left_); } - std::unique_ptr GetInputsOutputs() { + std::unique_ptr> GetInputsOutputs() { std::lock_guard lock(inputs_outputs_lock_); if (free_inputs_outputs_.empty()) { - return std::make_unique(max_batch_size_, wdl_, - moves_left_); + return std::make_unique>(max_batch_size_, wdl_, + moves_left_); } else { - std::unique_ptr resource = + std::unique_ptr> resource = std::move(free_inputs_outputs_.front()); free_inputs_outputs_.pop_front(); return resource; } } - void ReleaseInputsOutputs(std::unique_ptr resource) { + void ReleaseInputsOutputs(std::unique_ptr> resource) { std::lock_guard lock(inputs_outputs_lock_); free_inputs_outputs_.push_back(std::move(resource)); } @@ -956,7 +1010,9 @@ class CudnnNetwork : public Network { // Apparently nvcc doesn't see constructor invocations through make_unique. // This function invokes constructor just to please complier and silence // warning. Is never called (but compiler thinks that it could). 
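The CPU-side value softmax in finishEval() now stores the (W-L, D) pair the search consumes instead of rewriting the three probabilities. Pulled out as a standalone function, it is a numerically stable three-way softmax:

    #include <algorithm>
    #include <cmath>

    // Convert raw (w, d, l) logits into expected outcome Q = W-L and draw
    // probability D.
    void WdlSoftmax(float w, float d, float l, float* q_out, float* d_out) {
      const float m = std::max({w, d, l});
      w = std::exp(w - m);
      d = std::exp(d - m);
      l = std::exp(l - m);
      const float sum = w + d + l;
      *q_out = (w - l) / sum;
      *d_out = d / sum;
    }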
- void UglyFunctionToSilenceNvccWarning() { InputsOutputs io(0, false, false, false); } + void UglyFunctionToSilenceNvccWarning() { + InputsOutputs io(0, false, false, false); + } private: const NetworkCapabilities capabilities_; @@ -965,6 +1021,7 @@ class CudnnNetwork : public Network { int gpu_id_; int max_batch_size_; int min_batch_size_; + bool enable_graph_capture_; bool wdl_; bool moves_left_; @@ -999,7 +1056,12 @@ class CudnnNetwork : public Network { size_t scratch_size_; mutable std::mutex inputs_outputs_lock_; - std::list> free_inputs_outputs_; + std::list>> free_inputs_outputs_; + + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; + cudaEvent_t compute_ordering_event_ = nullptr; void showInfo() const { int version; @@ -1020,16 +1082,20 @@ class CudnnNetwork : public Network { major = CUDART_VERSION / 1000; minor = (CUDART_VERSION - major * 1000) / 10; pl = CUDART_VERSION - major * 1000 - minor * 10; - CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " - "version " - << major << "." << minor << "." << pl; + // After cuda 11, newer version with same major is OK. + if (major < 11 || (major != version / 1000) || version < CUDART_VERSION) { + CERR << "WARNING: CUDA Runtime version mismatch, was compiled with " + "version " + << major << "." << minor << "." << pl; + } } version = (int)cudnnGetVersion(); major = version / 1000; minor = (version - major * 1000) / 100; pl = version - major * 1000 - minor * 100; CERR << "Cudnn version: " << major << "." << minor << "." << pl; - if (version != CUDNN_VERSION) { + // Assuming CUDNN > 7. + if (major != CUDNN_MAJOR || minor < CUDNN_MINOR) { CERR << "WARNING: CUDNN Runtime version mismatch, was compiled with " "version " << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL; @@ -1045,11 +1111,27 @@ class CudnnNetwork : public Network { } } - void showDeviceInfo(const cudaDeviceProp& deviceProp) const { + void showDeviceInfo(const cudaDeviceProp& deviceProp, + [[maybe_unused]] int deviceId) const { CERR << "GPU: " << deviceProp.name; CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30) << " GiB"; - CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz"; + // Get clock rate + float clockRateMHz; +#if CUDART_VERSION >= 13000 + int clockRatekHz; + cudaError_t err = + cudaDeviceGetAttribute(&clockRatekHz, cudaDevAttrClockRate, deviceId); + if (err != cudaSuccess) { + CERR << "Error getting clock rate: " << cudaGetErrorString(err); + clockRateMHz = 0.0f; // Fallback value + } else { + clockRateMHz = clockRatekHz / 1e3f; + } +#else + clockRateMHz = deviceProp.clockRate / 1e3f; +#endif + CERR << "GPU clock frequency: " << clockRateMHz << " MHz"; CERR << "GPU compute capability: " << deviceProp.major << "." 
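The clock-frequency report is handled the same way in both backends: with CUDA 13 toolkits the code no longer reads deviceProp.clockRate and instead queries the attribute API, which reports kHz. As a self-contained helper (assuming the attribute is available on the device):

    #include <cuda_runtime.h>

    float ClockRateMHz(int device_id) {
      int khz = 0;
      if (cudaDeviceGetAttribute(&khz, cudaDevAttrClockRate, device_id) !=
          cudaSuccess) {
        return 0.0f;  // fall back to "unknown", as the warning path above does
      }
      return khz / 1e3f;
    }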
<< deviceProp.minor; @@ -1078,9 +1160,37 @@ CudnnNetworkComputation::~CudnnNetworkComputation() { network_->ReleaseInputsOutputs(std::move(inputs_outputs_)); } +template +void CudnnNetworkComputation::CaptureGraph( + std::unique_lock&& lock) { + if (!network_->GetGraphCaptureEnabled()) return; + if (!CudaGraphCapture::EnsureEnoughFreeMemory()) { + static std::once_flag flag; + std::call_once(flag, []() { + CERR << "WARNING: Not enough GPU memory to capture CUDA graphs."; + }); + return; + } + CudaGraphCapture capture = network_->BeginCapture(*inputs_outputs_); + network_->forwardEval(inputs_outputs_.get(), GetBatchSize(), true); + capture.EndCapture(); + if (lock.owns_lock()) lock.unlock(); + inputs_outputs_->cuda_graphs_[GetBatchSize() - 1] = capture; +} + template void CudnnNetworkComputation::ComputeBlocking() { - network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + assert(GetBatchSize() >= 1); + if (inputs_outputs_->cuda_graphs_[GetBatchSize() - 1]) { + std::unique_lock lock = network_->LockEval(); + network_->GraphLaunch(inputs_outputs_.get(), GetBatchSize()); + } else { + std::unique_lock lock = network_->LockEval(); + network_->UploadInputs(inputs_outputs_.get(), GetBatchSize()); + network_->forwardEval(inputs_outputs_.get(), GetBatchSize()); + CaptureGraph(std::move(lock)); + } + network_->finishEval(inputs_outputs_.get(), GetBatchSize()); } template diff --git a/src/neural/backends/cuda/winograd_helper.inc b/src/neural/backends/cuda/winograd_helper.inc index 72e9828bb9..749181eee4 100644 --- a/src/neural/backends/cuda/winograd_helper.inc +++ b/src/neural/backends/cuda/winograd_helper.inc @@ -843,14 +843,15 @@ __global__ __launch_bounds__( } template -void FilterTransform(int N, int C, T* transformedFilter, const T* filter) { +void FilterTransform(int N, int C, T* transformedFilter, const T* filter, + cudaStream_t stream) { // Each thread processes entire filter block (input 3x3 elements -> output 6x6 // elements) const int kBlockSize = 64; const int kBlocks = DivUp(N * C, kBlockSize); - filterTransform_kernel<<>>(N, C, N * C, - transformedFilter, filter); + filterTransform_kernel<<>>( + N, C, N * C, transformedFilter, filter); ReportCUDAErrors(cudaGetLastError()); } diff --git a/src/neural/backends/metal/metal_common.h b/src/neural/backends/metal/metal_common.h index a42c00dcac..0c76d7395b 100644 --- a/src/neural/backends/metal/metal_common.h +++ b/src/neural/backends/metal/metal_common.h @@ -36,14 +36,13 @@ static int kInputPlanes = 112; struct InputsOutputs { InputsOutputs(int maxBatchSize, bool wdl, bool moves_left, bool conv_policy, bool attn_policy) { - input_masks_mem_.reserve(maxBatchSize * kInputPlanes); - input_val_mem_.reserve(maxBatchSize * kInputPlanes); - input_val_mem_expanded_.reserve(maxBatchSize * kInputPlanes * 64); - op_policy_mem_.reserve(maxBatchSize * kNumOutputPolicy); - op_value_mem_.reserve(maxBatchSize * (wdl ? 3 : 1)); + input_masks_mem_.resize(maxBatchSize * kInputPlanes); + input_val_mem_.resize(maxBatchSize * kInputPlanes); + op_policy_mem_.resize(maxBatchSize * kNumOutputPolicy); + op_value_mem_.resize(maxBatchSize * (wdl ? 3 : 1)); if (moves_left) { - op_moves_left_mem_.reserve(maxBatchSize); + op_moves_left_mem_.resize(maxBatchSize); }; /** @@ -53,16 +52,15 @@ struct InputsOutputs { * Remove this op_policy_raw_mem_ memory allocation when bug is fixed. 
*/ if (attn_policy) { - op_policy_raw_mem_.reserve(maxBatchSize * (64 * 64 + 8 * 24)); + op_policy_raw_mem_.resize(maxBatchSize * (64 * 64 + 8 * 24)); } else if (conv_policy) { - op_policy_raw_mem_.reserve(maxBatchSize * 73 * 64); + op_policy_raw_mem_.resize(maxBatchSize * 73 * 64); } } ~InputsOutputs() {} std::vector input_masks_mem_; std::vector input_val_mem_; - std::vector input_val_mem_expanded_; std::vector op_policy_mem_; std::vector op_value_mem_; std::vector op_moves_left_mem_; diff --git a/src/neural/backends/metal/mps/MetalNetworkBuilder.h b/src/neural/backends/metal/mps/MetalNetworkBuilder.h index 74ddd6bcaa..869e014005 100644 --- a/src/neural/backends/metal/mps/MetalNetworkBuilder.h +++ b/src/neural/backends/metal/mps/MetalNetworkBuilder.h @@ -51,7 +51,7 @@ class MetalNetworkBuilder { Activations& activations, std::string& policy_head, std::string& value_head); - void forwardEval(float* inputs, int batchSize, + void forwardEval(float* values, uint64_t* masks, int batchSize, std::vector output_mems); private: diff --git a/src/neural/backends/metal/mps/MetalNetworkBuilder.mm b/src/neural/backends/metal/mps/MetalNetworkBuilder.mm index 2be155975a..7791d13d85 100644 --- a/src/neural/backends/metal/mps/MetalNetworkBuilder.mm +++ b/src/neural/backends/metal/mps/MetalNetworkBuilder.mm @@ -36,13 +36,12 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a MetalNetworkBuilder::MetalNetworkBuilder(void){} MetalNetworkBuilder::~MetalNetworkBuilder(void){} -//void MetalNetworkBuilder::init(void* weights, void* options) std::string MetalNetworkBuilder::init(int gpu_id) { // All metal devices. NSArray> * devices = MTLCopyAllDevices(); - if ([devices count] <= gpu_id) { + if ((NSUInteger)gpu_id >= [devices count]) { // No GPU device matching ID. [NSException raise:@"Could not find device" format:@"Could not find a GPU or CPU compute device with specified id"]; return ""; @@ -68,13 +67,17 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a NSString * policyHead = [NSString stringWithUTF8String:policy_head.c_str()]; NSString * valueHead = [NSString stringWithUTF8String:value_head.c_str()]; - // 0. Input placeholder. - // @todo - placeholder can be made directly as NHWC to avoid transposes. + // 0. Input value and mask placeholders. 
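The metal_common.h switch from reserve() to resize() fixes a genuine bug rather than a style issue: reserve() only allocates capacity and leaves size() at zero, so taking &v[0] and indexing into these buffers was undefined behaviour. In miniature:

    #include <vector>

    void ReserveVsResize(size_t n) {
      std::vector<float> buf;
      buf.reserve(n);  // capacity n, size still 0: buf[0] is undefined behaviour
      buf.resize(n);   // size n, elements value-initialized: buf[0] is valid
    }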
MPSGraphTensor * layer = [graph inputPlaceholderWithInputChannels:kInputPlanes - height:8 - width:8 label:@"inputs"]; + MPSGraphTensor * maskTensor = [graph maskPlaceholderWithInputChannels:kInputPlanes + label:@"inputs/mask"]; + + layer = [graph expandInputTensorWithMask:maskTensor + input:layer + label:@"inputs/expand"]; + const NSUInteger kernelSize = 3; const bool isPeDenseEmbedding = embedding == InputEmbedding::INPUT_EMBEDDING_PE_DENSE; @@ -302,11 +305,11 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a } } -void MetalNetworkBuilder::forwardEval(float * inputs, int batchSize, std::vector output_mems) +void MetalNetworkBuilder::forwardEval(float * inputs, uint64_t * masks, int batchSize, std::vector output_mems) { @autoreleasepool { Lc0NetworkGraph * graph = [Lc0NetworkGraph getGraphAt:[NSNumber numberWithInt:this->gpu_id]]; - [graph runInferenceWithBatchSize:batchSize inputs:inputs outputs:&output_mems[0]]; + [graph runInferenceWithBatchSize:batchSize inputs:inputs masks:masks outputs:&output_mems[0]]; } } diff --git a/src/neural/backends/metal/mps/NetworkGraph.h b/src/neural/backends/metal/mps/NetworkGraph.h index 2664b68c7d..dfc163cc48 100644 --- a/src/neural/backends/metal/mps/NetworkGraph.h +++ b/src/neural/backends/metal/mps/NetworkGraph.h @@ -50,12 +50,13 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat // Input tensor and tensor data placeholders. MPSGraphTensor * _inputTensor; + MPSGraphTensor * _maskTensor; // Variables to track results of graph inference. NSArray * _resultTensors; NSArray * _targetTensors; NSMutableDictionary * _resultDataDicts; - NSMutableDictionary * _readVariables; + NSMutableDictionary * _readVariables; // Variables for triple buffering dispatch_semaphore_t _doubleBufferingSemaphore; @@ -72,10 +73,20 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat -(nonnull instancetype) initWithDevice:(id __nonnull)device; -(nonnull MPSGraphTensor *) inputPlaceholderWithInputChannels:(NSUInteger)channels - height:(NSUInteger)height - width:(NSUInteger)width label:(NSString * __nullable)label; +-(nonnull MPSGraphTensor *) maskPlaceholderWithInputChannels:(NSUInteger)channels + label:(NSString * __nullable)label; + +-(nonnull MPSGraphTensor *) expandInputTensorWithMask:(MPSGraphTensor * __nonnull)maskTensor + input:(MPSGraphTensor * __nonnull)inputTensor + label:(NSString * __nonnull)label; + +- (nonnull MPSGraphTensor *) broadcastByStackingTensor:(MPSGraphTensor * __nonnull)input + axis:(NSInteger)axis + times:(NSUInteger)times + name:(NSString * __nonnull)name; + -(nonnull MPSGraphTensor *) addConvolutionBlockWithParent:(MPSGraphTensor * __nonnull)parent outputChannels:(NSUInteger)outputChannels kernelSize:(NSUInteger)kernelSize @@ -199,9 +210,11 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat -(nonnull NSArray *) runInferenceWithBatchSize:(NSUInteger)batchSize inputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks outputs:(float * __nonnull * __nonnull)outputBuffers; -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks subBatch:(NSUInteger)subBatch subBatchSize:(NSUInteger)subBatchSize; diff --git a/src/neural/backends/metal/mps/NetworkGraph.mm b/src/neural/backends/metal/mps/NetworkGraph.mm index 0befa256e6..322308e67b 100644 --- a/src/neural/backends/metal/mps/NetworkGraph.mm +++ b/src/neural/backends/metal/mps/NetworkGraph.mm @@ -25,9 +25,11 @@ Toolkit and the 
NVIDIA CUDA Deep Neural Network library (or a Program grant you additional permission to convey the resulting work. */ +#import #import "neural/network_legacy.h" +#import "neural/tables/attention_policy_map.h" +#import "neural/tables/policy_map.h" #import "NetworkGraph.h" -#import static MPSGraphConvolution2DOpDescriptor * __nonnull convolution2DDescriptor = [MPSGraphConvolution2DOpDescriptor descriptorWithStrideInX:1 strideInY:1 @@ -66,13 +68,12 @@ -(NSUInteger) size { -(NSUInteger) sizeOfDimensions:(NSArray *)dimensions { NSUInteger size = 1; for (NSNumber * dim in dimensions) { - if ([dim intValue] < [self.shape count]) - size *= [self.shape[[dim intValue]] intValue]; + if ((NSUInteger)[dim intValue] < [self.shape count]) + size *= [self.shape[(NSUInteger)[dim intValue]] intValue]; } return size; } - -(NSUInteger) sizeOfDimensionsFrom:(NSNumber *)dimension { NSUInteger size = 1; for (NSUInteger dim = [dimension intValue]; dim < [self.shape count]; dim++) { @@ -137,6 +138,7 @@ -(nonnull instancetype) initWithDevice:(id __nonnull)device -(nonnull NSArray *) runInferenceWithBatchSize:(NSUInteger)batchSize inputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks outputs:(float * __nonnull * __nonnull)outputBuffers { // Calculate number of sub-batches to split across GPU command buffers for parallel execution. @@ -144,18 +146,20 @@ -(nonnull instancetype) initWithDevice:(id __nonnull)device NSUInteger splits = (batchSize + kMinSubBatchSize + 1) / kMinSubBatchSize; if (splits > kMaxInflightBuffers) splits = kMaxInflightBuffers; NSUInteger subBatchSize = batchSize / splits; - NSUInteger inputDataLength = subBatchSize * [_inputTensor sizeOfDimensions:@[@1, @2, @3]]; + NSUInteger inputDataLength = subBatchSize * [_inputTensor sizeOfDimensionsFrom:@1]; // Split batchSize into smaller sub-batches and run using double-buffering. NSUInteger subBatch = 0; MPSCommandBuffer * commandBuffer; for (subBatch = 0; subBatch < splits - 1; subBatch++) { commandBuffer = [self runCommandSubBatchWithInputs:inputs + subBatch * inputDataLength + masks:masks + subBatch * inputDataLength subBatch:subBatch subBatchSize:subBatchSize]; } // Last sub-batch may be smaller or larger than others. MPSCommandBuffer * latestCommandBuffer = [self runCommandSubBatchWithInputs:inputs + subBatch * inputDataLength + masks:masks + subBatch * inputDataLength subBatch:subBatch subBatchSize:batchSize - subBatch * subBatchSize]; @@ -169,6 +173,7 @@ -(nonnull instancetype) initWithDevice:(id __nonnull)device } -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)inputs + masks:(uint64_t * __nonnull)masks subBatch:(NSUInteger)subBatch subBatchSize:(NSUInteger)subBatchSize { @@ -178,7 +183,7 @@ -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)in // Create command buffer for this sub-batch. 
MPSCommandBuffer * commandBuffer = [MPSCommandBuffer commandBufferFromCommandQueue:_queue]; - MPSShape * shape = @[@(subBatchSize), _inputTensor.shape[1], _inputTensor.shape[2], _inputTensor.shape[3]]; + MPSShape * shape = @[@(subBatchSize), _inputTensor.shape[1], _inputTensor.shape[2]]; NSData * inputData = [NSData dataWithBytesNoCopy:inputs length:subBatchSize * sizeof(float) @@ -189,17 +194,32 @@ -(nonnull MPSCommandBuffer *) runCommandSubBatchWithInputs:(float * __nonnull)in shape:shape dataType:_inputTensor.dataType]; + NSData * maskData = [NSData dataWithBytesNoCopy:masks + length:subBatchSize * sizeof(uint64_t) + freeWhenDone:NO]; + + MPSGraphTensorData * inputMaskData = [[MPSGraphTensorData alloc] initWithDevice:_device + data:maskData + shape:shape + dataType:MPSDataTypeUInt64]; + + NSDictionary * feeds = @{_inputTensor : inputTensorData, _maskTensor : inputMaskData}; + // Create execution descriptor with block to update results for each iteration. MPSGraphExecutionDescriptor * executionDescriptor = [[MPSGraphExecutionDescriptor alloc] init]; - executionDescriptor.completionHandler = ^(MPSGraphTensorDataDictionary * resultDictionary, NSError * error) { - _resultDataDicts[@(subBatch)] = resultDictionary; + executionDescriptor.completionHandler = ^(MPSGraphTensorDataDictionary * resultDictionary, NSError * _Nullable error) { + if (error) { + NSLog(@"Error occurred during execution: %@", error); + } else { + _resultDataDicts[@(subBatch)] = resultDictionary; + } // Release double buffering semaphore for the next training iteration to be encoded. dispatch_semaphore_signal(_doubleBufferingSemaphore); }; [self encodeToCommandBuffer:commandBuffer - feeds:@{_inputTensor : inputTensorData} + feeds:feeds targetTensors:_targetTensors targetOperations:nil executionDescriptor:executionDescriptor]; @@ -226,9 +246,6 @@ -(void) copyResultsToBuffers:(float * __nonnull * __nonnull)outputBuffers -(void) setResultTensors:(NSArray * __nonnull)results { - // Okay to remove nulls from the read variables. - [_readVariables removeObjectsForKeys:[_readVariables allKeysForObject:[NSNull null]]]; - // Set the results we're interested in. _resultTensors = results; @@ -238,16 +255,110 @@ -(void) setResultTensors:(NSArray * __nonnull)results } -(nonnull MPSGraphTensor *) inputPlaceholderWithInputChannels:(NSUInteger)channels - height:(NSUInteger)height - width:(NSUInteger)width label:(NSString * __nullable)label { - // Create a placeholder tensor that can hold the specified number of sub-batches. - _inputTensor = [self placeholderWithShape:@[@(-1), @(channels), @(height), @(width)] name:label]; - + _inputTensor = [self placeholderWithShape:@[@(-1), @(channels), @1] + dataType:MPSDataTypeFloat32 + name:label]; return _inputTensor; } +-(nonnull MPSGraphTensor *) maskPlaceholderWithInputChannels:(NSUInteger)channels + label:(NSString * __nullable)label +{ + _maskTensor = [self placeholderWithShape:@[@(-1), @(channels), @1] + dataType:MPSDataTypeUInt64 + name:label]; + return _maskTensor; +} + +-(nonnull MPSGraphTensor *) expandInputTensorWithMask:(MPSGraphTensor * __nonnull)maskTensor + input:(MPSGraphTensor * __nonnull)valueTensor + label:(NSString * __nonnull)label +{ + // 64 values to form the bitboard indices. 
+ uint64_t bitIndices[64]; + for (int i = 0; i < 64; i++) { + bitIndices[i] = 1ULL << i; + } + NSData * bitIndicesData = [NSData dataWithBytesNoCopy:bitIndices + length:64 * sizeof(uint64_t) + freeWhenDone:NO]; + + MPSGraphTensor * bitIndicesTensor = [self constantWithData:bitIndicesData + shape:@[@1, @1, @64] + dataType:MPSDataTypeUInt64]; + + // Broadcast mask and bit index tensors to [N,C,64] + maskTensor = [self broadcastByStackingTensor:maskTensor + axis:3 + times:64 + name:[NSString stringWithFormat:@"%@/mask/broadcast", label]]; + + MPSGraphTensor * expandedMaskTensor; + if (@available(macOS 13.0, *)) { + // Expand the bitmap using the masks and values. + expandedMaskTensor = [self bitwiseANDWithPrimaryTensor:maskTensor + secondaryTensor:bitIndicesTensor + name:[NSString stringWithFormat:@"%@/mask/bitwise_and", label]]; + + MPSGraphTensor * zeroTensor = [self constantWithScalar:0.0 + shape:@[@1] + dataType:MPSDataTypeUInt64]; + + expandedMaskTensor = [self notEqualWithPrimaryTensor:expandedMaskTensor + secondaryTensor:zeroTensor + name:[NSString stringWithFormat:@"%@/zero_equals", label]]; + } else { + // Alternative method: bitwise ops not available in earlier macos versions, so using integer division and modulo. + // Divide by the bit index, which is also a power of 2, to shift the desired bit to position 0. + expandedMaskTensor = [self divisionWithPrimaryTensor:maskTensor + secondaryTensor:bitIndicesTensor + name:[NSString stringWithFormat:@"%@/mask/divide", label]]; + + // Take modulo 2 to extract the least significant bit + MPSGraphTensor * twoTensor = [self constantWithScalar:2.0 + shape:@[@1] + dataType:MPSDataTypeUInt64]; + + expandedMaskTensor = [self moduloWithPrimaryTensor:expandedMaskTensor + secondaryTensor:twoTensor + name:[NSString stringWithFormat:@"%@/mask/modulo", label]]; + } + + // Broadcast input tensor values to match the expanded dimensions. 
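The pre-macOS 13 fallback above extracts bit k without bitwise operators by dividing by 2^k and taking the result modulo 2, which is exactly what the graph's division and modulo nodes compute per square. The scalar identity, for reference:

    #include <cstdint>

    // Equals (mask >> k) & 1, using only division and modulo.
    inline uint64_t BitViaDivMod(uint64_t mask, int k) {
      const uint64_t pow2 = uint64_t(1) << k;  // same constants as bitIndices
      return (mask / pow2) % 2;
    }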
+ valueTensor = [self broadcastByStackingTensor:valueTensor + axis:3 + times:64 + name:[NSString stringWithFormat:@"%@/input/broadcast", label]]; + + expandedMaskTensor = [self castTensor:expandedMaskTensor + toType:MPSDataTypeFloat32 + name:[NSString stringWithFormat:@"%@/input/cast", label]]; + + // Final multiplication: value * mask + expandedMaskTensor = [self multiplicationWithPrimaryTensor:expandedMaskTensor + secondaryTensor:valueTensor + name:[NSString stringWithFormat:@"%@/input/multiply", label]]; + + // Reshape to final output format [batch_size, kInputPlanes, 8, 8] + return [self reshapeTensor:expandedMaskTensor + withShape:@[@(-1), valueTensor.shape[1], @8, @8] + name:[NSString stringWithFormat:@"%@/input/reshape", label]]; +} + +- (nonnull MPSGraphTensor *) broadcastByStackingTensor:(MPSGraphTensor * __nonnull)input + axis:(NSInteger)axis + times:(NSUInteger)times + name:(NSString * __nonnull)name +{ + NSMutableArray * stackedTensors = [NSMutableArray array]; + for (NSUInteger i = 0; i < times; i++) { + [stackedTensors addObject:input]; + } + return [self stackTensors:stackedTensors axis:axis name:name]; +} + -(nonnull MPSGraphTensor *) addConvolutionBlockWithParent:(MPSGraphTensor * __nonnull)parent outputChannels:(NSUInteger)outputChannels kernelSize:(NSUInteger)kernelSize @@ -471,23 +582,37 @@ -(nonnull MPSGraphTensor *) addSEUnitWithParent:(MPSGraphTensor * __nonnull)pare } -(nonnull MPSGraphTensor *) addPolicyMapLayerWithParent:(MPSGraphTensor * __nonnull)parent - policyMap:(uint32_t * __nonnull)policyMap + policyMap:(const short * __nonnull)policyMap + mapSize:(NSUInteger)mapSize label:(NSString * __nonnull)label { - NSData * policyMapData = [NSData dataWithBytesNoCopy:policyMap - length:kNumPolicyOutputs * sizeof(uint32_t) - freeWhenDone:NO]; + if ([parent sizeOfDimensionsFrom:@1] < mapSize) { + [NSException raise:@"Invalid parent tensor shape" + format:@"Parent tensor non-batch dimensions (%zu) is less than mapping tensor size of (%zu) for policy mapping.", + [parent sizeOfDimensionsFrom:@1], mapSize]; + } - MPSGraphTensor * mappingTensor = [self constantWithData:policyMapData + // The mapping is an array of 64x?? squares, where each square contains a number from -1 to 1857. + // The mapping is flattened to a 1D array of size 1858, where each index corresponds to a square + // that had a value != -1. 
+ uint32_t mappingIndices[kNumPolicyOutputs]; + for (NSUInteger i = 0; i < mapSize; i++) { + if (policyMap[i] == -1) continue; + mappingIndices[policyMap[i]] = i; + } + + NSData * policyMapIndexData = [NSData dataWithBytesNoCopy:mappingIndices + length:kNumPolicyOutputs * sizeof(uint32_t) + freeWhenDone:NO]; + + MPSGraphTensor * indicesTensor = [self constantWithData:policyMapIndexData shape:@[@(kNumPolicyOutputs)] dataType:MPSDataTypeUInt32]; - MPSGraphTensor * flatConvTensor = [self flatten2DTensor:parent - axis:1 - name:[NSString stringWithFormat:@"%@/flatten", label]]; + parent = [self flatten2DTensor:parent axis:1 name:[NSString stringWithFormat:@"%@/flatten", label]]; - MPSGraphTensor * policyTensor = [self gatherWithUpdatesTensor:flatConvTensor - indicesTensor:mappingTensor + MPSGraphTensor * policyTensor = [self gatherWithUpdatesTensor:parent + indicesTensor:indicesTensor axis:1 batchDimensions:0 name:[NSString stringWithFormat:@"%@/gather", label]]; @@ -506,7 +631,6 @@ -(nonnull MPSGraphTensor *) addEncoderLayerWithParent:(MPSGraphTensor * __nonnul normtype:(NSString * __nonnull)normtype label:(NSString * __nonnull)label { - NSUInteger dModel = encoder.mha.q_b.size(); MPSGraphTensor * mhaQ = [self addFullyConnectedLayerWithParent:parent outputChannels:encoder.mha.q_b.size() weights:&encoder.mha.q_w[0] @@ -605,15 +729,16 @@ -(nonnull MPSGraphTensor *) addEncoderLayerWithParent:(MPSGraphTensor * __nonnul label:[NSString stringWithFormat:@"%@/ln2", label]]; } else if ([normtype isEqual:@"rmsnorm"] || [normtype isEqual:@"skipfirst"]) { - enc = [self addRmsNormalizationWithParent:enc - scaledSecondaryTensor:ffn - gammas:&encoder.ln2_gammas[0] - alpha:alpha - label:[NSString stringWithFormat:@"%@/ln1", label]]; + return [self addRmsNormalizationWithParent:enc + scaledSecondaryTensor:ffn + gammas:&encoder.ln2_gammas[0] + alpha:alpha + label:[NSString stringWithFormat:@"%@/ln1", label]]; } else { [NSException raise:@"Invalid normalization type." 
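addPolicyMapLayerWithParent: now builds its gather indices on the fly: the lc0 policy-map tables map each raw head output to a policy index (or -1 for unused slots), so inverting the table gives, for each of the kNumPolicyOutputs entries, which raw output to read. A CPU sketch of the same inversion and gather (names are illustrative):

    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> InvertPolicyMap(const short* map, size_t map_size,
                                          size_t num_policy_outputs) {
      std::vector<uint32_t> indices(num_policy_outputs, 0);
      for (size_t i = 0; i < map_size; ++i) {
        if (map[i] >= 0) indices[map[i]] = static_cast<uint32_t>(i);
      }
      return indices;
    }
    // The gather then reads policy[j] = raw_output[indices[j]] for every j.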
format:@"Invalid normalization type specified: %@", normtype]; + return nil; } } @@ -882,7 +1007,8 @@ -(nonnull MPSGraphTensor *) scaledQKMatmulWithQueries:(MPSGraphTensor * __nonnul qkMatmul = [self multiplicationWithPrimaryTensor:qkMatmul secondaryTensor:[self constantWithScalar:scale - shape:@[@1] dataType:qkMatmul.dataType] + shape:@[@1] + dataType:qkMatmul.dataType] name:[NSString stringWithFormat:@"%@/scale", label]]; return qkMatmul; } @@ -944,6 +1070,14 @@ -(nonnull MPSGraphTensor *) attentionPolicyPromoMatmulConcatWithParent:(MPSGraph parent = [self reshapeTensor:parent withShape:@[@(-1), @64, @64] name:[NSString stringWithFormat:@"%@/parent_reshape", label]]; + MPSGraphTensor * slice = [self sliceTensor:parent dimension:1 start:48 length:8 name:[NSString stringWithFormat:@"%@/slice_policy_1", label]]; + slice = [self sliceTensor:slice dimension:2 start:56 length:8 name:[NSString stringWithFormat:@"%@/slice_policy_2", label]]; + slice = [self reshapeTensor:slice withShape:@[@(-1), @64] name:[NSString stringWithFormat:@"%@/slice_reshape", label]]; + slice = [self broadcastByStackingTensor:slice axis:2 times:3 name:[NSString stringWithFormat:@"%@/slice_broadcast", label]]; + slice = [self transposeTensor:slice dimension:1 withDimension:2 name:[NSString stringWithFormat:@"%@/slice_transpose", label]]; + + promo = [self additionWithPrimaryTensor:promo secondaryTensor:slice name:[NSString stringWithFormat:@"%@/offset_add", label]]; + return [self concatTensor:parent withTensor:promo dimension:1 name:[NSString stringWithFormat:@"%@/concat", label]]; } @@ -1263,7 +1397,8 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull scale:1.0f / sqrt(policyDModel) label:[NSString stringWithFormat:@"%@/self_attention/kq", label]]; - // 6. Slice last 8 keys (k[:, 56:, :]) and matmul with policy promotion weights, then concat to matmul_qk. + // 6. Slice last 8 keys (k[:, 48:56, 56:64]) and matmul with policy promotion weights, + // add to promotion logits then concat to matmul_qk. policy = [self attentionPolicyPromoMatmulConcatWithParent:policy withKeys:keys weights:&head.ip4_pol_w[0] @@ -1272,6 +1407,12 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull sliceFrom:56 channelSize:policyDModel label:[NSString stringWithFormat:@"%@/promo_logits", label]]; + + policy = [self addPolicyMapLayerWithParent:policy + policyMap:&lczero::kAttnPolicyMap[0] + mapSize:(64 * 64 + 8 * 24) + label:[NSString stringWithFormat:@"%@/policy_mapping", label]]; + } else if (convolutionPolicy) { if (attentionBody) { @@ -1296,30 +1437,10 @@ -(nonnull MPSGraphTensor *) makePolicyHeadWithTensor:(MPSGraphTensor * __nonnull label:[NSString stringWithFormat:@"%@/conv2", label]]; - /** - * @todo policy map implementation has bug in MPSGraph (GatherND not working in graph). - * Implementation of policy map to be done in CPU for now. - * - * Reinstate this section when bug is fixed. See comments below. 
- * - // [1858 -> HWC or CHW] - const bool HWC = false; - std::vector policy_map(1858); - for (const auto& mapping : kConvPolicyMap) { - if (mapping == -1) continue; - const auto index = &mapping - kConvPolicyMap; - const auto displacement = index / 64; - const auto square = index % 64; - const auto row = square / 8; - const auto col = square % 8; - if (HWC) { - policy_map[mapping] = ((row * 8) + col) * 80 + displacement; - } else { - policy_map[mapping] = ((displacement * 8) + row) * 8 + col; - } - } - policy = builder_->makePolicyMapLayer(policy, &policy_map[0], "policy_map"); - */ + policy = [self addPolicyMapLayerWithParent:policy + policyMap:&lczero::kConvPolicyMap[0] + mapSize:(73 * 64) + label:[NSString stringWithFormat:@"%@/policy_mapping", label]]; } else { if (attentionBody) { @@ -1391,10 +1512,10 @@ -(nonnull MPSGraphTensor *) makeValueHeadWithTensor:(MPSGraphTensor * __nonnull) value = [self addFullyConnectedLayerWithParent:value outputChannels:head.ip2_val_b.size() - weights:&head.ip2_val_w[0] + weights:&head.ip2_val_w[0] biases:&head.ip2_val_b[0] activation:wdl ? @"softmax" : @"tanh" - label:[NSString stringWithFormat:@"%@/fc2", label]]; + label:[NSString stringWithFormat:@"%@/fc2", label]]; return value; } diff --git a/src/neural/backends/metal/network_metal.cc b/src/neural/backends/metal/network_metal.cc index 0a45eb74da..46f29459b5 100644 --- a/src/neural/backends/metal/network_metal.cc +++ b/src/neural/backends/metal/network_metal.cc @@ -160,99 +160,30 @@ MetalNetwork::MetalNetwork(const WeightsFile& file, const OptionsDict& options) "' does not exist in this net."); } - auto embedding = static_cast(file.format().network_format().input_embedding()); - builder_->build(kInputPlanes, weights, embedding, attn_body, attn_policy_, conv_policy_, - wdl_, moves_left_, activations, policy_head, value_head); + auto embedding = static_cast( + file.format().network_format().input_embedding()); + builder_->build(kInputPlanes, weights, embedding, attn_body, attn_policy_, + conv_policy_, wdl_, moves_left_, activations, policy_head, + value_head); } void MetalNetwork::forwardEval(InputsOutputs* io, int batchSize) { - // Expand encoded input into N x 112 x 8 x 8. - float* dptr = &io->input_val_mem_expanded_[0]; - for (size_t i = 0; i < batchSize; i++) { - for (size_t j = 0; j < kInputPlanes; j++) { - const float value = io->input_val_mem_[j + i * kInputPlanes]; - const uint64_t mask = io->input_masks_mem_[j + i * kInputPlanes]; - for (auto k = 0; k < 64; k++) { - *(dptr++) = (mask & (((uint64_t)1) << k)) != 0 ? value : 0; - } - } - } - // Metal is not thread-safe, so lock is needed. lock_.lock(); - if (attn_policy_ || conv_policy_) { - /** - * @todo policy map implementation has bug in MPSGraph (GatherND not working - * in graph). Implementation of policy map to be done in CPU for now. - * - * Remove this if-branch when bug is fixed. See comments above. - */ - - if (moves_left_) { - builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_raw_mem_[0], &io->op_value_mem_[0], - &io->op_moves_left_mem_[0]}); - } else { - builder_->forwardEval( - &io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_raw_mem_[0], &io->op_value_mem_[0]}); - } - // The next thread can start using the GPU now. - lock_.unlock(); - - if (attn_policy_) { - // Promotion offset calculation. 
- for (size_t batch = 0; batch < batchSize; batch++) { - for (int k = 0; k < 8; k++) { // y in cuda - for (int j = 0; j < 8; j++) { // w in cuda - for (int i = 0; i < 3; i++) { // c in cuda - // Promotion offsets already precalculated and stored in GPU. - // Just the main policy offsets need to be added here. - io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + 64 * 64 + - 24 * k + 3 * j + i] += - io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + - (48 + k) * 64 + 56 + j]; - } - } - } - } - // Mapping from attention policy to lc0 policy - for (size_t batch = 0; batch < batchSize; batch++) { - for (size_t i = 0; i < 64 * 64 + 8 * 24; i++) { - size_t j = kAttnPolicyMap[i]; - if (j >= 0) { - io->op_policy_mem_[batch * 1858 + j] = - io->op_policy_raw_mem_[batch * (64 * 64 + 8 * 24) + i]; - } - } - } - } else if (conv_policy_) { - // Mapping from convolutional policy to lc0 policy - for (size_t batch = 0; batch < batchSize; batch++) { - for (size_t i = 0; i < 73 * 64; i++) { - short j = kConvPolicyMap[i]; - if (j >= 0) { - io->op_policy_mem_[batch * 1858 + j] = - io->op_policy_raw_mem_[batch * 80 * 64 + i]; - } - } - } - } - + if (moves_left_) { + builder_->forwardEval(&io->input_val_mem_[0], &io->input_masks_mem_[0], + batchSize, + {&io->op_policy_mem_[0], &io->op_value_mem_[0], + &io->op_moves_left_mem_[0]}); } else { - if (moves_left_) { - builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_mem_[0], &io->op_value_mem_[0], - &io->op_moves_left_mem_[0]}); - } else { - builder_->forwardEval(&io->input_val_mem_expanded_[0], batchSize, - {&io->op_policy_mem_[0], &io->op_value_mem_[0]}); - } - - // The next thread can start using the GPU now. - lock_.unlock(); + builder_->forwardEval(&io->input_val_mem_[0], &io->input_masks_mem_[0], + batchSize, + {&io->op_policy_mem_[0], &io->op_value_mem_[0]}); } + + // The next thread can start using the GPU now. + lock_.unlock(); } std::unique_ptr MakeMetalNetwork(const std::optional& w, diff --git a/src/neural/backends/network_demux.cc b/src/neural/backends/network_demux.cc index a1a28f779f..accf9bd12f 100644 --- a/src/neural/backends/network_demux.cc +++ b/src/neural/backends/network_demux.cc @@ -25,125 +25,221 @@ Program grant you additional permission to convey the resulting work. */ +#include +#include #include +#include +#include #include #include #include "neural/factory.h" -#include "utils/exception.h" namespace lczero { namespace { +class DemuxingComputation; + +struct DemuxingWork { + DemuxingComputation* source_ = nullptr; + std::unique_ptr computation_; + int start_ = 0; + int end_ = 0; + + DemuxingWork(int sample) : end_(sample) {} + DemuxingWork(DemuxingComputation* source, int start, int end) + : source_(source), start_(start), end_(end) { + assert(start_ != end_); + } + + auto operator<=>(const DemuxingWork& b) const { return end_ <=> b.end_; } +}; + class DemuxingNetwork; -class DemuxingComputation : public NetworkComputation { +class DemuxingBackend; +class DemuxingComputation final : public NetworkComputation { + std::tuple&, int> GetParent( + int sample) const { + auto iter = std::lower_bound(parents_.begin(), parents_.end(), sample + 1); + assert(iter != parents_.end()); + assert(sample >= iter->start_); + assert(sample < iter->end_); + return {iter->computation_, sample - iter->start_}; + } + public: DemuxingComputation(DemuxingNetwork* network) : network_(network) {} + ~DemuxingComputation() { + // Wait for other threads to stop using this object. 
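GetParent() above relies on the work items being kept in ascending order of their end offset (DemuxingWork's operator<=> compares end_), so a lower_bound on sample + 1 lands on the item whose [start_, end_) range contains the sample. The same lookup with an explicit comparator (Range and FindRange are illustrative stand-ins; the sample is assumed to fall inside one of the ranges, as the asserts above require):

    #include <algorithm>
    #include <utility>
    #include <vector>

    struct Range { int start; int end; };

    std::pair<int, int> FindRange(const std::vector<Range>& ranges, int sample) {
      auto it = std::lower_bound(
          ranges.begin(), ranges.end(), sample + 1,
          [](const Range& r, int value) { return r.end < value; });
      // Returns the owning range's index and the offset within it.
      return {static_cast<int>(it - ranges.begin()), sample - it->start};
    }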
It must be spinloop for + // correct synchronization between notify_one and destructor. + while (dataready_.load(std::memory_order_acquire) != -1) { + SpinloopPause(); + } + } - void AddInput(InputPlanes&& input) override { planes_.emplace_back(input); } + void AddInput(InputPlanes&& input) override { + planes_.emplace_back(std::move(input)); + } void ComputeBlocking() override; int GetBatchSize() const override { return planes_.size(); } float GetQVal(int sample) const override { - const int idx = sample / partial_size_; - const int offset = sample % partial_size_; - return parents_[idx]->GetQVal(offset); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetQVal(offset); } float GetDVal(int sample) const override { - int idx = sample / partial_size_; - int offset = sample % partial_size_; - return parents_[idx]->GetDVal(offset); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetDVal(offset); } float GetMVal(int sample) const override { - int idx = sample / partial_size_; - int offset = sample % partial_size_; - return parents_[idx]->GetMVal(offset); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetMVal(offset); } float GetPVal(int sample, int move_id) const override { - const int idx = sample / partial_size_; - const int offset = sample % partial_size_; - return parents_[idx]->GetPVal(offset, move_id); + auto [parent, offset] = GetParent(sample); + if (!parent) return 0; + return parent->GetPVal(offset, move_id); } void NotifyComplete() { - std::unique_lock lock(mutex_); - dataready_--; - if (dataready_ == 0) { + if (1 == dataready_.fetch_sub(1, std::memory_order_release)) { + { + std::lock_guard lock(mutex_); + } dataready_cv_.notify_one(); + dataready_.store(-1, std::memory_order_release); } } - NetworkComputation* AddParentFromNetwork(Network* network) { - std::unique_lock lock(mutex_); - parents_.emplace_back(network->NewComputation()); - const int cur_idx = (parents_.size() - 1) * partial_size_; - for (int i = cur_idx; i < std::min(GetBatchSize(), cur_idx + partial_size_); - i++) { - parents_.back()->AddInput(std::move(planes_[i])); - } - return parents_.back().get(); - } - private: std::vector planes_; DemuxingNetwork* network_; - std::vector> parents_; + std::vector parents_; + + std::mutex mutex_; + std::condition_variable dataready_cv_; + std::atomic dataready_ = -1; + + friend class DemuxingBackend; +}; + +class DemuxingBackend { + public: + ~DemuxingBackend() { + while (!threads_.empty()) { + threads_.back().join(); + threads_.pop_back(); + } + while (!queue_.empty()) { + queue_.front()->source_->NotifyComplete(); + queue_.pop(); + } + } + + void Assign(std::unique_ptr&& network, const OptionsDict& opts, + std::atomic& abort) { + network_ = std::move(network); + int nn_threads = opts.GetOrDefault("threads", 0); + if (nn_threads == 0) { + nn_threads = network_->GetThreads(); + } + for (int i = 0; i < nn_threads; i++) { + threads_.emplace_back([&] { Worker(abort); }); + } + } + + void Enqueue(DemuxingWork* work) { + { + std::unique_lock lock(mutex_); + queue_.push(work); + } + dataready_cv_.notify_one(); + } + + void Abort() { + { + std::unique_lock lock(mutex_); + } + dataready_cv_.notify_all(); + } + void Worker(std::atomic& abort) { + while (!abort.load(std::memory_order_relaxed)) { + DemuxingWork* work = nullptr; + { + std::unique_lock lock(mutex_); + dataready_cv_.wait(lock, [&] { + return abort.load(std::memory_order_relaxed) || !queue_.empty(); + }); + if 
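NotifyComplete() is careful about the classic notify-versus-destruction race: the last worker briefly takes the waiter's mutex before notifying and only publishes -1 afterwards, and -1 is the value the destructor spin-waits on, so the object cannot be torn down while notify_one is still executing. Condensed into a standalone helper (Completion is an illustrative name):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    struct Completion {
      std::mutex m;
      std::condition_variable cv;
      std::atomic<int> pending{-1};  // -1 means "idle / safe to destroy"

      void WorkerDone() {
        if (pending.fetch_sub(1, std::memory_order_release) == 1) {
          { std::lock_guard<std::mutex> lock(m); }  // order with the waiter
          cv.notify_one();
          pending.store(-1, std::memory_order_release);
        }
      }

      // Caller stores the worker count into `pending` before handing out work.
      void Wait() {
        std::unique_lock<std::mutex> lock(m);
        cv.wait(lock,
                [&] { return pending.load(std::memory_order_acquire) <= 0; });
      }
    };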
(abort.load(std::memory_order_relaxed)) return; + if (!queue_.empty()) { + work = queue_.front(); + queue_.pop(); + } + } + if (work) { + work->computation_ = network_->NewComputation(); + auto& planes = work->source_->planes_; + for (int i = work->start_; i < work->end_; i++) { + work->computation_->AddInput(std::move(planes[i])); + } + work->computation_->ComputeBlocking(); + work->source_->NotifyComplete(); + } + } + } + + private: std::mutex mutex_; std::condition_variable dataready_cv_; - int dataready_ = 0; - int partial_size_ = 0; + std::vector threads_; + std::unique_ptr network_; + std::queue queue_; }; -class DemuxingNetwork : public Network { +class DemuxingNetwork final : public Network { public: DemuxingNetwork(const std::optional& weights, - const OptionsDict& options) { - minimum_split_size_ = options.GetOrDefault("minimum-split-size", 0); + const OptionsDict& options) + : backends_(std::max(size_t(1), options.ListSubdicts().size())) { const auto parents = options.ListSubdicts(); if (parents.empty()) { // If options are empty, or multiplexer configured in root object, // initialize on root object and default backend. auto backends = NetworkFactory::Get()->GetBackendsList(); - AddBackend(backends[0], weights, options); + AddBackend(0, backends[0], weights, options); } + int i = 0; for (const auto& name : parents) { - AddBackend(name, weights, options.GetSubdict(name)); + AddBackend(i++, name, weights, options.GetSubdict(name)); } } - void AddBackend(const std::string& name, + void AddBackend(int index, const std::string& name, const std::optional& weights, const OptionsDict& opts) { const std::string backend = opts.GetOrDefault("backend", name); - networks_.emplace_back( - NetworkFactory::Get()->Create(backend, weights, opts)); + auto network = NetworkFactory::Get()->Create(backend, weights, opts); - int nn_threads = opts.GetOrDefault("threads", 0); - if (nn_threads == 0) { - nn_threads = networks_.back()->GetThreads(); - } - - min_batch_size_ = - std::min(min_batch_size_, networks_.back()->GetMiniBatchSize()); - is_cpu_ &= networks_.back()->IsCpu(); - - if (networks_.size() == 1) { - capabilities_ = networks_.back()->GetCapabilities(); + min_batch_size_ = std::min(min_batch_size_, network->GetMiniBatchSize()); + batch_step_ = std::max(batch_step_, network->GetPreferredBatchStep()); + is_cpu_ &= network->IsCpu(); + if (index == 0) { + capabilities_ = network->GetCapabilities(); } else { - capabilities_.Merge(networks_.back()->GetCapabilities()); - } - - for (int i = 0; i < nn_threads; ++i) { - threads_.emplace_back([this]() { Worker(); }); + capabilities_.Merge(network->GetCapabilities()); } + backends_[index].Assign(std::move(network), opts, abort_); } std::unique_ptr NewComputation() override { @@ -155,102 +251,86 @@ class DemuxingNetwork : public Network { } int GetMiniBatchSize() const override { - return min_batch_size_ * threads_.size(); + return min_batch_size_ * backends_.size(); } - bool IsCpu() const override { return is_cpu_; } - - void Enqueue(DemuxingComputation* computation) { - std::lock_guard lock(mutex_); - queue_.push(computation); - cv_.notify_one(); - } - - ~DemuxingNetwork() { - Abort(); - Wait(); - // Unstuck waiting computations. - while (!queue_.empty()) { - queue_.front()->NotifyComplete(); - queue_.pop(); - } - } + int GetPreferredBatchStep() const override { return batch_step_; } - void Worker() { - // While Abort() is not called (and it can only be called from destructor). 
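Each DemuxingBackend worker is a standard queue-plus-condition-variable consumer: it sleeps until work arrives or the abort flag is set, and it always rechecks both under the lock. The dequeue step in isolation (NextWork is an illustrative name):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>
    #include <queue>

    template <typename Work>
    Work* NextWork(std::mutex& m, std::condition_variable& cv,
                   std::queue<Work*>& q, std::atomic<bool>& abort) {
      std::unique_lock<std::mutex> lock(m);
      cv.wait(lock, [&] {
        return abort.load(std::memory_order_relaxed) || !q.empty();
      });
      if (abort.load(std::memory_order_relaxed)) return nullptr;
      Work* w = q.front();
      q.pop();
      return w;
    }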
- while (!abort_) { - { - { - std::unique_lock lock(mutex_); - // Wait until there's come work to compute. - cv_.wait(lock, [&] { return abort_ || !queue_.empty(); }); - if (abort_) break; - } + bool IsCpu() const override { return is_cpu_; } - // While there is a work in queue, process it. - while (true) { - DemuxingComputation* to_notify; - { - std::unique_lock lock(mutex_); - if (queue_.empty()) break; - to_notify = queue_.front(); - queue_.pop(); - } - long long net_idx = ++(counter_) % networks_.size(); - NetworkComputation* to_compute = - to_notify->AddParentFromNetwork(networks_[net_idx].get()); - to_compute->ComputeBlocking(); - to_notify->NotifyComplete(); - } - } - } - } + ~DemuxingNetwork() { Abort(); } void Abort() { - { - std::lock_guard lock(mutex_); - abort_ = true; + abort_.store(true, std::memory_order_relaxed); + for (auto& b : backends_) { + b.Abort(); } - cv_.notify_all(); } - void Wait() { - while (!threads_.empty()) { - threads_.back().join(); - threads_.pop_back(); - } - } - - std::vector> networks_; + std::vector backends_; NetworkCapabilities capabilities_; int min_batch_size_ = std::numeric_limits::max(); + int batch_step_ = 1; bool is_cpu_ = true; - std::queue queue_; - int minimum_split_size_ = 0; - std::atomic counter_; - bool abort_ = false; - - std::mutex mutex_; - std::condition_variable cv_; - - std::vector threads_; + std::atomic start_index_; + std::atomic abort_ = false; }; void DemuxingComputation::ComputeBlocking() { if (GetBatchSize() == 0) return; - partial_size_ = (GetBatchSize() + network_->threads_.size() - 1) / - network_->threads_.size(); - if (partial_size_ < network_->minimum_split_size_) { - partial_size_ = std::min(GetBatchSize(), network_->minimum_split_size_); + // Calculate batch_step_ size split count. + int splits = 1 + (GetBatchSize() - 1) / network_->batch_step_; + // Calculate the minimum number of splits per backend. + int split_size_per_backend = splits / network_->backends_.size(); + // Calculate how many backends get extra work. + int extra_split_backends = + splits - split_size_per_backend * network_->backends_.size(); + + // Find the first backend which got less work from the previous batch. + int start_index = + network_->start_index_.fetch_add(std::max(1, extra_split_backends), + std::memory_order_relaxed) % + network_->backends_.size(); + + int end_index = + (start_index + extra_split_backends) % network_->backends_.size(); + int work_start = 0; + int work_items = split_size_per_backend > 0 ? network_->backends_.size() + : extra_split_backends; + // First store the work item count and reserve memory from them. + dataready_.store(work_items, std::memory_order_relaxed); + parents_.reserve(work_items); + int i = start_index; + // First send work to backends which get extra work. + int split_size = split_size_per_backend + 1; + for (; i != end_index; i = (i + 1) % network_->backends_.size()) { + assert(work_start != GetBatchSize()); + int work_end = work_start + split_size * network_->batch_step_; + work_end = std::min(work_end, GetBatchSize()); + parents_.emplace_back(this, work_start, work_end); + network_->backends_[i].Enqueue(&parents_.back()); + work_start = work_end; } - const int splits = (GetBatchSize() + partial_size_ - 1) / partial_size_; - - std::unique_lock lock(mutex_); - dataready_ = splits; - for (int j = 0; j < splits; j++) { - network_->Enqueue(this); + // Queue remaining work items which don't get extra work. 
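+  // Worked example with illustrative numbers (not taken from any real
+  // configuration): GetBatchSize() == 60, batch_step_ == 8 and three backends
+  // give splits == 8, split_size_per_backend == 2, extra_split_backends == 2.
+  // The loop above then hands three chunks (24 positions) to each of two
+  // backends, and the do-while below hands the leftover work to the third
+  // backend, clamped by std::min to the 12 remaining positions. start_index_
+  // advances by two, so the extra work rotates to other backends next batch.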
+ split_size--; + if (split_size > 0) { + do { + assert(work_start != GetBatchSize()); + int work_end = work_start + split_size * network_->batch_step_; + work_end = std::min(work_end, GetBatchSize()); + parents_.emplace_back(this, work_start, work_end); + network_->backends_[i].Enqueue(&parents_.back()); + work_start = work_end; + i = (i + 1) % network_->backends_.size(); + } while (i != start_index); } - dataready_cv_.wait(lock, [this]() { return dataready_ == 0; }); + assert(work_start == GetBatchSize()); + assert(work_items == (int)parents_.size()); + // Wait until all backends complete their work. + std::unique_lock lock(mutex_); + dataready_cv_.wait(lock, [this]() { + return dataready_.load(std::memory_order_acquire) <= 0; + }); } std::unique_ptr MakeDemuxingNetwork( diff --git a/src/neural/backends/network_onnx.cc b/src/neural/backends/network_onnx.cc deleted file mode 100644 index c44331af72..0000000000 --- a/src/neural/backends/network_onnx.cc +++ /dev/null @@ -1,514 +0,0 @@ -/* - This file is part of Leela Chess Zero. - Copyright (C) 2021-2023 The LCZero Authors - - Leela Chess is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Leela Chess is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Leela Chess. If not, see . - - Additional permission under GNU GPL version 3 section 7 - - If you modify this Program, or any covered work, by linking or - combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA - Toolkit and the NVIDIA CUDA Deep Neural Network library (or a - modified version of those libraries), containing parts covered by the - terms of the respective license agreement, the licensors of this - Program grant you additional permission to convey the resulting work. 
-*/ - -#include -#include -#include -#include -#include -#include -#include - -#if __has_include("dml_provider_factory.h") -#include "dml_provider_factory.h" -#define USE_DML -#endif - -#include "cpu_provider_factory.h" -#include "neural/factory.h" -#include "neural/loader.h" -#include "neural/network.h" -#include "neural/onnx/converter.h" -#include "onnxruntime_cxx_api.h" -#include "utils/bf16_utils.h" -#include "utils/bititer.h" -#include "utils/commandline.h" -#include "utils/exception.h" -#include "utils/fp16_utils.h" -#include "utils/logging.h" - -namespace lczero { -namespace { - -enum class OnnxProvider { CPU, CUDA, DML, ROCM, TRT }; - -class OnnxNetwork; - -template -class OnnxComputation : public NetworkComputation { - public: - OnnxComputation(OnnxNetwork* network); - void AddInput(InputPlanes&& input) override; - int GetBatchSize() const override { return raw_input_.size(); } - void ComputeBlocking() override; - float GetQVal(int sample) const override; - float GetDVal(int sample) const override; - float GetPVal(int sample, int move_id) const override; - float GetMVal(int sample) const override; - - private: - Ort::Value PrepareInputs(int start, int batch_size); - - OnnxNetwork* network_; - std::vector raw_input_; - std::vector input_tensor_data_; - std::vector output_tensors_; - std::vector> output_tensors_data_; - std::vector output_tensors_step_; -}; - -class OnnxNetwork : public Network { - public: - OnnxNetwork(const WeightsFile& file, const OptionsDict& options, - OnnxProvider provider); - std::unique_ptr NewComputation() override { - if (fp16_) { - return std::make_unique>(this); - } else if (bf16_) { - return std::make_unique>(this); - } else { - return std::make_unique>(this); - } - } - const NetworkCapabilities& GetCapabilities() const override { - return capabilities_; - } - int GetMiniBatchSize() const override { - return batch_size_ == -1 ? Network::GetMiniBatchSize() - : batch_size_ * steps_; - } - bool IsCpu() const override { return provider_ == OnnxProvider::CPU; } - - Ort::SessionOptions GetOptions(int gpu, int threads, int batch_size); - - Ort::Env onnx_env_; - // Prepare sessions for this many multiples of the batch size; - int steps_; - std::vector session_; - std::vector inputs_; - // Points to strings in inputs_. - std::vector inputs_cstr_; - std::vector outputs_; - // Points to strings in outputs_. - std::vector outputs_cstr_; - // Indices in output_cstr_ vector. - int policy_head_ = -1; - int wdl_head_ = -1; - int value_head_ = -1; - int mlh_head_ = -1; - NetworkCapabilities capabilities_; - bool fp16_; - bool bf16_; - // The batch size to use, or -1 for variable. - int batch_size_; - // The lower limit for variable batch size. - int min_batch_size_; - static constexpr int max_batch_size_ = 1024; - // For conditional locking if running the DML/ROCM/TRT provider. 
- OnnxProvider provider_; - std::mutex lock_; -}; - -template -OnnxComputation::OnnxComputation(OnnxNetwork* network) - : network_(network) { - output_tensors_data_.resize(network_->outputs_.size()); - output_tensors_step_.resize(network_->outputs_.size()); - output_tensors_step_[network_->policy_head_] = 1858; - output_tensors_data_[network_->policy_head_] = - std::vector(1858 * network_->max_batch_size_); - if (network_->wdl_head_ != -1) { - output_tensors_step_[network_->wdl_head_] = 3; - output_tensors_data_[network_->wdl_head_] = - std::vector(3 * network_->max_batch_size_); - } - if (network_->value_head_ != -1) { - output_tensors_step_[network_->value_head_] = 1; - output_tensors_data_[network_->value_head_] = - std::vector(network_->max_batch_size_); - } - if (network_->mlh_head_ != -1) { - output_tensors_step_[network_->mlh_head_] = 1; - output_tensors_data_[network_->mlh_head_] = - std::vector(network_->max_batch_size_); - } -} - -template -void OnnxComputation::AddInput(InputPlanes&& input) { - raw_input_.emplace_back(input); - if (raw_input_.size() > network_->max_batch_size_) { - throw Exception("NN input exceeds max batch size of " + - std::to_string(network_->max_batch_size_) + "."); - } -} - -float AsFloat(float x) { return x; } -float AsFloat(Ort::Float16_t x) { - uint16_t tmp; - std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); - return FP16toFP32(tmp); -} -float AsFloat(Ort::BFloat16_t x) { - uint16_t tmp; - std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); - return BF16toFP32(tmp); -} - -template -float OnnxComputation::GetQVal(int sample) const { - if (network_->wdl_head_ != -1) { - const auto& data = output_tensors_data_[network_->wdl_head_]; - return AsFloat(data[sample * 3 + 0]) - AsFloat(data[sample * 3 + 2]); - } else { - const auto& data = output_tensors_data_[network_->value_head_]; - return AsFloat(data[sample]); - } -} - -template -float OnnxComputation::GetDVal(int sample) const { - if (network_->wdl_head_ == -1) return 0.0f; - const auto& data = output_tensors_data_[network_->wdl_head_]; - return AsFloat(data[sample * 3 + 1]); -} - -template -float OnnxComputation::GetPVal(int sample, int move_id) const { - const auto& data = output_tensors_data_[network_->policy_head_]; - return AsFloat(data[sample * 1858 + move_id]); -} - -template -float OnnxComputation::GetMVal(int sample) const { - if (network_->mlh_head_ == -1) return 0.0f; - const auto& data = output_tensors_data_[network_->mlh_head_]; - return AsFloat(data[sample]); -} - -void AsDataType(float x, float* y) { *y = x; } -void AsDataType(float x, Ort::Float16_t* y) { - uint16_t tmp = FP32toFP16(x); - std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); -} -void AsDataType(float x, Ort::BFloat16_t* y) { - uint16_t tmp = FP32toBF16(x); - std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); -} - -template -Ort::Value OnnxComputation::PrepareInputs(int start, int batch_size) { - input_tensor_data_.clear(); - input_tensor_data_.resize(batch_size * kInputPlanes * 8 * 8); - auto iter = input_tensor_data_.data(); - int end = std::min(start + batch_size, static_cast(raw_input_.size())); - for (int i = start; i < end; i++) { - for (const auto& plane : raw_input_[i]) { - DataType value; - AsDataType(plane.value, &value); - for (auto bit : IterateBits(plane.mask)) { - *(iter + bit) = value; - } - iter += 64; - } - } - for (int i = end; i < start + batch_size; i++) { - for (int j = 0; j < kInputPlanes * 64; j++) { - *iter++ = DataType(); - } - } - - auto memory_info = - 
Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); - - output_tensors_.clear(); - for (size_t i = 0; i < output_tensors_step_.size(); i++) { - int size = output_tensors_step_[i]; - int64_t dims[] = {batch_size, size}; - output_tensors_.emplace_back(Ort::Value::CreateTensor( - memory_info, output_tensors_data_[i].data() + start * size, - size * batch_size, dims, 2)); - } - - int64_t dims[] = {batch_size, kInputPlanes, 8, 8}; - return Ort::Value::CreateTensor(memory_info, - input_tensor_data_.data(), - input_tensor_data_.size(), dims, 4); -} - -template -void OnnxComputation::ComputeBlocking() { - int batch_size = network_->batch_size_; - if (batch_size < 0) { - batch_size = std::max(static_cast(raw_input_.size()), - network_->min_batch_size_); - } - for (size_t i = 0; i < raw_input_.size();) { - int step = (raw_input_.size() - i + batch_size - 1) / batch_size; - if (step > network_->steps_) step = network_->steps_; - int batch = batch_size * step; - - auto input_tensor = PrepareInputs(i, batch); - // The DML onnxruntime execution provider is documented as not supporting - // multi-threaded calls to Run on the same inference session. We found the - // same to be true for the ROCm execution provider (at least for CNNs). - // TODO: This may be a onnxruntime/ROCm bug, check onnxruntime 1.16 release. - if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM || - network_->provider_ == OnnxProvider::TRT) { - network_->lock_.lock(); - } - network_->session_[step - 1].Run( - {}, network_->inputs_cstr_.data(), &input_tensor, 1, - network_->outputs_cstr_.data(), output_tensors_.data(), - output_tensors_.size()); - if (network_->provider_ == OnnxProvider::DML || - network_->provider_ == OnnxProvider::ROCM || - network_->provider_ == OnnxProvider::TRT) { - network_->lock_.unlock(); - } - i += batch; - } -} - -Ort::SessionOptions OnnxNetwork::GetOptions(int gpu, int threads, - int batch_size) { - Ort::SessionOptions options; - options.SetIntraOpNumThreads(threads); - options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - - if (batch_size > 0) { - // Override the default (variable) batch size. - Ort::ThrowOnError( - OrtGetApiBase() - ->GetApi(ORT_API_VERSION) - ->AddFreeDimensionOverrideByName(options, "batch", batch_size)); - } - - switch (provider_) { - case OnnxProvider::DML: - options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - options.DisableMemPattern(); -#ifdef USE_DML - Ort::ThrowOnError( - OrtSessionOptionsAppendExecutionProvider_DML(options, gpu)); -#else - throw Exception("ONNX backend internal error."); -#endif - break; - case OnnxProvider::TRT: { - options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - - std::string cache_dir = CommandLine::BinaryDirectory() + "/trt_cache"; - std::map trt_options; - trt_options["device_id"] = std::to_string(gpu); - trt_options["trt_fp16_enable"] = fp16_ ? "1" : "0"; - trt_options["trt_int8_enable"] = "0"; - trt_options["trt_max_partition_iterations"] = "1000"; - trt_options["trt_min_subgraph_size"] = "1"; - trt_options["trt_engine_cache_enable"] = "1"; - trt_options["trt_engine_cache_prefix"] = - "Lc0_ONNX_TRT_batch_" + std::to_string(batch_size) + "_"; - trt_options["trt_engine_cache_path"] = cache_dir; - trt_options["trt_timing_cache_enable"] = "1"; - trt_options["trt_timing_cache_path"] = cache_dir; - trt_options["trt_layer_norm_fp32_fallback"] = "1"; - trt_options["trt_force_sequential_engine_build"] = "1"; - // Looks like we need I/O binding to enable this. 
- // trt_options["trt_cuda_graph_enable"] = "1"; - if (batch_size < 0) { - trt_options["trt_profile_min_shapes"] = - inputs_[0] + ":" + std::to_string(min_batch_size_) + "x112x8x8"; - trt_options["trt_profile_max_shapes"] = - inputs_[0] + ":" + std::to_string(max_batch_size_) + "x112x8x8"; - trt_options["trt_profile_opt_shapes"] = - inputs_[0] + ":" + std::to_string(max_batch_size_ / 4) + "x112x8x8"; - } else { - trt_options["trt_profile_min_shapes"] = - inputs_[0] + ":" + std::to_string(batch_size_) + "x112x8x8"; - trt_options["trt_profile_max_shapes"] = - inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + - "x112x8x8"; - trt_options["trt_profile_opt_shapes"] = - inputs_[0] + ":" + std::to_string(batch_size_ * steps_) + - "x112x8x8"; - } - std::vector keys; - std::vector values; - for (const auto& [key, value] : trt_options) { - keys.push_back(key.c_str()); - values.push_back(value.c_str()); - } - - const auto& api = Ort::GetApi(); - OrtTensorRTProviderOptionsV2* trt_options_v2; - Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options_v2)); - Ort::ThrowOnError(api.UpdateTensorRTProviderOptions( - trt_options_v2, keys.data(), values.data(), keys.size())); - options.AppendExecutionProvider_TensorRT_V2(*trt_options_v2); - api.ReleaseTensorRTProviderOptions(trt_options_v2); - break; - } - case OnnxProvider::ROCM: { - OrtROCMProviderOptions rocm_options; - rocm_options.device_id = gpu; - options.AppendExecutionProvider_ROCM(rocm_options); - break; - } - case OnnxProvider::CUDA: { - OrtCUDAProviderOptions cuda_options; - cuda_options.device_id = gpu; - options.AppendExecutionProvider_CUDA(cuda_options); - break; - } - case OnnxProvider::CPU: - auto status = OrtSessionOptionsAppendExecutionProvider_CPU(options, 0); - if (status) { - std::string error_message = Ort::GetApi().GetErrorMessage(status); - OrtErrorCode error_code = Ort::GetApi().GetErrorCode(status); - Ort::GetApi().ReleaseStatus(status); - throw Exception("ONNX CPU error " + std::to_string(error_code) + ": " + - error_message); - } - break; - } - return options; -} - -OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict& opts, - OnnxProvider provider) - : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"), - capabilities_{file.format().network_format().input(), - file.format().network_format().output(), - file.format().network_format().moves_left()}, - fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16), - bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16), - provider_(provider) { - onnx_env_.DisableTelemetryEvents(); - batch_size_ = - opts.GetOrDefault("batch", provider == OnnxProvider::DML ? 16 : -1); - steps_ = - opts.GetOrDefault("steps", provider == OnnxProvider::DML ? 4 : 1); - min_batch_size_ = opts.GetOrDefault( - "min_batch", provider == OnnxProvider::TRT ? 4 : 1); - int gpu = opts.GetOrDefault("gpu", 0); - int threads = - opts.GetOrDefault("threads", provider == OnnxProvider::CPU ? 1 : 0); - - // Sanity checks. - if (batch_size_ <= 0) { - batch_size_ = -1; // Variable batch size. 
- steps_ = 1; - } - if (batch_size_ * steps_ > max_batch_size_) { - batch_size_ = max_batch_size_ / steps_; - } - - const auto& md = file.onnx_model(); - if (!md.has_input_planes()) { - throw Exception("NN doesn't have input planes defined."); - } - inputs_.emplace_back(md.input_planes()); - if (!md.has_output_policy()) { - throw Exception("NN doesn't have policy head defined."); - } - policy_head_ = outputs_.size(); - outputs_.emplace_back(md.output_policy()); - if (md.has_output_wdl()) { - wdl_head_ = outputs_.size(); - outputs_.emplace_back(md.output_wdl()); - } else if (md.has_output_value()) { - value_head_ = outputs_.size(); - outputs_.emplace_back(md.output_value()); - } else { - throw Exception("NN doesn't have value head."); - } - if (md.has_output_mlh()) { - mlh_head_ = outputs_.size(); - outputs_.emplace_back(md.output_mlh()); - } - std::transform(inputs_.begin(), inputs_.end(), - std::back_inserter(inputs_cstr_), - [](const auto& x) { return x.c_str(); }); - std::transform(outputs_.begin(), outputs_.end(), - std::back_inserter(outputs_cstr_), - [](const auto& x) { return x.c_str(); }); - - for (int step = 1; step <= steps_; step++) - session_.emplace_back(onnx_env_, file.onnx_model().model().data(), - file.onnx_model().model().size(), - GetOptions(gpu, threads, batch_size_ * step)); -} - -template -std::unique_ptr MakeOnnxNetwork(const std::optional& w, - const OptionsDict& opts) { - if (!w) throw Exception("The ONNX backend requires a network file."); - - if (w->has_onnx_model()) { - return std::make_unique(*w, opts, kProvider); - } else { - WeightsToOnnxConverterOptions converter_options; - converter_options.opset = opts.GetOrDefault("opset", 17); - converter_options.alt_mish = opts.GetOrDefault( - "alt_mish", kProvider == OnnxProvider::CPU ? true : false); - converter_options.alt_layernorm = opts.GetOrDefault( - "alt_layernorm", kProvider == OnnxProvider::DML ? true : false); - converter_options.no_shape = opts.GetOrDefault("no_shape", false); - converter_options.policy_head = - opts.GetOrDefault("policy_head", "vanilla"); - converter_options.value_head = - opts.GetOrDefault("value_head", "winner"); - - std::string datatype; - if (opts.Exists("datatype")) { - datatype = opts.Get("datatype"); - } else { - bool fp16 = opts.GetOrDefault( - "fp16", kProvider == OnnxProvider::CPU ? false : true); - datatype = fp16 ? "f16" : "f32"; - } - converter_options.data_type = - WeightsToOnnxConverterOptions::StringToDataType(datatype); - - auto converted = ConvertWeightsToOnnx(*w, converter_options); - return std::make_unique(converted, opts, kProvider); - } -} - -#ifdef USE_ROCM -REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork, 64) -#endif -#ifdef USE_DML -REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork, 63) -#endif -REGISTER_NETWORK("onnx-trt", MakeOnnxNetwork, 60) -REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork, 61) -REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork, 62) - -} // namespace -} // namespace lczero diff --git a/src/neural/backends/onnx/network_onnx.cc b/src/neural/backends/onnx/network_onnx.cc new file mode 100644 index 0000000000..f5ac887cda --- /dev/null +++ b/src/neural/backends/onnx/network_onnx.cc @@ -0,0 +1,966 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2021-2023 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. 
+ + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "onnx_conf.h" + +#ifdef USE_ONNX_CUDART +#include "cuda_runtime.h" +#include "neural/backends/onnx/onnx_kernels.h" +#endif + +#include "neural/factory.h" +#include "neural/loader.h" +#include "neural/network.h" +#include "neural/onnx/converter.h" +#include "onnxruntime_cxx_api.h" +#include "utils/bf16_utils.h" +#include "utils/bititer.h" +#include "utils/commandline.h" +#include "utils/exception.h" +#include "utils/fp16_utils.h" +#include "utils/logging.h" +#include "utils/trace.h" + +namespace lczero { +namespace onnx { + +enum class OnnxProvider { CPU, CUDA, DML, ROCM, TRT, MIGRAPHX }; + +class OnnxNetwork; + +static constexpr int kNumOutputPolicy = 1858; + +struct InputsOutputs { + InputsOutputs(OnnxNetwork* network); + ~InputsOutputs() { + switch (provider_) { + case OnnxProvider::CUDA: + case OnnxProvider::TRT: +#ifdef USE_ONNX_CUDART + ReportCUDAErrors(cudaEventDestroy(inputs_uploaded_event_)); + ReportCUDAErrors(cudaEventDestroy(inputs_processed_event_)); + ReportCUDAErrors(cudaEventDestroy(evaluation_done_event_)); + ReportCUDAErrors(cudaEventDestroy(outputs_download_event_)); + ReportCUDAErrors(cudaFree(input_tensor_upload_device_)); + ReportCUDAErrors(cudaFree(input_tensor_data_device_)); + for (void* ptr : output_tensors_data_device_) { + ReportCUDAErrors(cudaFree(ptr)); + } + ReportCUDAErrors(cudaFreeHost(input_tensor_data_)); + for (void* ptr : output_tensors_data_) { + ReportCUDAErrors(cudaFreeHost(ptr)); + } + break; +#endif + default: + free(input_tensor_data_); + for (void* ptr : output_tensors_data_) { + free(ptr); + } + } + } + OnnxProvider provider_; + void* input_tensor_data_; + void* input_tensor_upload_device_; + void* input_tensor_data_device_; + std::vector output_tensors_data_; + std::vector output_tensors_data_device_; + std::vector output_tensors_step_; + // To be removed when converting to new backend interface. 
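+  // One (W, D, L) float triple per sample; filled at the end of
+  // ComputeBlocking(), with the softmax applied CPU-side when cpu_wdl_ is set,
+  // and read back by GetQVal()/GetDVal() whenever the net has a WDL head.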
+ std::vector wdl_output_data_; + Ort::MemoryInfo memory_info_{nullptr}; +#ifdef USE_ONNX_CUDART + cudaEvent_t inputs_uploaded_event_ = nullptr; + cudaEvent_t inputs_processed_event_ = nullptr; + cudaEvent_t evaluation_done_event_ = nullptr; + cudaEvent_t outputs_download_event_ = nullptr; +#endif +}; + +template +class OnnxComputation final : public NetworkComputation { + public: + OnnxComputation(OnnxNetwork* network); + ~OnnxComputation(); + void AddInput(InputPlanes&& input) override; + int GetBatchSize() const override; + void ComputeBlocking() override; + float GetQVal(int sample) const override; + float GetDVal(int sample) const override; + float GetPVal(int sample, int move_id) const override; + float GetMVal(int sample) const override; + + private: + Ort::IoBinding PrepareInputs(int start, int batch_size, int step); + + OnnxNetwork* network_; + size_t input_size_ = 0; + std::vector raw_input_; + std::unique_ptr inputs_outputs_; +}; + +class OnnxNetwork final : public Network { + public: + OnnxNetwork(const WeightsFile& file, const OptionsDict& options, + OnnxProvider provider, bool cpu_wdl); + ~OnnxNetwork(); + std::unique_ptr NewComputation() override { +#ifdef USE_ONNX_CUDART + if (provider_ == OnnxProvider::CUDA || provider_ == OnnxProvider::TRT) { + int device = -1; + ReportCUDAErrors(cudaGetDevice(&device)); + if (device != gpu_) { + ReportCUDAErrors(cudaSetDevice(gpu_)); + } + } +#endif + if (fp16_) { + return std::make_unique>(this); + } else if (bf16_) { + return std::make_unique>(this); + } else { + return std::make_unique>(this); + } + } + const NetworkCapabilities& GetCapabilities() const override { + return capabilities_; + } + int GetMiniBatchSize() const override { + return batch_size_ == -1 ? Network::GetMiniBatchSize() + : batch_size_ * steps_; + } + int GetPreferredBatchStep() const override { + return batch_size_ == -1 ? min_batch_size_ : batch_size_; + } + bool IsCpu() const override { return provider_ == OnnxProvider::CPU; } + + Ort::SessionOptions GetOptions(int threads, int batch_size, uint64_t hash, int optimize); + + std::unique_ptr GetInputsOutputs() { + std::lock_guard lock(inputs_outputs_lock_); + if (free_inputs_outputs_.empty()) { + return std::make_unique(this); + } else { + std::unique_ptr resource = + std::move(free_inputs_outputs_.front()); + free_inputs_outputs_.pop_front(); + return resource; + } + } + + void ReleaseInputsOutputs(std::unique_ptr resource) { + std::lock_guard lock(inputs_outputs_lock_); + free_inputs_outputs_.push_back(std::move(resource)); + } + + Ort::Env onnx_env_; + // Prepare sessions for this many multiples of the batch size; + int steps_; + std::vector session_; + std::vector inputs_; + std::vector outputs_; + // Indices in output_ vector. + int policy_head_ = -1; + int wdl_head_ = -1; + int value_head_ = -1; + int mlh_head_ = -1; + NetworkCapabilities capabilities_; + bool fp16_; + bool bf16_; + bool cpu_wdl_; + // The batch size to use, or -1 for variable. + int batch_size_; + // The lower limit for variable batch size. + int min_batch_size_; + int gpu_; + static constexpr int max_batch_size_ = 1024; + // For conditional locking if running the DML/ROCM/TRT provider. + OnnxProvider provider_; + std::mutex lock_; + // For shared device addresses. 
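+  // Rough flow, as wired up in ComputeBlocking(): upload_stream_ copies masks
+  // and values to the device, compute_stream_ waits on inputs_uploaded_event_,
+  // expands the planes and runs the session, and download_stream_ waits on
+  // evaluation_done_event_ before copying outputs back to pinned host memory;
+  // ComputeBlocking() finally synchronizes on outputs_download_event_.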
+#ifdef USE_ONNX_CUDART + cudaStream_t compute_stream_ = nullptr; + cudaStream_t upload_stream_ = nullptr; + cudaStream_t download_stream_ = nullptr; +#endif + + private: + std::mutex inputs_outputs_lock_; + std::list> free_inputs_outputs_; +}; + +InputsOutputs::InputsOutputs(OnnxNetwork* network) + : provider_(network->provider_) { + int max_batch_size = network->max_batch_size_; + int value_head = network->value_head_; + int wdl_head = network->wdl_head_; + int policy_head = network->policy_head_; + int mlh_head = network->mlh_head_; + int data_size = (network->fp16_ | network->bf16_) ? 2 : 4; + int outputs_size = + std::max({value_head, wdl_head, policy_head, mlh_head}) + 1; + output_tensors_data_.resize(outputs_size); + output_tensors_data_device_.resize(outputs_size); + output_tensors_step_.resize(outputs_size); + if (wdl_head != -1) { + wdl_output_data_.resize(3 * max_batch_size); + } + output_tensors_step_[policy_head] = kNumOutputPolicy; + if (wdl_head != -1) { + output_tensors_step_[wdl_head] = 3; + } + if (value_head != -1) { + output_tensors_step_[value_head] = 1; + } + if (mlh_head != -1) { + output_tensors_step_[mlh_head] = 1; + } + + switch (provider_) { + case OnnxProvider::CUDA: + case OnnxProvider::TRT: +#ifdef USE_ONNX_CUDART + ReportCUDAErrors( + cudaEventCreate(&inputs_processed_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreate(&inputs_uploaded_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreate(&evaluation_done_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaEventCreate(&outputs_download_event_, cudaEventDisableTiming)); + ReportCUDAErrors( + cudaHostAlloc(&input_tensor_data_, + max_batch_size * kInputPlanes * sizeof(InputPlane), 0)); + for (int i = 0; i < outputs_size; i++) { + ReportCUDAErrors(cudaHostAlloc( + &output_tensors_data_[i], + max_batch_size * output_tensors_step_[i] * data_size, 0)); + } + + output_tensors_data_device_.resize(outputs_size); + ReportCUDAErrors( + cudaMalloc(&input_tensor_upload_device_, + max_batch_size * kInputPlanes * sizeof(InputPlane))); + ReportCUDAErrors( + cudaMalloc(&input_tensor_data_device_, + max_batch_size * kInputPlanes * 8 * 8 * data_size)); + for (int i = 0; i < outputs_size; i++) { + ReportCUDAErrors( + cudaMalloc(&output_tensors_data_device_[i], + max_batch_size * output_tensors_step_[i] * data_size)); + } + memory_info_ = Ort::MemoryInfo{"Cuda", OrtDeviceAllocator, network->gpu_, + OrtMemTypeDefault}; + break; +#endif + default: + input_tensor_data_ = + malloc(max_batch_size * kInputPlanes * 8 * 8 * data_size); + for (int i = 0; i < outputs_size; i++) { + output_tensors_data_[i] = + malloc(max_batch_size * output_tensors_step_[i] * data_size); + } + input_tensor_data_device_ = input_tensor_data_; + for (int i = 0; i < outputs_size; i++) { + output_tensors_data_device_[i] = output_tensors_data_[i]; + } + memory_info_ = + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault); + } +} + +OnnxNetwork::~OnnxNetwork() { +#ifdef USE_ONNX_CUDART + if (provider_ == OnnxProvider::TRT || provider_ == OnnxProvider::CUDA) { + ReportCUDAErrors(cudaStreamDestroy(compute_stream_)); + ReportCUDAErrors(cudaStreamDestroy(upload_stream_)); + ReportCUDAErrors(cudaStreamDestroy(download_stream_)); + } +#endif +} + +template +OnnxComputation::OnnxComputation(OnnxNetwork* network) + : network_(network) { + inputs_outputs_ = network_->GetInputsOutputs(); +} + +template +OnnxComputation::~OnnxComputation() { + network_->ReleaseInputsOutputs(std::move(inputs_outputs_)); +} + +void 
AsDataType(float x, float* y) { *y = x; } +void AsDataType(float x, Ort::Float16_t* y) { + uint16_t tmp = FP32toFP16(x); + std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); +} +void AsDataType(float x, Ort::BFloat16_t* y) { + uint16_t tmp = FP32toBF16(x); + std::memcpy(reinterpret_cast(y), &tmp, sizeof(uint16_t)); +} + +template +void OnnxComputation::AddInput(InputPlanes&& input) { + if (input_size_ >= network_->max_batch_size_) { + throw Exception("NN input exceeds max batch size of " + + std::to_string(network_->max_batch_size_) + "."); + } +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::CUDA || + network_->provider_ == OnnxProvider::TRT) { + assert(input.size() == kInputPlanes); + uint64_t* masks = + static_cast(inputs_outputs_->input_tensor_data_) + + input_size_ * kInputPlanes; + uint64_t* mask_end = + static_cast(inputs_outputs_->input_tensor_data_) + + network_->max_batch_size_ * kInputPlanes; + DataType* values = + reinterpret_cast(mask_end) + input_size_ * kInputPlanes; + for (size_t i = 0; i < kInputPlanes; i++) { + masks[i] = input[i].mask; + DataType value; + AsDataType(input[i].value, &value); + values[i] = value; + } + input_size_++; + if (input_size_ > network_->max_batch_size_) { + throw Exception("NN input exceeds max batch size of " + + std::to_string(network_->max_batch_size_) + "."); + } + return; + } +#endif + raw_input_.emplace_back(std::move(input)); + input_size_++; +} +template +int OnnxComputation::GetBatchSize() const { + return input_size_; +} + +float AsFloat(float x) { return x; } +float AsFloat(Ort::Float16_t x) { + uint16_t tmp; + std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); + return FP16toFP32(tmp); +} +float AsFloat(Ort::BFloat16_t x) { + uint16_t tmp; + std::memcpy(&tmp, reinterpret_cast(&x), sizeof(uint16_t)); + return BF16toFP32(tmp); +} + +template +float OnnxComputation::GetQVal(int sample) const { + if (network_->wdl_head_ != -1) { + return inputs_outputs_->wdl_output_data_[sample * 3 + 0] - + inputs_outputs_->wdl_output_data_[sample * 3 + 2]; + } else { + DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->value_head_]); + return AsFloat(data[sample]); + } +} + +template +float OnnxComputation::GetDVal(int sample) const { + if (network_->wdl_head_ == -1) return 0.0f; + return inputs_outputs_->wdl_output_data_[sample * 3 + 1]; +} + +template +float OnnxComputation::GetPVal(int sample, int move_id) const { + DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->policy_head_]); + return AsFloat(data[sample * kNumOutputPolicy + move_id]); +} + +template +float OnnxComputation::GetMVal(int sample) const { + if (network_->mlh_head_ == -1) return 0.0f; + DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->mlh_head_]); + return AsFloat(data[sample]); +} + +template +Ort::IoBinding OnnxComputation::PrepareInputs(int start, + int batch_size, + int step) { +#ifdef USE_ONNX_CUDART + if (network_->provider_ != OnnxProvider::CUDA && + network_->provider_ != OnnxProvider::TRT) +#endif + { + DataType* iter = + static_cast(inputs_outputs_->input_tensor_data_); + iter += start * kInputPlanes * 8 * 8; + std::memset(static_cast(iter), 0, + batch_size * kInputPlanes * 8 * 8 * sizeof(DataType)); + int end = std::min(start + batch_size, static_cast(input_size_)); + for (int i = start; i < end; i++) { + for (const auto& plane : raw_input_[i]) { + DataType value; + AsDataType(plane.value, &value); + for (auto bit : IterateBits(plane.mask)) { + *(iter + 
bit) = value; + } + iter += 64; + } + } + } + + Ort::IoBinding binding{network_->session_[step - 1]}; + for (size_t i = 0; i < inputs_outputs_->output_tensors_step_.size(); i++) { + int size = inputs_outputs_->output_tensors_step_[i]; + int64_t dims[] = {batch_size, size}; + binding.BindOutput( + network_->outputs_[i].c_str(), + Ort::Value::CreateTensor( + inputs_outputs_->memory_info_, + static_cast( + inputs_outputs_->output_tensors_data_device_[i]) + + start * size, + size * batch_size, dims, 2)); + } + + int64_t dims[] = {batch_size, kInputPlanes, 8, 8}; + binding.BindInput( + network_->inputs_[0].c_str(), + Ort::Value::CreateTensor( + inputs_outputs_->memory_info_, + static_cast(inputs_outputs_->input_tensor_data_device_) + + start * kInputPlanes * 8 * 8, + batch_size * kInputPlanes * 8 * 8, dims, 4)); + return binding; +} + +template +void OnnxComputation::ComputeBlocking() { + LCTRACE_FUNCTION_SCOPE; + int batch_size = network_->batch_size_; + if (batch_size < 0) { + batch_size = + std::max(static_cast(input_size_), network_->min_batch_size_); + } + // Only the DML onnxruntime execution provider is documented as needing + // locking, but it seems all GPU backends need it. + if (network_->provider_ != OnnxProvider::CPU) { + network_->lock_.lock(); + } + for (size_t i = 0; i < (size_t)input_size_;) { + int step = (input_size_ - i + batch_size - 1) / batch_size; + if (step > network_->steps_) step = network_->steps_; + int batch = batch_size * step; + if (network_->provider_ == OnnxProvider::TRT && network_->batch_size_ > 0) { + batch = std::min((int)input_size_ - (int)i, batch); + } + + auto binding = PrepareInputs(i, batch, step); + + Ort::RunOptions options = {}; +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::TRT || + network_->provider_ == OnnxProvider::CUDA) { + if (i == 0) { + ReportCUDAErrors( + cudaStreamWaitEvent(network_->upload_stream_, + inputs_outputs_->inputs_processed_event_)); + } + const char* src_masks = + static_cast(inputs_outputs_->input_tensor_data_); + char* dst_masks = + static_cast(inputs_outputs_->input_tensor_upload_device_); + src_masks += i * kInputPlanes * sizeof(uint64_t); + dst_masks += i * kInputPlanes * (sizeof(uint64_t) + sizeof(DataType)); + ReportCUDAErrors(cudaMemcpyAsync( + dst_masks, src_masks, batch * kInputPlanes * sizeof(uint64_t), + cudaMemcpyHostToDevice, network_->upload_stream_)); + char* src_values = + static_cast(inputs_outputs_->input_tensor_data_); + src_values += network_->max_batch_size_ * kInputPlanes * sizeof(uint64_t); + src_values += i * kInputPlanes * sizeof(DataType); + char* dst_values = dst_masks + batch * kInputPlanes * sizeof(uint64_t); + ReportCUDAErrors(cudaMemcpyAsync( + dst_values, src_values, batch * kInputPlanes * sizeof(DataType), + cudaMemcpyHostToDevice, network_->upload_stream_)); + ReportCUDAErrors(cudaEventRecord(inputs_outputs_->inputs_uploaded_event_, + network_->upload_stream_)); + ReportCUDAErrors(cudaStreamWaitEvent( + network_->compute_stream_, inputs_outputs_->inputs_uploaded_event_)); + if (network_->fp16_) { + half* dst = + reinterpret_cast(inputs_outputs_->input_tensor_data_device_); + dst += i * kInputPlanes * 8 * 8; + expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes, + network_->compute_stream_); + } else if (network_->bf16_) { + __nv_bfloat16* dst = reinterpret_cast<__nv_bfloat16*>( + inputs_outputs_->input_tensor_data_device_); + dst += i * kInputPlanes * 8 * 8; + expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes, + network_->compute_stream_); + } else { + float* dst = 
reinterpret_cast( + inputs_outputs_->input_tensor_data_device_); + dst += i * kInputPlanes * 8 * 8; + expandPlanesOnnx(dst, dst_masks, batch * kInputPlanes, + network_->compute_stream_); + } + + ReportCUDAErrors(cudaEventRecord(inputs_outputs_->inputs_processed_event_, + network_->upload_stream_)); + if (i == 0) { + ReportCUDAErrors( + cudaStreamWaitEvent(network_->compute_stream_, + inputs_outputs_->outputs_download_event_)); + } + options.AddConfigEntry("disable_synchronize_execution_providers", "1"); + } else +#endif + { + binding.SynchronizeInputs(); + } + network_->session_[step - 1].Run(options, binding); +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::TRT || + network_->provider_ == OnnxProvider::CUDA) { + for (size_t j = 0; j < inputs_outputs_->output_tensors_step_.size(); + j++) { + ReportCUDAErrors( + cudaEventRecord(inputs_outputs_->evaluation_done_event_, + network_->compute_stream_)); + ReportCUDAErrors( + cudaStreamWaitEvent(network_->download_stream_, + inputs_outputs_->evaluation_done_event_)); + size_t offset = i * inputs_outputs_->output_tensors_step_[j]; + ReportCUDAErrors(cudaMemcpyAsync( + static_cast(inputs_outputs_->output_tensors_data_[j]) + + offset, + static_cast( + inputs_outputs_->output_tensors_data_device_[j]) + + offset, + batch * inputs_outputs_->output_tensors_step_[j] * sizeof(DataType), + cudaMemcpyDeviceToHost, network_->download_stream_)); + ReportCUDAErrors( + cudaEventRecord(inputs_outputs_->outputs_download_event_, + network_->download_stream_)); + } + } else +#endif + { + binding.SynchronizeOutputs(); + } + i += batch; + } + if (network_->provider_ != OnnxProvider::CPU) { + network_->lock_.unlock(); + } +#ifdef USE_ONNX_CUDART + if (network_->provider_ == OnnxProvider::TRT || + network_->provider_ == OnnxProvider::CUDA) { + ReportCUDAErrors( + cudaEventSynchronize(inputs_outputs_->outputs_download_event_)); + } +#endif + if (network_->wdl_head_ != -1) { + const DataType* data = static_cast( + inputs_outputs_->output_tensors_data_[network_->wdl_head_]); + for (size_t i = 0; i < input_size_; i++) { + float w = AsFloat(data[i * 3 + 0]); + float d = AsFloat(data[i * 3 + 1]); + float l = AsFloat(data[i * 3 + 2]); + if (network_->cpu_wdl_) { + // Value softmax done cpu side. + float m = std::max({w, d, l}); + w = std::exp(w - m); + d = std::exp(d - m); + l = std::exp(l - m); + float sum = w + d + l; + w /= sum; + l /= sum; + d /= sum; + } + inputs_outputs_->wdl_output_data_[3 * i + 0] = w; + inputs_outputs_->wdl_output_data_[3 * i + 1] = d; + inputs_outputs_->wdl_output_data_[3 * i + 2] = l; + } + } +} + +Ort::SessionOptions OnnxNetwork::GetOptions(int threads, int batch_size, + uint64_t hash, int optimize) { + Ort::SessionOptions options; + options.SetIntraOpNumThreads(threads); + GraphOptimizationLevel level = GraphOptimizationLevel::ORT_DISABLE_ALL; + switch (optimize) { + case 0: + level = GraphOptimizationLevel::ORT_DISABLE_ALL; + break; + case 1: + level = GraphOptimizationLevel::ORT_ENABLE_BASIC; + break; + case 2: + level = GraphOptimizationLevel::ORT_ENABLE_EXTENDED; + break; + default: + level = GraphOptimizationLevel::ORT_ENABLE_ALL; + break; + } + options.SetGraphOptimizationLevel(level); + + if (batch_size > 0 && provider_ != OnnxProvider::TRT) { + // Override the default (variable) batch size. 
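+    // Illustrative numbers only: with batch=256 and steps=4 the constructor
+    // builds four sessions, with this override pinning the "batch" dimension
+    // to 256, 512, 768 and 1024 respectively, and ComputeBlocking() picks the
+    // smallest step that covers the pending input.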
+ Ort::ThrowOnError( + OrtGetApiBase() + ->GetApi(ORT_API_VERSION) + ->AddFreeDimensionOverrideByName(options, "batch", batch_size)); + } + + switch (provider_) { + case OnnxProvider::DML: { + std::unordered_map dml_options; + dml_options["device_id"] = std::to_string(gpu_); + dml_options["performance_preference"] = "high_performance"; + options.AppendExecutionProvider("DML", dml_options); + break; + } + case OnnxProvider::TRT: { + options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + + std::string cache_dir = CommandLine::BinaryDirectory() + "/trt_cache"; + std::map trt_options; + trt_options["device_id"] = std::to_string(gpu_); + trt_options["trt_builder_optimization_level"] = std::to_string(std::clamp(optimize, 0, 5)); + trt_options["trt_fp16_enable"] = optimize >= 6 ? "1" : "0"; +#if ORT_API_VERSION >= 23 + trt_options["trt_bf16_enable"] = optimize >= 7 ? "1" : "0"; +#endif + trt_options["trt_int8_enable"] = optimize >= 8 ? "1" : "0"; + trt_options["trt_max_partition_iterations"] = "1000"; + trt_options["trt_min_subgraph_size"] = "1"; + trt_options["trt_engine_cache_enable"] = "1"; + // We need the batch size as well as the hash, as it is set after loading. + std::ostringstream oss; + oss << std::hex << hash; + trt_options["trt_engine_cache_prefix"] = + "Lc0_ONNX_TRT_ORT_" + Ort::GetVersionString() + "_batch_" + + (batch_size < 0 ? std::to_string(batch_size) + : std::to_string(batch_size - batch_size_ + 1) + "-" + + std::to_string(batch_size)) + + "_" + std::to_string(optimize) + "_" + oss.str() + "_"; + trt_options["trt_engine_cache_path"] = cache_dir; + trt_options["trt_timing_cache_enable"] = "1"; + trt_options["trt_timing_cache_path"] = cache_dir; + trt_options["trt_layer_norm_fp32_fallback"] = "1"; + trt_options["trt_force_sequential_engine_build"] = "1"; + trt_options["trt_context_memory_sharing_enable"] = "1"; + // Looks like we need I/O binding to enable this. 
+#ifdef USE_ONNX_CUDART + trt_options["has_user_compute_stream"] = "1"; +#endif + if (batch_size < 0) { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(min_batch_size_) + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_) + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(max_batch_size_ / 4) + "x112x8x8"; + } else { + trt_options["trt_profile_min_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size - batch_size_ + 1) + + "x112x8x8"; + trt_options["trt_profile_max_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size) + "x112x8x8"; + trt_options["trt_profile_opt_shapes"] = + inputs_[0] + ":" + std::to_string(batch_size) + "x112x8x8"; + } + std::vector keys; + std::vector values; + for (const auto& [key, value] : trt_options) { + keys.push_back(key.c_str()); + values.push_back(value.c_str()); + } + + const auto& api = Ort::GetApi(); + OrtTensorRTProviderOptionsV2* trt_options_v2; + Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options_v2)); + Ort::ThrowOnError(api.UpdateTensorRTProviderOptions( + trt_options_v2, keys.data(), values.data(), keys.size())); +#ifdef USE_ONNX_CUDART + Ort::ThrowOnError(api.UpdateTensorRTProviderOptionsWithValue( + trt_options_v2, "user_compute_stream", compute_stream_)); +#endif + options.AppendExecutionProvider_TensorRT_V2(*trt_options_v2); + api.ReleaseTensorRTProviderOptions(trt_options_v2); + break; + } + case OnnxProvider::ROCM: { + OrtROCMProviderOptions rocm_options; + rocm_options.device_id = gpu_; + options.AppendExecutionProvider_ROCM(rocm_options); + break; + } + case OnnxProvider::MIGRAPHX: { + std::unordered_map migraphx_options; + migraphx_options["device_id"] = std::to_string(gpu_); + migraphx_options["migraphx_exhaustive_tune"] = optimize >= 5 ? "1" : "0"; + migraphx_options["migraphx_fp16_enable"] = optimize >= 6 ? "1" : "0"; + migraphx_options["migraphx_bf16_enable"] = optimize >= 7 ? "1" : "0"; + migraphx_options["migraphx_fp8_enable"] = optimize >= 8 ? "1" : "0"; + std::filesystem::path cache_dir = CommandLine::BinaryDirectory(); + cache_dir /= "migraphx_cache"; + + if (!std::filesystem::exists(cache_dir)) { + std::filesystem::create_directories(cache_dir); + } + migraphx_options["migraphx_model_cache_dir"] = cache_dir.string(); + + options.AppendExecutionProvider("MIGraphX", migraphx_options); + break; + } + case OnnxProvider::CUDA: { + OrtCUDAProviderOptions cuda_options; + cuda_options.device_id = gpu_; +#ifdef USE_ONNX_CUDART + cuda_options.has_user_compute_stream = true; + cuda_options.user_compute_stream = compute_stream_; +#endif + options.AppendExecutionProvider_CUDA(cuda_options); + break; + } + case OnnxProvider::CPU: + // The CPU execution provider is always available. 
+ break; + } + return options; +} + +OnnxNetwork::OnnxNetwork(const WeightsFile& file, const OptionsDict& opts, + OnnxProvider provider, bool cpu_wdl) + : onnx_env_(ORT_LOGGING_LEVEL_WARNING, "lc0"), + capabilities_{file.format().network_format().input(), + file.format().network_format().output(), + file.format().network_format().moves_left()}, + fp16_(file.onnx_model().data_type() == pblczero::OnnxModel::FLOAT16), + bf16_(file.onnx_model().data_type() == pblczero::OnnxModel::BFLOAT16), + cpu_wdl_(cpu_wdl), + provider_(provider) { + onnx_env_.DisableTelemetryEvents(); + + gpu_ = opts.GetOrDefault("gpu", 0); + +#ifdef USE_ONNX_CUDART + if (provider_ == OnnxProvider::CUDA || provider_ == OnnxProvider::TRT) { + cudaDeviceProp deviceProp = {}; + if (!cudaGetDeviceProperties(&deviceProp, gpu_)) { + CERR << "GPU: " << deviceProp.name; + CERR << "GPU memory: " << deviceProp.totalGlobalMem / std::pow(2.0f, 30) + << " Gb"; + int clockRate = 0; + ReportCUDAErrors( + cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, gpu_)); + CERR << "GPU clock frequency: " << clockRate / 1e3f << " MHz"; + } +#if CUDART_VERSION >= 12080 + int runtime_version; + ReportCUDAErrors(cudaRuntimeGetVersion(&runtime_version)); + if (runtime_version >= 12080) { + int attr; + ReportCUDAErrors( + cudaDeviceGetAttribute(&attr, cudaDevAttrGpuPciDeviceId, gpu_)); + uint32_t pci_device = attr; + CERR << "GPU device ID: " << std::hex << (pci_device & 0xffff) << ":" + << (pci_device >> 16); + ReportCUDAErrors( + cudaDeviceGetAttribute(&attr, cudaDevAttrGpuPciSubsystemId, gpu_)); + uint32_t pci_subsystem = attr; + CERR << "GPU subsystem ID: " << std::hex << (pci_subsystem & 0xffff) + << ":" << (pci_subsystem >> 16) << std::dec; + } +#endif + } +#endif + + int threads = + opts.GetOrDefault("threads", provider == OnnxProvider::CPU ? 1 : 0); + int default_batch = -1; + int default_steps = 1; + int default_min_batch = 1; + switch (provider) { + case OnnxProvider::DML: + case OnnxProvider::MIGRAPHX: + default_batch = 16; + default_steps = 4; + break; + case OnnxProvider::TRT: + default_min_batch = 4; + default: + break; + } + + int optimize = opts.GetOrDefault("optimize", 3); + batch_size_ = opts.GetOrDefault("batch", default_batch); + steps_ = opts.GetOrDefault("steps", default_steps); + min_batch_size_ = opts.GetOrDefault("min_batch", default_min_batch); + + // Sanity checks. + if (batch_size_ <= 0) { + batch_size_ = -1; // Variable batch size. 
+ steps_ = 1; + } + if (batch_size_ * steps_ > max_batch_size_) { + batch_size_ = max_batch_size_ / steps_; + } + + const auto& md = file.onnx_model(); + if (!md.has_input_planes()) { + throw Exception("NN doesn't have input planes defined."); + } + inputs_.emplace_back(md.input_planes()); + if (!md.has_output_policy()) { + throw Exception("NN doesn't have policy head defined."); + } + policy_head_ = outputs_.size(); + outputs_.emplace_back(md.output_policy()); + if (md.has_output_wdl()) { + wdl_head_ = outputs_.size(); + outputs_.emplace_back(md.output_wdl()); + } else if (md.has_output_value()) { + value_head_ = outputs_.size(); + outputs_.emplace_back(md.output_value()); + } else { + throw Exception("NN doesn't have value head."); + } + if (md.has_output_mlh()) { + mlh_head_ = outputs_.size(); + outputs_.emplace_back(md.output_mlh()); + } + uint64_t hash = 0; + if (provider == OnnxProvider::TRT) { + hash = std::hash()(md.model()); + } + switch (provider) { + case OnnxProvider::TRT: + case OnnxProvider::CUDA: +#ifdef USE_ONNX_CUDART + ReportCUDAErrors(cudaSetDevice(gpu_)); + ReportCUDAErrors(cudaStreamCreate(&compute_stream_)); + ReportCUDAErrors(cudaStreamCreate(&upload_stream_)); + ReportCUDAErrors(cudaStreamCreate(&download_stream_)); +#else + CERR << "WARNING: Simplified version without CUDA enhancements."; +#endif + break; + default: + break; + } + + for (int step = 1; step <= steps_; step++) + session_.emplace_back(onnx_env_, file.onnx_model().model().data(), + file.onnx_model().model().size(), + GetOptions(threads, batch_size_ * step, hash, optimize)); +} + +template +std::unique_ptr MakeOnnxNetwork(const std::optional& w, + const OptionsDict& opts) { + if (!w) throw Exception("The ONNX backend requires a network file."); + + if (w->has_onnx_model()) { + return std::make_unique(*w, opts, kProvider, false); + } else { + WeightsToOnnxConverterOptions converter_options; + converter_options.ir = opts.GetOrDefault("ir", -1); + converter_options.alt_mish = opts.GetOrDefault( + "alt_mish", kProvider == OnnxProvider::CPU ? true : false); + converter_options.alt_layernorm = opts.GetOrDefault( + "alt_layernorm", + kProvider == OnnxProvider::DML && + w->format().network_format().ffn_activation() == + pblczero::NetworkFormat::ACTIVATION_RELU_2 + ? true + : false); + converter_options.no_shape = opts.GetOrDefault("no_shape", false); + converter_options.policy_head = + opts.GetOrDefault("policy_head", "vanilla"); + converter_options.value_head = + opts.GetOrDefault("value_head", "winner"); + converter_options.no_wdl_softmax = true; + // No execution provider has a better mish version, some don't even have it. + converter_options.real_mish = false; + + std::string datatype; + if (opts.Exists("datatype")) { + datatype = opts.Get("datatype"); + } else { + bool fp16 = opts.GetOrDefault( + "fp16", kProvider == OnnxProvider::CPU ? false : true); + datatype = fp16 ? "f16" : "f32"; + } + converter_options.data_type = + WeightsToOnnxConverterOptions::StringToDataType(datatype); + converter_options.opset = opts.GetOrDefault( + "opset", converter_options.data_type == + WeightsToOnnxConverterOptions::DataType::kBFloat16 + ? 
22 + : 17); + + auto converted = ConvertWeightsToOnnx(*w, converter_options); + return std::make_unique(converted, opts, kProvider, true); + } +} + +#ifdef USE_MIGRAPHX +REGISTER_NETWORK("onnx-migraphx", MakeOnnxNetwork, 65) +#endif +#ifdef USE_ROCM +REGISTER_NETWORK("onnx-rocm", MakeOnnxNetwork, 64) +#endif +#ifdef USE_DML +REGISTER_NETWORK("onnx-dml", MakeOnnxNetwork, 63) +#endif +REGISTER_NETWORK("onnx-trt", MakeOnnxNetwork, 60) +REGISTER_NETWORK("onnx-cuda", MakeOnnxNetwork, 61) +REGISTER_NETWORK("onnx-cpu", MakeOnnxNetwork, 62) + +} // namespace onnx +} // namespace lczero diff --git a/src/neural/backends/onnx/onnx_kernels.cu b/src/neural/backends/onnx/onnx_kernels.cu new file mode 100644 index 0000000000..1da1d0f232 --- /dev/null +++ b/src/neural/backends/onnx/onnx_kernels.cu @@ -0,0 +1,94 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2025 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. 
+*/ + +#include + +#include "neural/backends/onnx/onnx_kernels.h" +#include "utils/exception.h" + +namespace lczero { +namespace onnx { + +template +__global__ void expandPlanes_kernel(DataType* output, const uint64_t* masks, + const DataType* values, unsigned n) { + unsigned index = threadIdx.x + blockDim.x * blockIdx.x; + index *= bits_per_thread; + unsigned planeIndex = index >> 6; + if (planeIndex >= n) return; + + uint64_t mask = masks[planeIndex]; + unsigned sqIndex = index & 0x3F; + DataType value = static_cast(values[planeIndex]); + DataType op[bits_per_thread] = {}; + mask >>= sqIndex; + for (unsigned i = 0; i < bits_per_thread; i++) { + if (mask & 0x1) { + op[i] = value; + } + mask >>= 1; + } + for (unsigned i = 0; i < bits_per_thread; i++) { + output[index + i] = op[i]; + } +} + +template +void expandPlanesOnnx(DataType* output, const void* input, unsigned n, + cudaStream_t stream) { + constexpr unsigned bits_per_thread = 2; + int threads = n * 8 * 8 / bits_per_thread; + const int blockSize = 256; + int blocks = DivUp(threads, blockSize); + + const uint64_t* masks = static_cast(input); + const DataType* values = reinterpret_cast(masks + n); + + expandPlanes_kernel + <<>>(output, masks, values, n); + + ReportCUDAErrors(cudaGetLastError()); +} + +void CudaError(cudaError_t status, const char* file, int line) { + if (status != cudaSuccess) { + auto err = std::string("CUDA error: ") + cudaGetErrorString(status) + " (" + + file + ":" + std::to_string(line) + ") "; + throw Exception(err); + } +} + +template void expandPlanesOnnx(half* output, const void* input, + unsigned n, cudaStream_t stream); +template void expandPlanesOnnx(float* output, const void* input, + unsigned n, cudaStream_t stream); +template void expandPlanesOnnx<__nv_bfloat16>(__nv_bfloat16* output, + const void* input, unsigned n, + cudaStream_t stream); + +} // namespace onnx +} // namespace lczero diff --git a/src/neural/backends/onnx/onnx_kernels.h b/src/neural/backends/onnx/onnx_kernels.h new file mode 100644 index 0000000000..f16b981da7 --- /dev/null +++ b/src/neural/backends/onnx/onnx_kernels.h @@ -0,0 +1,49 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2025 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + + Additional permission under GNU GPL version 3 section 7 + + If you modify this Program, or any covered work, by linking or + combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA + Toolkit and the NVIDIA CUDA Deep Neural Network library (or a + modified version of those libraries), containing parts covered by the + terms of the respective license agreement, the licensors of this + Program grant you additional permission to convey the resulting work. +*/ + +#pragma once + +#include +#include +#include + +namespace lczero { +namespace onnx { + +// Expand input planes from bitmask to floating point tensors. It is used as a +// preprocessing step of ONNX models. 
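+// The input pointer holds n uint64_t bit masks followed by n DataType fill
+// values (the device-side layout assembled by the uploader in network_onnx.cc);
+// the kernel writes n * 64 output elements, storing a plane's value at
+// output[p * 64 + sq] for every set bit sq of mask p and zero elsewhere.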
+template +void expandPlanesOnnx(DataType* output, const void* input, unsigned n, + cudaStream_t stream); + +#define ReportCUDAErrors(status) CudaError(status, __FILE__, __LINE__) +void CudaError(cudaError_t status, const char* file, int line); + +inline int DivUp(int a, int b) { return (a + b - 1) / b; } + +} // namespace onnx +} // namespace lczero diff --git a/src/neural/backends/opencl/OpenCL.h b/src/neural/backends/opencl/OpenCL.h index 369aae7b68..08b0324d58 100644 --- a/src/neural/backends/opencl/OpenCL.h +++ b/src/neural/backends/opencl/OpenCL.h @@ -36,7 +36,13 @@ using net_t = float; #include #include +#if __has_include("CL/opencl.hpp") +#include "CL/opencl.hpp" +#elif __has_include("OpenCL/opencl.hpp") +#include "OpenCL/opencl.hpp" +#else #include "opencl.hpp" +#endif #include "neural/backends/opencl/OpenCLBuffers.h" #include "neural/backends/opencl/OpenCLParams.h" diff --git a/src/neural/backends/sycl/common_kernels.dp.cpp b/src/neural/backends/sycl/common_kernels.dp.cpp index 65335e5e6a..8cae7bbf49 100644 --- a/src/neural/backends/sycl/common_kernels.dp.cpp +++ b/src/neural/backends/sycl/common_kernels.dp.cpp @@ -20,7 +20,6 @@ */ #include -#include "dpct/dpct.hpp" #include #include @@ -881,7 +880,7 @@ void globalAvgPool_kernel(T* output, const T* input, "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate __shfl_down_sync. */ - S += dpct::shift_sub_group_left(item_ct1.get_sub_group(), S, offset); + S += sycl::shift_group_left(item_ct1.get_sub_group(), S, offset); } float avg = S / elementsPerWarp; @@ -960,8 +959,10 @@ void globalScale(int N, int C, T* output, const T* input, const T* scaleBias, sycl::range<3>(1, 1, kBlocks) * sycl::range<3>(1, 1, kBlockSize), sycl::range<3>(1, 1, kBlockSize)), [=](sycl::nd_item<3> item_ct1) { - ((sycl::half*)output, (sycl::half*)input, (sycl::half*)scaleBias, - (sycl::half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C, activation); + globalScale_kernel_fp16_nhwc( + (sycl::half*)output, (sycl::half*)input, (sycl::half*)scaleBias, + (sycl::half*)prevLayerBias, N * C * 8 * 8, C, 8 * 8 * C, + activation, item_ct1); }); } else { sycl_queue.parallel_for( @@ -1126,7 +1127,7 @@ void softmax_opt_64_kernel(T* output, const T* input, "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate __shfl_sync. */ - maxval = dpct::select_from_sub_group(item_ct1.get_sub_group(), maxval, 0); + maxval = sycl::select_from_group(item_ct1.get_sub_group(), maxval, 0); ex[0] = sycl::exp(x[0] - maxval); ex[1] = sycl::exp(x[1] - maxval); @@ -1139,7 +1140,7 @@ void softmax_opt_64_kernel(T* output, const T* input, "--use-experimental-features=masked-sub-group-operation" to use the experimental helper function to migrate __shfl_sync. 
*/ - Sum = dpct::select_from_sub_group(item_ct1.get_sub_group(), Sum, 0); + Sum = sycl::select_from_group(item_ct1.get_sub_group(), Sum, 0); ex[0] = ex[0] / Sum; ex[1] = ex[1] / Sum; @@ -1162,11 +1163,16 @@ void softmax_opt_64_kernel(T* output, const T* input, // C threads per block, N blocks template void softmax_kernel(T* output, const T* input, const T* input2, - const sycl::nd_item<3> &item_ct1, float &sum, float &maxval) { + const sycl::nd_item<3> &item_ct1, float &localsum, + float &localmax) { int n = item_ct1.get_group(2); int c = item_ct1.get_local_id(2); int C = item_ct1.get_local_range(2); int index = n * C + c; + sycl::atomic_ref maxval(localmax); + sycl::atomic_ref sum(localsum); // softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis) @@ -1183,7 +1189,7 @@ void softmax_kernel(T* output, const T* input, const T* input2, // Get max across warp first, and then update across C dimension float warpmax = warpMax(x, item_ct1); - if ((c & 0x1F) == 0) atomicMaxFloat(&maxval, warpmax); + if ((c & 0x1F) == 0) maxval.fetch_max(warpmax); item_ct1.barrier(sycl::access::fence_space::local_space); @@ -1195,8 +1201,7 @@ void softmax_kernel(T* output, const T* input, const T* input2, // update shared memory sum across C dimension if ((c & 0x1F) == 0) - dpct::atomic_fetch_add(&sum, - val); + sum.fetch_add(val); item_ct1.barrier(sycl::access::fence_space::local_space); @@ -1243,7 +1248,8 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que } } -__dpct_inline__ float shared_sum_for_layer_norm( +[[gnu::always_inline]] +inline float shared_sum_for_layer_norm( float x, const sycl::nd_item<3>& item_ct1, sycl::local_accessor sum) { // compute warp-wide sum @@ -1676,6 +1682,74 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add, }); } +template +static void genOffsetPointers_kernel(T** offsets, int heads, int block_size, + int depth, int d_model, T* k, T* q, T* b1, + T* v, T* b2, + const sycl::nd_item<1>& item_ct) { + const int i = item_ct.get_global_id(0) * kWorkPerThread; + if (i >= block_size) return; + const int h = i % heads; + const int n = i / heads; + int w; + T* res[kWorkPerThread]; + for (w = 0; w < kWorkPerThread; w++) { + res[w] = k + h * depth + 64 * d_model * n + w * depth; + offsets[i + w] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = q + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b1 + i * 64 * 64 + w * 64 * 64; + offsets[i + w + 2 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = v + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 3 * block_size] = res[w]; + } + + for (w = 0; w < kWorkPerThread; w++) { + res[w] = b2 + h * depth + 64 * d_model * n + w * depth; + offsets[i + w + 4 * block_size] = res[w]; + } +} + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, + T* v, T* b2, sycl::queue& sycl_queue) { + const int block_size = heads * max_batch; + // Process two elements per thread to use 128 bit store instructions. + constexpr int kWorkPerThread = 2; + constexpr int kWorkGroupSize = 128; + if (block_size % kWorkPerThread != 0) { + // Handle odd block sizes. 
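Illustrative sketch (not part of the patch): the host-side equivalent of the pointer table genOffsetPointers builds on the device, matching the loop this patch removes from EncoderBlock::Eval. It lays out five consecutive blocks of heads * max_batch pointers feeding the batched K, Q, attention-score, V and output GEMMs; the helper name is made up.

    #include <vector>

    template <typename T>
    void FillOffsetPointersReference(std::vector<T*>& table, int heads,
                                     int max_batch, int depth, int d_model,
                                     T* k, T* q, T* b1, T* v, T* b2) {
      const int block = heads * max_batch;
      table.assign(5 * block, nullptr);
      for (int i = 0; i < block; ++i) {
        const int h = i % heads;
        const int n = i / heads;
        table[i] = k + h * depth + 64 * d_model * n;               // K heads
        table[i + block] = q + h * depth + 64 * d_model * n;       // Q heads
        table[i + 2 * block] = b1 + i * 64 * 64;                   // 64x64 score tiles
        table[i + 3 * block] = v + h * depth + 64 * d_model * n;   // V heads
        table[i + 4 * block] = b2 + h * depth + 64 * d_model * n;  // output tiles
      }
    }
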
+ sycl::range<1> global(DivUp(block_size, kWorkGroupSize)); + sycl::range<1> local(kWorkGroupSize); + sycl_queue.parallel_for(sycl::nd_range<1>(global*local, local), + [=](sycl::nd_item<1> item_ct) { + genOffsetPointers_kernel(offsets, heads, block_size, + depth, d_model, k, q, b1, + v, b2, item_ct); + }); + } else { + // Handle even block size + sycl::range<1> global(DivUp(block_size, kWorkGroupSize*kWorkPerThread)); + sycl::range<1> local(kWorkGroupSize); + sycl_queue.parallel_for(sycl::nd_range<1>(global*local, local), + [=](sycl::nd_item<1> item_ct) { + genOffsetPointers_kernel(offsets, heads, block_size, + depth, d_model, k, q, b1, + v, b2, item_ct); + }); + } +} + // Template instantiation. template void copyTypeConverted(sycl::half* op, float* ip, int N, sycl::queue &sycl_queue); template void copyTypeConverted(float* op, sycl::half* ip, int N, sycl::queue &sycl_queue); @@ -1950,5 +2024,13 @@ template void applyInputGating(sycl::half* output, const sycl::half* template void applyInputGating(float* output, const float* input, const float* mult, const float* add, int N, int C, int output_size, sycl::queue &sycl_queue); + +template void genOffsetPointers(float** offsets, int heads, int max_batch, int depth, + int d_model, float* k, float* q, float* b1, + float* v, float* b2, sycl::queue& sycl_queue); + +template void genOffsetPointers(sycl::half** offsets, int heads, int max_batch, int depth, + int d_model, sycl::half* k, sycl::half* q, sycl::half* b1, + sycl::half* v, sycl::half* b2, sycl::queue& sycl_queue); } // namespace sycldnn_backend } // namespace lczero diff --git a/src/neural/backends/sycl/cuBlasContext.h b/src/neural/backends/sycl/cuBlasContext.h index 5e201b82bd..f330ce8150 100644 --- a/src/neural/backends/sycl/cuBlasContext.h +++ b/src/neural/backends/sycl/cuBlasContext.h @@ -61,7 +61,7 @@ class cuBlasContextManager{ #include "hip/hip_runtime.h" -#include "hipblas.h" +#include "hipblas/hipblas.h" class hipBlasContextManager; static hipBlasContextManager *_hipBlasContextManager; diff --git a/src/neural/backends/sycl/fp16_kernels.dp.cpp b/src/neural/backends/sycl/fp16_kernels.dp.cpp index bb89b65a97..a6921e9733 100644 --- a/src/neural/backends/sycl/fp16_kernels.dp.cpp +++ b/src/neural/backends/sycl/fp16_kernels.dp.cpp @@ -20,14 +20,9 @@ */ #include -#include "dpct/dpct.hpp" #include "sycl_common.h" #include "neural/backends/shared/activation.h" -// Allow building on an old architecture. 
-#if DPCT_COMPATIBILITY_TEMP < 530 -#define SKIP_FP16_BITS 1 -#endif #include "winograd_helper.h" namespace lczero { @@ -597,9 +592,7 @@ void OutputInputTransformKernel_fp16_shmem_board( int c = k; // top-left { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -617,9 +610,7 @@ void OutputInputTransformKernel_fp16_shmem_board( // top-right { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -637,9 +628,7 @@ void OutputInputTransformKernel_fp16_shmem_board( // bottom-left { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -657,9 +646,7 @@ void OutputInputTransformKernel_fp16_shmem_board( // bottom-right { - sycl::half inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + sycl::half inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) diff --git a/src/neural/backends/sycl/kernels.h b/src/neural/backends/sycl/kernels.h index 05954e32c2..2330cae9f5 100644 --- a/src/neural/backends/sycl/kernels.h +++ b/src/neural/backends/sycl/kernels.h @@ -20,7 +20,6 @@ */ #include -#include "dpct/dpct.hpp" #include "sycl_common.h" #include "neural/backends/shared/activation.h" @@ -146,5 +145,9 @@ void inputPreprocessForAttentionBody(T* output, const T* input, template void applyInputGating(T* output, const T* input, const T* mult, const T* add, int N, int HW, int C, sycl::queue &sycl_queue); + +template +void genOffsetPointers(T** offsets, int heads, int max_batch, int depth, + int d_model, T* k, T* q, T* b1, T* v, T* b2, sycl::queue &sycl_queue); } // namespace sycldnn_backend } // namespace lczero diff --git a/src/neural/backends/sycl/layers.cc.dp.cpp b/src/neural/backends/sycl/layers.cc.dp.cpp index fa49425a0e..8a046ab292 100644 --- a/src/neural/backends/sycl/layers.cc.dp.cpp +++ b/src/neural/backends/sycl/layers.cc.dp.cpp @@ -20,7 +20,6 @@ */ #include -#include "dpct/dpct.hpp" #include "layers.h" #include @@ -28,7 +27,7 @@ #include #ifdef USE_HIPBLAS -#include "hipblas.h" +#include "hipblas/hipblas.h" #include "cuBlasContext.h" #elif defined(USE_CUBLAS) #include @@ -46,12 +45,15 @@ #include "neural/network.h" #include "neural/tables/attention_policy_map.h" #include "utils/fp16_utils.h" -#include "dpct/lib_common_utils.hpp" #include #ifdef USE_HIPBLAS +#if hipblasVersionMajor < 3 +#define HIPBLAS_COMPUTE_16F HIPBLAS_R_16F +#define HIPBLAS_COMPUTE_32F HIPBLAS_R_32F +#endif #define transpose_type hipblasOperation_t #define transpose_type_transpose HIPBLAS_OP_T #define transpose_type_notranspose HIPBLAS_OP_N @@ -237,17 +239,15 @@ void SELayer::Eval(int N, float* output, const float* input, sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_, N, C, 
&alpha, w1_, C, op2, C, &beta, op1, numFc1Out_)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) @@ -256,16 +256,14 @@ void SELayer::Eval(int N, float* output, const float* input, sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_, N, C, &alpha, w1_, C, op2, C, &beta, op1, numFc1Out_); - - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -284,33 +282,30 @@ void SELayer::Eval(int N, float* output, const float* input, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, N, numFc1Out_, &alpha, w2_, numFc1Out_, op1, numFc1Out_, &beta, op2, 2 * C)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasSgemm(handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, N, numFc1Out_, &alpha, w2_, numFc1Out_, op1, numFc1Out_, &beta, op2, 2 * C); - hipStreamSynchronize(hipStreamHandle); }); }); @@ -373,17 +368,15 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemm(handle, transpose_type_transpose, transpose_type_notranspose, numFc1Out_, N, C, &alpha, ((const half *)w1_), C, ((const half *)op2), C, &beta, ((half *)op1), numFc1Out_)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -391,10 +384,9 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = - sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm(handle, transpose_type_transpose, @@ -402,7 +394,6 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu ((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C, &beta, ((hipblasHalf *)op1), numFc1Out_); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -418,9 +409,9 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu sycl_queue_.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + 
cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); // 3. Second fully connected layer. @@ -428,16 +419,13 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu numFc1Out_, &alpha, ((const half *)w2_), numFc1Out_, ((const half *)op1), numFc1Out_, &beta, ((half *)op2), 2 * C)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm( @@ -446,7 +434,6 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu ((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2), 2 * C); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -562,9 +549,9 @@ template <> sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemm(handle, transpose_type_transpose, transpose_type_notranspose, num_outputs, @@ -572,16 +559,13 @@ template <> ((const half *)input_tensor), num_inputs, &beta, ((half *)output_tensor), num_outputs)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm( @@ -590,7 +574,6 @@ template <> num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta, ((hipblasHalf *)output_tensor), num_outputs); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -625,9 +608,9 @@ void FCLayer::Eval(int N, float* output_tensor, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); @@ -636,17 +619,14 @@ void FCLayer::Eval(int N, float* output_tensor, input_tensor, num_inputs, &beta, output_tensor, num_outputs)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); @@ -655,7 +635,6 @@ void FCLayer::Eval(int N, float* output_tensor, input_tensor, num_inputs, &beta, output_tensor, num_outputs); - hipStreamSynchronize(hipStreamHandle); }); }); @@ -939,9 +918,9 @@ template <> 
sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -950,8 +929,6 @@ template <> batchSize, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -959,18 +936,16 @@ template <> hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &alpha, B, HIPBLAS_R_16F, N, N * K, A, HIPBLAS_R_16F, K, K * M, - &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_R_16F, + &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_COMPUTE_16F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -1008,8 +983,8 @@ template <> void BaseLayer::cublasRowMajorMatrixMul(const float* A, const #ifdef USE_CUBLAS sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -1018,25 +993,21 @@ template <> void BaseLayer::cublasRowMajorMatrixMul(const float* A, const batchSize, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { //auto d_A = b_A.get_access(cgh); - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, HIPBLAS_R_32F, N, N * K, A, HIPBLAS_R_32F, K, K * M, &floatZero, Out, HIPBLAS_R_32F, N, N * M, - batchSize, HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT); + batchSize, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); - }); }); #else @@ -1192,9 +1163,9 @@ template <> #ifdef USE_CUBLAS sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); @@ -1203,22 +1174,18 @@ template <> N * K, A, CUDA_R_16F, K, 0, &zero_h, Out, CUDA_R_16F, N, N * M, batchSize, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = - sycl::get_native(sycl_queue); + 
cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &alpha, B, HIPBLAS_R_16F, N, N * K, A, HIPBLAS_R_16F, K, - 0, &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_R_16F, + 0, &beta, Out, HIPBLAS_R_16F, N, N * M, batchSize, HIPBLAS_COMPUTE_16F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); #else @@ -1257,9 +1224,9 @@ void Conv1Layer::cublasSpecialMatrixMul(const float* A, const float* B, #ifdef USE_CUBLAS sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); @@ -1268,25 +1235,21 @@ void Conv1Layer::cublasSpecialMatrixMul(const float* A, const float* B, N * K, A, CUDA_R_32F, K, 0, &floatZero, Out, CUDA_R_32F, N, N * M, batchSize, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); #elif defined(USE_HIPBLAS) sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transpose_type_notranspose, transpose_type_notranspose, N, M, K, &floatOne, B, HIPBLAS_R_32F, N, N * K, A, HIPBLAS_R_32F, K, 0, &floatZero, Out, HIPBLAS_R_32F, N, N * M, - batchSize, HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT); - - hipStreamSynchronize(hipStreamHandle); + batchSize, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT); }); }); @@ -1305,14 +1268,11 @@ void Conv1Layer::Eval(int N, DataType* output, const DataType* input, size_t /*scratch_size*/, sycl::queue &sycl_queue, DataType***) { - sycl_queue.wait(); - //CERR << "Conv1Layer::Eval. "; cublasSpecialMatrixMul(weights_, input, output, C, H * W, c_input_, N, sycl_queue); // CERR << "cublasSpecialMatrixMul. "; - sycl_queue.wait(); if (use_bias_){ // CERR << "addBias. " << N << " " << C << " " << H << " " << W; addBias_NCHW(output, output, biases_, N, C, H, W, act_, sycl_queue); @@ -1320,8 +1280,6 @@ void Conv1Layer::Eval(int N, DataType* output, const DataType* input, addVectors(output, output, (DataType*)nullptr, N * C * H * W, N * C * H * W, 0, act_, sycl_queue); // CERR << "addVectors. 
"; } - - sycl_queue.wait(); } template @@ -1792,24 +1750,22 @@ static void cublasXgemm(transpose_type transa, unsigned short alpha_h = FP32toFP16(alpha); unsigned short beta_h = FP32toFP16(beta); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemm( handle, transa, transb, m, n, k, (const half*)&alpha_h, ((const half *)A), lda, ((const half *)B), ldb, (const half*)&beta_h, ((half *)C), ldc)); - cudaStreamSynchronize(cudaStreamHandle); }); }); } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha, (const float*)A, lda, (const float*)B, ldb, &beta, (float*)C, ldc)); - cudaStreamSynchronize(cudaStreamHandle); }); }); @@ -1820,21 +1776,19 @@ static void cublasXgemm(transpose_type transa, unsigned short alpha_h = FP32toFP16(alpha); unsigned short beta_h = FP32toFP16(beta); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A, lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc); - hipStreamSynchronize(hipStreamHandle); }); }); } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto hipStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); hipblasSetStream(handle, hipStreamHandle); hipblasSgemm(handle, transa, transb, m, n, k, &alpha, (const float*)A, lda, (const float*)B, ldb, &beta, (float*)C, ldc); - hipStreamSynchronize(hipStreamHandle); }); }); } @@ -1860,8 +1814,8 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran unsigned short beta_h = FP32toFP16(beta); sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -1869,7 +1823,6 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC, batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); }); @@ -1879,9 +1832,9 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + auto cudaStreamHandle = ih.get_native_queue(); 
cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasGemmStridedBatchedEx( @@ -1889,7 +1842,6 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - cudaStreamSynchronize(cudaStreamHandle); }); }); @@ -1902,34 +1854,32 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B, HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC, - batchCount, HIPBLAS_R_16F, HIPBLAS_GEMM_DEFAULT); + batchCount, HIPBLAS_COMPUTE_16F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasGemmStridedBatchedEx( handle, transa, transb, m, n, k, &alpha, A, HIPBLAS_R_32F, lda, strideA, B, HIPBLAS_R_32F, ldb, strideB, &beta, C, HIPBLAS_R_32F, ldc, strideC, - batchCount, HIPBLAS_R_32F, HIPBLAS_GEMM_DEFAULT); + batchCount, HIPBLAS_COMPUTE_32F, HIPBLAS_GEMM_DEFAULT); - hipStreamSynchronize(hipStreamHandle); }); }); @@ -1957,16 +1907,14 @@ static void cublasXGemmBatched(transpose_type transa, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasHgemmBatched( handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -1974,16 +1922,14 @@ static void cublasXGemmBatched(transpose_type transa, } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { - auto cudaStreamHandle = sycl::get_native(sycl_queue); + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto cudaStreamHandle = ih.get_native_queue(); cublasSetStream(handle, cudaStreamHandle); ReportCUBLASErrors(cublasSgemmBatched( handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, ldb, &beta, (float**)C, ldc, batchCount)); - cudaStreamSynchronize(cudaStreamHandle); - }); }); @@ -1999,17 +1945,15 @@ static void cublasXGemmBatched(transpose_type transa, sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasHgemmBatched( handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda, (hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, 
batchCount); - hipStreamSynchronize(hipStreamHandle); - }); }); @@ -2017,16 +1961,15 @@ static void cublasXGemmBatched(transpose_type transa, } else { sycl_queue.submit([&](sycl::handler &cgh) { - cgh.host_task([=](sycl::interop_handle ih) { + cgh.ext_codeplay_enqueue_native_command([=](sycl::interop_handle ih) { + auto hipStreamHandle = ih.get_native_queue(); - auto hipStreamHandle = sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); hipblasSgemmBatched( handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, ldb, &beta, (float**)C, ldc, batchCount); - hipStreamSynchronize(hipStreamHandle); }); @@ -2179,27 +2122,13 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { - std::vector offsets(encoder_heads_ * max_batch_size_ * 5); - for (int i = 0; i < encoder_heads_ * max_batch_size_; i++) { - int h = i % encoder_heads_; - int n = i / encoder_heads_; - offsets[i] = mha_k + h * depth + 64 * d_model * n; - offsets[i + encoder_heads_ * max_batch_size_] = - mha_q + h * depth + 64 * d_model * n; - offsets[i + 2 * encoder_heads_ * max_batch_size_] = - buffer1 + i * 64 * 64; - offsets[i + 3 * encoder_heads_ * max_batch_size_] = - mha_v + h * depth + 64 * d_model * n; - offsets[i + 4 * encoder_heads_ * max_batch_size_] = - buffer2 + h * depth + 64 * d_model * n; - } *offset_pointers = sycl::malloc_device( encoder_heads_ * max_batch_size_ * 5, sycl_queue_); - - sycl_queue.memcpy(*offset_pointers, offsets.data(), - encoder_heads_ * max_batch_size_ * 5 * sizeof(DataType*)).wait(); + genOffsetPointers(*offset_pointers, encoder_heads_, max_batch_size_, + depth, d_model, mha_k, mha_q, buffer1, + mha_v, buffer2, sycl_queue_); } cublasXGemmBatched(transpose_type_transpose, transpose_type_notranspose, diff --git a/src/neural/backends/sycl/layers.h b/src/neural/backends/sycl/layers.h index 6682429510..850e29ecc0 100644 --- a/src/neural/backends/sycl/layers.h +++ b/src/neural/backends/sycl/layers.h @@ -22,8 +22,6 @@ #pragma once #include -#include "dpct/dpct.hpp" -#include "dpct/blas_utils.hpp" #include diff --git a/src/neural/backends/sycl/network_sycl.cc.dp.cpp b/src/neural/backends/sycl/network_sycl.cc.dp.cpp index 873545ab8c..11683c8aae 100644 --- a/src/neural/backends/sycl/network_sycl.cc.dp.cpp +++ b/src/neural/backends/sycl/network_sycl.cc.dp.cpp @@ -22,7 +22,6 @@ #define DPCT_COMPAT_RT_VERSION 12020 #include -#include "dpct/dpct.hpp" #include #include #include @@ -202,28 +201,47 @@ class SyclNetwork : public Network { max_batch_size_ = options.GetOrDefault("max_batch", 1024); + // Get all available platforms + auto platforms = sycl::platform::get_platforms(); + if (platforms.empty()) { + throw Exception("No SYCL platform found."); + } + showPlatformInfo(platforms); + + // A vector to store all sycl devices. + std::vector devices; - int total_gpus = dpct::dev_mgr::instance().device_count(); + for (const auto& platform : platforms) { + auto platform_devices = platform.get_devices(); + devices.insert(devices.end(), platform_devices.begin(), platform_devices.end()); + } - if (gpu_id_ >= total_gpus) + if (gpu_id_ >= (int)devices.size() || gpu_id_ < 0) throw Exception("Invalid GPU Id: " + std::to_string(gpu_id_)); - - //dpct::dev_mgr::instance().get_device(gpu_id_).get_device_info(deviceProp); - - sycl_queue_ = new sycl::queue{dpct::dev_mgr::instance().get_device(gpu_id_), [] (sycl::exception_list exceptions) { - + // Is it a cpu device? 
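Illustrative sketch (not part of the patch): the flat device index used here is simply every device of every platform, in platform order, so a standalone listing like the one below (assumes a SYCL 2020 toolchain) shows which index the backend's gpu option selects.

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main() {
      int index = 0;
      for (const auto& platform : sycl::platform::get_platforms()) {
        for (const auto& device : platform.get_devices()) {
          std::cout << index++ << ": "
                    << device.get_info<sycl::info::device::name>()
                    << (device.is_cpu() ? " (CPU)"
                        : device.is_gpu() ? " (GPU)" : " (other)")
                    << "\n";
        }
      }
    }
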
+ is_cpu_ = devices[gpu_id_].is_cpu(); + // Get the number of compute units(execution units). + compute_units_ = devices[gpu_id_].get_info(); + // Get context. + sycl::context context{devices[gpu_id_]}; + auto exceptions_handler = [&] (sycl::exception_list exceptions) { for (std::exception_ptr const& e : exceptions) { - try { - std::rethrow_exception(e); - } catch(sycl::exception const& e) { - - std::cout << "Caught asynchronous SYCL exception during GEMM:\n" << e.what() << std::endl; - } - - } - }, sycl::property_list{sycl::property::queue::in_order{}}}; + try { + std::rethrow_exception(e); + } catch(sycl::exception const& e) { + CERR + << "Caught asynchronous SYCL exception during GEMM:\n" + << e.what() + << "\n "; + std::terminate(); + } + } + }; + + sycl_queue_ = new sycl::queue{context, devices[gpu_id_], + exceptions_handler, sycl::property_list{sycl::property::queue::in_order{}} }; showDeviceInfo(*sycl_queue_); @@ -243,10 +261,12 @@ class SyclNetwork : public Network { if (fp16) { - dpct::has_capability_or_fail(sycl_queue_->get_device(), {sycl::aspect::fp16}); - CERR << "Using Fp16 "; + if (!sycl_queue_->get_device().has(sycl::aspect::fp16)) { + throw Exception("Requested fp16 is not supported by the device"); + } + CERR << "Using Fp16 "; } else { - CERR << "Using Fp32 "; + CERR << "Using Fp32 "; } const int kNumInputPlanes = kInputPlanes; @@ -741,93 +761,72 @@ class SyclNetwork : public Network { batchSize, spare1, flow, spare2, scratch_mem, scratch_size_, io_sycl_queue_, head_offset_pointers); // Entire Attention policy head except for the // policy map - io_sycl_queue_.wait(); if (fp16) { network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer - io_sycl_queue_.wait(); copyTypeConverted(opPol, (sycl::half*)spare2, batchSize * kNumOutputPolicy, io_sycl_queue_); // POLICY output - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer // POLICY output - io_sycl_queue_.wait(); } } else if (conv_policy_) { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy conv1 - io_sycl_queue_.wait(); network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy conv2 - io_sycl_queue_.wait(); if (fp16) { network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer - io_sycl_queue_.wait(); copyTypeConverted(opPol, (sycl::half*)(spare1), batchSize * kNumOutputPolicy, io_sycl_queue_); // POLICY output - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opPol, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // policy map layer // POLICY output - io_sycl_queue_.wait(); } } else { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // pol conv - io_sycl_queue_.wait(); if (fp16) { network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // pol FC - io_sycl_queue_.wait(); copyTypeConverted(opPol, (sycl::half*)(spare2), batchSize * kNumOutputPolicy, io_sycl_queue_); // POLICY - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opPol, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // pol FC // POLICY - io_sycl_queue_.wait(); } } - 
// Copy policy output from device memory to host memory. - - io_sycl_queue_.memcpy(io->op_policy_mem_, io->op_policy_mem_gpu_, sizeof(float) * kNumOutputPolicy * batchSize); - io_sycl_queue_.wait(); - // value head if (fp16) { network_[l++]->Eval(batchSize, spare1, flow, spare2, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // value head - io_sycl_queue_.wait(); copyTypeConverted(opVal, (sycl::half*)spare1, wdl_ ? 3 * batchSize : batchSize, io_sycl_queue_); - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opVal, flow, spare2, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // value head - io_sycl_queue_.wait(); } if (moves_left_) { @@ -836,13 +835,9 @@ class SyclNetwork : public Network { network_[l++]->Eval(batchSize, spare1, flow, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // moves conv or embedding - io_sycl_queue_.wait(); - network_[l++]->Eval(batchSize, spare2, spare1, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); // moves FC1 - io_sycl_queue_.wait(); - // Moves left FC2 if (fp16) { // TODO: consider fusing the bias-add of FC2 with format conversion. @@ -851,30 +846,29 @@ class SyclNetwork : public Network { network_[l++]->Eval(batchSize, spare1, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); - io_sycl_queue_.wait(); copyTypeConverted(opMov, (sycl::half*)(spare1), batchSize, io_sycl_queue_); - io_sycl_queue_.wait(); } else { network_[l++]->Eval(batchSize, (DataType*)opMov, spare2, nullptr, scratch_mem, scratch_size_, io_sycl_queue_, nullptr); - io_sycl_queue_.wait(); } } + + // Copy policy output from device memory to host memory. + auto event = io_sycl_queue_.memcpy(io->op_policy_mem_, io->op_policy_mem_gpu_, sizeof(float) * kNumOutputPolicy * batchSize); - if (multi_stream_) { - io_sycl_queue_.wait(); - } else { - io_sycl_queue_.wait(); + if (!multi_stream_) { //ReportCUDAErrors( // DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())); // The next thread can start using the GPU now. lock_.unlock(); } + event.wait(); + if (wdl_) { // Value softmax done cpu side. for (int i = 0; i < batchSize; i++) { @@ -888,7 +882,7 @@ class SyclNetwork : public Network { float sum = w + d + l; w /= sum; l /= sum; - d = 1.0f - w - l; + d /= sum; io->op_value_mem_shared_[3 * i + 0] = w; io->op_value_mem_shared_[3 * i + 1] = d; io->op_value_mem_shared_[3 * i + 2] = l; @@ -916,14 +910,18 @@ class SyclNetwork : public Network { return capabilities_; } + // Check if device is the cpu for thread handling. + bool IsCpu() const override { return is_cpu_; } + + int GetThreads() const override { return 1 + multi_stream_; } + + int GetMiniBatchSize() const override { + if (is_cpu_) return 47; + // Simple heuristic that seems to work for a wide range of GPUs. + return 2 * compute_units_; + } + std::unique_ptr NewComputation() override { - // Set correct gpu id for this computation (as it might have been called - // from a different thread). - /* - DPCT1093:90: The "gpu_id_" device may be not the one intended for use. - Adjust the selected device if needed. 
- */ - dpct::select_device(gpu_id_); return std::make_unique>(this, wdl_, moves_left_); } @@ -953,6 +951,7 @@ class SyclNetwork : public Network { int gpu_id_; int l2_cache_size_; int max_batch_size_; + int compute_units_; bool wdl_; bool moves_left_; bool use_res_block_winograd_fuse_opt_; // fuse operations inside the residual @@ -960,10 +959,12 @@ class SyclNetwork : public Network { bool multi_stream_; // run multiple parallel network evals bool allow_cache_opt_; // try to fit residual block activations in L2 cache + // Currently only one NN Eval can happen a time (we can fix this if needed // by allocating more memory). mutable std::mutex lock_; - sycl::queue * sycl_queue_; + sycl::queue* sycl_queue_; + bool is_cpu_; int numBlocks_; @@ -997,15 +998,52 @@ class SyclNetwork : public Network { mutable std::mutex inputs_outputs_lock_; std::list> free_inputs_outputs_; - void showDeviceInfo(const sycl::queue & mqueue) const { - CERR << "PLATFORM: " << mqueue.get_device().get_platform().get_info(); - CERR << "GPU: " << mqueue.get_device().get_info(); - CERR << "GPU memory: " << mqueue.get_device().get_info(); - CERR << "GPU clock frequency: " << mqueue.get_device().get_info(); - CERR << "L2 cache capacity: " << mqueue.get_device().get_info(); - CERR << "Global memory Size: " << mqueue.get_device().get_info(); - - } + void showDeviceInfo(const sycl::queue &mqueue) const { + CERR << "Device-Info..."; + CERR << "Platform: " + << mqueue.get_device().get_platform().get_info() + << " selected"; + std::string device_type = mqueue.get_device().is_gpu() ? "GPU" : "CPU"; + CERR << device_type << ": " + << mqueue.get_device().get_info(); + CERR << device_type << ": " + << mqueue.get_device().get_info() / (1024 * 1024) + << " MB (max allocation)"; + CERR << device_type << " clock frequency: " + << mqueue.get_device().get_info() + << " MHz"; + CERR << "L2 cache capacity: " + << mqueue.get_device().get_info() / (1024) + << " KB"; + CERR << "Global memory size: " + << mqueue.get_device().get_info() / (1024 * 1024) + << " MB"; + CERR << "...Device-Info-End"; + } + + void showPlatformInfo(const std::vector& platforms) { + CERR << "Platform-List..."; + for (size_t i = 0; i < platforms.size(); ++i) { + std::string version = platforms[i].get_info(); + + for (const auto& device : platforms[i].get_devices()) { + std::string device_type; + switch (device.get_info()) { + case sycl::info::device_type::gpu: + device_type = "GPU"; break; + case sycl::info::device_type::cpu: + device_type = "CPU"; break; + default: + device_type = "Other"; break; + } + CERR << "Platform " << i << " (version: " << version << "):" << device_type + << " (Name" << ": " + << device.get_platform().get_info() << ")"; + } + } + + CERR << "...Platform-List-End"; + } }; template @@ -1101,16 +1139,17 @@ std::unique_ptr MakeSyclNetworkAuto( const std::optional& weights, const OptionsDict& options) { int gpu_id = options.GetOrDefault("gpu", 0); - try { - CERR << "Trying to switch to [sycl-fp16]..."; - dpct::has_capability_or_fail(dpct::dev_mgr::instance().get_device(gpu_id), - {sycl::aspect::fp16}); - CERR << "Switched to [sycl-fp16]..."; - return MakeSyclNetwork(weights, options); - } catch (std::exception& e) { + auto devices = sycl::device::get_devices(); + if (gpu_id >= devices.size()) { + throw Exception("Invalid GPU ID"); + } + CERR << "Trying to switch to [sycl-fp16]..."; + if (devices[gpu_id].has(sycl::aspect::fp16)) { + CERR << "Switched to [sycl-fp16]..."; + return MakeSyclNetwork(weights, options); + } else { CERR << "Device does not 
support sycl-fp16"; } - CERR << "Switched to [sycl]..."; return MakeSyclNetwork(weights, options); } diff --git a/src/neural/backends/sycl/sycl_common.h b/src/neural/backends/sycl/sycl_common.h index b0241da5e4..bbaee55645 100644 --- a/src/neural/backends/sycl/sycl_common.h +++ b/src/neural/backends/sycl/sycl_common.h @@ -22,8 +22,6 @@ #pragma once #include -#include "dpct/dpct.hpp" -#include "dpct/blas_utils.hpp" #include "utils/exception.h" diff --git a/src/neural/backends/sycl/winograd_helper.h b/src/neural/backends/sycl/winograd_helper.h index 184fa9b4df..175b925506 100644 --- a/src/neural/backends/sycl/winograd_helper.h +++ b/src/neural/backends/sycl/winograd_helper.h @@ -20,12 +20,12 @@ */ #include -#include "dpct/dpct.hpp" namespace lczero { namespace sycldnn_backend { -__dpct_inline__ float mishActivate(float el) { +[[gnu::always_inline]] +inline float mishActivate(float el) { auto e = sycl::native::exp(el); auto n = e * e + 2.0f * e; auto d = el / (n + 2.0f); @@ -35,7 +35,8 @@ __dpct_inline__ float mishActivate(float el) { return el - 2.0f * d; } } -__dpct_inline__ float activate(float cVal, ActivationFunction activation) { +[[gnu::always_inline]] +inline float activate(float cVal, ActivationFunction activation) { switch (activation) { case ACTIVATION_RELU: if (cVal < 0) cVal = 0; @@ -69,8 +70,8 @@ __dpct_inline__ float activate(float cVal, ActivationFunction activation) { } template -__dpct_inline__ void matrixMul_gpu_serial(T* c, const T* a, const T* b) { -#ifndef SKIP_FP16_BITS +[[gnu::always_inline]] +inline void matrixMul_gpu_serial(T* c, const T* a, const T* b) { #pragma unroll for (int i = 0; i < M; ++i) #pragma unroll @@ -80,11 +81,11 @@ __dpct_inline__ void matrixMul_gpu_serial(T* c, const T* a, const T* b) { for (int k = 0; k < K; ++k) S += a[i * K + k] * b[k * N + j]; c[i * N + j] = S; } -#endif } template -__dpct_inline__ void FilterTransform4x4(T* transformed_filter, +[[gnu::always_inline]] +inline void FilterTransform4x4(T* transformed_filter, const T* filter) { // transform applied to filter (of size 3x3) T G[6 * 3] = {1.0f / 4, 0, 0, -1.0f / 6, -1.0f / 6, @@ -102,7 +103,8 @@ __dpct_inline__ void FilterTransform4x4(T* transformed_filter, } template -__dpct_inline__ void InputTransform4x4(T* transformedInput, const T* input) { +[[gnu::always_inline]] +inline void InputTransform4x4(T* transformedInput, const T* input) { // transform applied to input tile (of size 4x4) const T Bt[6 * 6] = {4, 0, -5, 0, 1, 0, 0, -4, -4, 1, 1, 0, 0, 4, -4, -1, 1, 0, 0, -2, -1, 2, 1, 0, @@ -118,7 +120,8 @@ __dpct_inline__ void InputTransform4x4(T* transformedInput, const T* input) { } template -__dpct_inline__ void OutputTransform4x4(T* output, const T* transformedOutput) { +[[gnu::always_inline]] +inline void OutputTransform4x4(T* output, const T* transformedOutput) { // transform applied to result const T At[4 * 6] = {1, 1, 1, 1, 1, 0, 0, 1, -1, 2, -2, 0, 0, 1, 1, 4, 4, 0, 0, 1, -1, 8, -8, 1}; @@ -210,8 +213,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // top-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -229,8 +231,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // top-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -248,8 
+249,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // bottom-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -267,8 +267,7 @@ void InputTransform_kernel(int N, int C, const T* input, T* output, // bottom-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -301,7 +300,6 @@ void OutputTransform_kernel(int N, int C, int se_K, T* output, const T* w2, const T* b2, const sycl::nd_item<3> &item_ct1, float *shared_data) { -#ifndef SKIP_FP16_BITS const bool fp16 = std::is_same::value; int k = item_ct1.get_local_id(2); @@ -442,11 +440,11 @@ void OutputTransform_kernel(int N, int C, int se_K, T* output, *((sycl::uint4*)&board[h][4]); } } -#endif } // fast reduction for the warp -__dpct_inline__ float warpReduce(float x, const sycl::nd_item<3>& item_ct1) { +[[gnu::always_inline]] +inline float warpReduce(float x, const sycl::nd_item<3>& item_ct1) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) /* @@ -462,13 +460,14 @@ __dpct_inline__ float warpReduce(float x, const sycl::nd_item<3>& item_ct1) { device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of "32". */ - x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask); + x += sycl::permute_group_by_xor(item_ct1.get_sub_group(), x, mask); return x; } // fast max reduction for the warp -__dpct_inline__ float warpMax(float x, const sycl::nd_item<3>& item_ct1) { +[[gnu::always_inline]] +inline float warpMax(float x, const sycl::nd_item<3>& item_ct1) { #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) /* @@ -484,29 +483,16 @@ __dpct_inline__ float warpMax(float x, const sycl::nd_item<3>& item_ct1) { device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of "32". */ - x = sycl::max(x, (float)(dpct::permute_sub_group_by_xor( + x = sycl::max(x, (float)(sycl::permute_group_by_xor( item_ct1.get_sub_group(), x, mask))); return x; } -// atomic max implementation for floats -__dpct_inline__ float atomicMaxFloat(float* addr, float val) { - float max; - max = !sycl::signbit(val) - ? 
sycl::bit_cast(dpct::atomic_fetch_max< - sycl::access::address_space::generic_space>( - (int*)addr, sycl::bit_cast(val))) - : sycl::bit_cast(dpct::atomic_fetch_min< - sycl::access::address_space::generic_space>( - (unsigned int*)addr, sycl::bit_cast(val))); - - return max; -} - // Helper fuction to do vector loads/stores template -__dpct_inline__ void copyAs(void* dst, const void* src) { +[[gnu::always_inline]] +inline void copyAs(void* dst, const void* src) { *((T*)(dst)) = *((const T*)(src)); } @@ -530,7 +516,6 @@ void OutputTransform_SE_relu_InputTransform_kernel( const T* w1, const T* b1, const T* w2, const T* b2, const sycl::nd_item<3>& item_ct1, float* shared_data, sycl::local_accessor shared_sums) { -#ifndef SKIP_FP16_BITS const bool fp16 = std::is_same::value; int k = item_ct1.get_local_id(2); @@ -671,8 +656,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( int c = k; // top-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -690,8 +674,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( // top-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -709,8 +692,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( // bottom-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -728,8 +710,7 @@ void OutputTransform_SE_relu_InputTransform_kernel( // bottom-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -744,7 +725,6 @@ void OutputTransform_SE_relu_InputTransform_kernel( for (int x = 0; x < 6; x++) output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x]; } -#endif } constexpr int kOpInpTransformBlockSize = 64; @@ -760,7 +740,6 @@ register pressure. 
void OutputTransform_relu_InputTransform_kernel( int N, int C, T* output, const T* input, T* skip, const T* bias, const sycl::nd_item<3>& item_ct1) { -#ifndef SKIP_FP16_BITS const bool fp16 = std::is_same::value; int k = item_ct1.get_local_id(2) + @@ -838,8 +817,7 @@ void OutputTransform_relu_InputTransform_kernel( int c = k; // top-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -857,8 +835,7 @@ void OutputTransform_relu_InputTransform_kernel( // top-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -876,8 +853,7 @@ void OutputTransform_relu_InputTransform_kernel( // bottom-left { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -895,8 +871,7 @@ void OutputTransform_relu_InputTransform_kernel( // bottom-right { - T inEl[6][6] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + T inEl[6][6] = {}; #pragma unroll for (int i = 0; i < 5; i++) @@ -911,7 +886,6 @@ void OutputTransform_relu_InputTransform_kernel( for (int x = 0; x < 6; x++) output[TEMP_INDEX_HWNC(y, x, n * 4 + 3, c)] = inEl[y][x]; } -#endif } template diff --git a/src/neural/backends/xla/network_xla.cc b/src/neural/backends/xla/network_xla.cc index 1d0cad0a71..336f03ef25 100644 --- a/src/neural/backends/xla/network_xla.cc +++ b/src/neural/backends/xla/network_xla.cc @@ -81,10 +81,10 @@ class XlaNetwork : public Network { return std::make_unique(this); } int GetMiniBatchSize() const override { - // 32 is the default prefetch size, subtract it so that backend doesn't - // crash. - // TODO make it better when we have a proper way to query the batch size. - return runner_->GetMaxBatchSize() - 32; + return runner_->GetMaxBatchSize(); + } + int GetPreferredBatchStep() const override { + return runner_->GetPreferredBatchStep(); } private: diff --git a/src/neural/backends/xla/xla_runner.cc b/src/neural/backends/xla/xla_runner.cc index 35b893dc67..0adac0dbd0 100644 --- a/src/neural/backends/xla/xla_runner.cc +++ b/src/neural/backends/xla/xla_runner.cc @@ -170,6 +170,7 @@ void XlaRunner::SetFrozenInputs( } size_t XlaRunner::GetMaxBatchSize() const { return executables_.back().first; } +size_t XlaRunner::GetPreferredBatchStep() const { return executables_.front().first; } std::vector> XlaRunner::ExecuteBlocking( const std::vector& inputs) { diff --git a/src/neural/backends/xla/xla_runner.h b/src/neural/backends/xla/xla_runner.h index 931571362e..4c8f71c374 100644 --- a/src/neural/backends/xla/xla_runner.h +++ b/src/neural/backends/xla/xla_runner.h @@ -34,8 +34,8 @@ #include #include "neural/backends/xla/pjrt.h" -#include "neural/xla/hlo.pb.h" #include "neural/xla/xla_tensor.h" +#include "proto/hlo.pb.h" namespace lczero { @@ -60,6 +60,7 @@ class XlaRunner { // Maximum supported batch size. It's expected that the capacity (not size) of // the input tensors would be able to fit this size. 
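Illustrative sketch (not part of the patch): one plausible way a caller could combine the runner's GetMaxBatchSize with the new GetPreferredBatchStep hint, rounding a requested batch up to the step while staying within the maximum; the function name is made up.

    #include <algorithm>

    int RoundBatchToStep(int requested, int preferred_step, int max_batch) {
      const int rounded =
          (requested + preferred_step - 1) / preferred_step * preferred_step;
      return std::min(rounded, max_batch);
    }
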
   size_t GetMaxBatchSize() const;
+  size_t GetPreferredBatchStep() const;
 
  private:
   std::unique_ptr pjrt_client_;
diff --git a/src/neural/factory.cc b/src/neural/factory.cc
index a458a3b9e7..778fb47c90 100644
--- a/src/neural/factory.cc
+++ b/src/neural/factory.cc
@@ -29,6 +29,7 @@
 
 #include
 
+#include "default_backend.h"
 #include "neural/loader.h"
 #include "neural/shared_params.h"
 #include "utils/commandline.h"
@@ -54,7 +55,15 @@ void NetworkFactory::RegisterNetwork(const std::string& name,
 
 std::vector NetworkFactory::GetBackendsList() const {
   std::vector result;
-  for (const auto& x : factories_) result.emplace_back(x.name);
+#ifdef DEFAULT_BACKEND
+  result.emplace_back(DEFAULT_BACKEND);
+#endif
+  for (const auto& x : factories_) {
+#ifdef DEFAULT_BACKEND
+    if (x.name == result[0]) continue;
+#endif
+    result.emplace_back(x.name);
+  }
   return result;
 }
 
diff --git a/src/neural/factory.h b/src/neural/factory.h
index a3221ed902..fb52c29bb1 100644
--- a/src/neural/factory.h
+++ b/src/neural/factory.h
@@ -108,23 +108,23 @@ class NetworkFactory {
   friend class Register;
 };
 
-#define REGISTER_NETWORK_WITH_COUNTER2(name, func, priority, counter) \
-  namespace { \
-  namespace ns##counter { \
-  static NetworkFactory::Register regH38fhs##counter( \
-      name, \
-      [](const std::optional& w, const OptionsDict& o) { \
-        return func(w, o); \
-      }, \
-      priority); \
-  static BackendManager::Register regK03nv##counter( \
-      std::make_unique( \
-          name, \
-          [](const std::optional& w, const OptionsDict& o) { \
-            return func(w, o); \
-          }, \
-          priority)); \
-  } \
+#define REGISTER_NETWORK_WITH_COUNTER2(name, func, priority, counter) \
+  namespace { \
+  namespace ns##counter { \
+  [[maybe_unused]] static NetworkFactory::Register regH38fhs##counter( \
+      name, \
+      [](const std::optional& w, const OptionsDict& o) { \
+        return func(w, o); \
+      }, \
+      priority); \
+  [[maybe_unused]] static BackendManager::Register regK03nv##counter( \
+      std::make_unique( \
+          name, \
+          [](const std::optional& w, const OptionsDict& o) { \
+            return func(w, o); \
+          }, \
+          priority)); \
+  } \
   }
 
 #define REGISTER_NETWORK_WITH_COUNTER(name, func, priority, counter) \
diff --git a/src/neural/loader.cc b/src/neural/loader.cc
index 5b5840edf8..9b706cabed 100644
--- a/src/neural/loader.cc
+++ b/src/neural/loader.cc
@@ -182,7 +182,8 @@ WeightsFile ParseWeightsProto(const std::string& buffer) {
   }
 
   if (net.has_weights() &&
-      net.format().weights_encoding() != pblczero::Format::LINEAR16) {
+      net.format().weights_encoding() != pblczero::Format::LINEAR16 &&
+      net_ver < GetVersionInt(0, 33, 0)) {
     throw Exception("Invalid weight file: unsupported encoding.");
   }
 
diff --git a/src/neural/network.h b/src/neural/network.h
index b46e63a745..becf424427 100644
--- a/src/neural/network.h
+++ b/src/neural/network.h
@@ -121,6 +121,7 @@ class Network {
   virtual void InitThread(int /*id*/) {}
   virtual bool IsCpu() const { return false; }
   virtual int GetMiniBatchSize() const { return 256; }
+  virtual int GetPreferredBatchStep() const { return 1; }
 
   virtual ~Network() = default;
 };
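The new Network::GetPreferredBatchStep() (default 1) complements GetMiniBatchSize(): the XLA backend reports its smallest compiled executable size as the step and its largest as the maximum. The patch only adds the getters; a hypothetical caller-side use, not taken from this change, might round a requested batch up to a supported size:

#include <algorithm>

// Hypothetical helper: round a requested batch up to a multiple of the
// preferred step, capped at the backend's maximum batch size.
int ChooseBatch(int requested, int preferred_step, int max_batch) {
  const int rounded =
      ((requested + preferred_step - 1) / preferred_step) * preferred_step;
  return std::min(rounded, max_batch);
}
// ChooseBatch(300, 256, 1024) == 512; ChooseBatch(1500, 256, 1024) == 1024.
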
diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc
index 53846353c6..8c54b64973 100644
--- a/src/neural/network_legacy.cc
+++ b/src/neural/network_legacy.cc
@@ -142,7 +142,11 @@ BaseWeights::MHA::MHA(const pblczero::Weights::MHA& mha)
       dense_w(LayerAdapter(mha.dense_w()).as_vector()),
       dense_b(LayerAdapter(mha.dense_b()).as_vector()),
       smolgen(Smolgen(mha.smolgen())),
-      has_smolgen(mha.has_smolgen()) {}
+      has_smolgen(mha.has_smolgen()) {
+  if (mha.has_rpe_q() || mha.has_rpe_k() || mha.has_rpe_v()) {
+    throw Exception("RPE weights file not supported.");
+  }
+}
 
 BaseWeights::FFN::FFN(const pblczero::Weights::FFN& ffn)
     : dense1_w(LayerAdapter(ffn.dense1_w()).as_vector()),
diff --git a/src/neural/onnx/adapters.h b/src/neural/onnx/adapters.h
index e83a9385a7..fc04096d9f 100644
--- a/src/neural/onnx/adapters.h
+++ b/src/neural/onnx/adapters.h
@@ -30,8 +30,8 @@
 #include
 
 #include "neural/onnx/builder.h"
-#include "neural/onnx/onnx.pb.h"
 #include "proto/net.pb.h"
+#include "proto/onnx.pb.h"
 #include "utils/weights_adapter.h"
 
 namespace lczero {
diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc
index fe09d5cb1c..94b7db650a 100644
--- a/src/neural/onnx/builder.cc
+++ b/src/neural/onnx/builder.cc
@@ -30,24 +30,30 @@
 #include
 
 #include "neural/onnx/adapters.h"
-#include "neural/onnx/onnx.pb.h"
 #include "utils/exception.h"
-#include "utils/random.h"
 #include "version.h"
 
 namespace lczero {
 
-OnnxBuilder::OnnxBuilder(int opset) : opset_(opset) {
+OnnxBuilder::OnnxBuilder(int opset, int ir) : opset_(opset) {
   if (opset < 7 || opset > 22) {
     throw Exception("Only ONNX opsets between 7 and 22 are supported.");
   }
-  model_.set_ir_version(4);
+  // Map of latest opset corresponding to IR version.
+  std::map<int, int> opset_to_ir = {{8, 3},  {9, 4},   {10, 5},
+                                    {11, 6}, {14, 7},  {18, 8},
+                                    {20, 9}, {22, 10}, {99, 11}};
+  if (ir < 0) ir = opset_to_ir.upper_bound(opset - 1)->second;
+  if (ir < 3 || ir > 10) {
+    throw Exception("Only ONNX IR between 3 and 10 is supported.");
+  }
+  model_.set_ir_version(ir);
   model_.set_domain("org.lczero.models.*");
   model_.set_producer_name("Lc0");
   model_.set_producer_version(GetVersionStr());
   model_.add_opset_import()->set_version(opset);
-  model_.mutable_graph()->set_name("org.lczero/converted/" +
-                                   Random::Get().GetString(16));
+  // TODO change to real network name when it becomes available.
+  model_.mutable_graph()->set_name("org.lczero/converted");
 }
 
 namespace {
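In the OnnxBuilder constructor above, `opset_to_ir.upper_bound(opset - 1)` returns the first table key that is greater than or equal to the requested opset, so each opset maps to the IR version of the entry that covers it. A standalone sketch of the same lookup, using the table from the hunk:

#include <cassert>
#include <map>

int IrForOpset(int opset) {
  // Same table as in OnnxBuilder: key = latest opset covered by that IR version.
  std::map<int, int> opset_to_ir = {{8, 3},  {9, 4},   {10, 5}, {11, 6}, {14, 7},
                                    {18, 8}, {20, 9},  {22, 10}, {99, 11}};
  return opset_to_ir.upper_bound(opset - 1)->second;
}

int main() {
  assert(IrForOpset(7) == 3);    // opsets up to 8 map to IR 3
  assert(IrForOpset(17) == 8);   // first key >= 17 is 18, so IR 8
  assert(IrForOpset(22) == 10);  // exact boundary
  return 0;
}
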
diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h
index 4ada3c37f7..7fa7323306 100644
--- a/src/neural/onnx/builder.h
+++ b/src/neural/onnx/builder.h
@@ -30,7 +30,7 @@
 #include
 #include
 
-#include "neural/onnx/onnx.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
@@ -45,7 +45,7 @@ class OnnxConst {
 // Builds Onnx::ModelProto.
 class OnnxBuilder {
  public:
-  OnnxBuilder(int opset);
+  OnnxBuilder(int opset, int ir = -1);
   void AddInput(const std::string& name, std::initializer_list dims,
                 pblczero::TensorProto::DataType datatype);
   void AddOutput(const std::string& name, std::initializer_list dims,
diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc
index 07986d4ef6..c59069842a 100644
--- a/src/neural/onnx/converter.cc
+++ b/src/neural/onnx/converter.cc
@@ -250,10 +250,10 @@ std::string Converter::EndOptionalBf16Fix(OnnxBuilder* builder,
 
 std::string Converter::MakeMish(OnnxBuilder* builder, const std::string& input,
                                 const std::string& name) {
-  if (!options_.alt_mish || options_.opset < 9) {
+  if (!options_.alt_mish) {
     std::string flow = input;
     flow = StartOptionalBf16Fix(builder, flow, name);
-    if (options_.opset >= 18) {
+    if (options_.opset >= 18 && options_.real_mish) {
       flow = builder->Mish(name, flow);
       return EndOptionalBf16Fix(builder, flow, name);
     }
@@ -263,29 +263,14 @@ std::string Converter::MakeMish(OnnxBuilder* builder, const std::string& input,
     return builder->Mul(name, flow, input);
   } else {
     auto in = input;
-    if (options_.data_type !=
-        WeightsToOnnxConverterOptions::DataType::kFloat32) {
-      in = builder->Cast(name + "/to_float", in,
-                         pblczero::TensorProto::FLOAT);
-    }
-    const OnnxConst& two =
-        static_cast(FloatOnnxConst({2.0f}, {1}));
-    const OnnxConst& zero =
-        static_cast(FloatOnnxConst({0.0f}, {1}));
-    auto e = builder->Exp(name + "/exp", in);
+    auto one = builder->AddInitializer(name + "/one", *GetScalarConverter(1));
+    auto two = builder->AddInitializer(name + "/two", *GetScalarConverter(2));
+    auto e = builder->Exp(name + "/e", in);
     auto flow = builder->Add(name + "/e+2", e, two);
-    auto n = builder->Mul(name + "/n", e, flow);
-    flow = builder->Add(name + "/n+2", n, two);
-    auto d = builder->Div(name + "/d", in, flow);
-    auto f = builder->Mul(name + "/n*d", n, d);
-    flow = builder->Mul(name + "/2*d", d, two);
-    auto t = builder->Sub(name + "/in-2*d", in, flow);
-    flow = builder->Greater(name + "/compare", in, zero);
-    flow = builder->Where(name, flow, t, f);
-    if (options_.data_type !=
-        WeightsToOnnxConverterOptions::DataType::kFloat32) {
-      flow = builder->Cast(name + "/to_data_type", flow, GetDataType());
-    }
+    flow = builder->Mul(name + "/e*e+2e", e, flow);
+    flow = builder->Div(name + "/2/(e*e+2e)", two, flow);
+    flow = builder->Add(name + "/1+2/(e*e+2e)", flow, one);
+    flow = builder->Div(name + "/in/(1+2/(e*e+2e))", in, flow);
     return flow;
   }
 }
@@ -758,7 +743,7 @@ std::string Converter::MakeAttentionBody(OnnxBuilder* builder,
   if (weights.ip_mult_gate.size() > 0 || weights.ip_add_gate.size() > 0) {
     flow = builder->Reshape(
-        "/attn_body/ma_gating/rehape1", flow,
+        "/attn_body/ma_gating/rehape", flow,
         builder->AddInitializer("/const/ma_gating/shape1",
                                 Int64OnnxConst({-1, 64, embedding_size}, {3})));
     if (weights.ip_mult_gate.size() > 0) {
@@ -771,17 +756,23 @@ std::string Converter::MakeAttentionBody(OnnxBuilder* builder,
           *GetWeghtsConverter(weights.ip_add_gate, {64, embedding_size},
                               {1, 0}));
     }
-    flow = builder->Reshape(
-        "/attn_body/ma_gating/rehape2", flow,
-        builder->AddInitializer("/const/ma_gating/shape2",
-                                Int64OnnxConst({-1, embedding_size}, {2})));
   }
+  flow = builder->Reshape(
+      "/attn_body/rehape", flow,
+      builder->AddInitializer("/const/ma_gating/shape2",
+                              Int64OnnxConst({-1, embedding_size}, {2})));
+
   float alpha = std::pow(2.0f * NumEncBlocks(), -0.25f);
   if (input_embedding == network_format::INPUT_EMBEDDING_PE_DENSE) {
-    flow = MakeFFN(builder, weights.ip_emb_ffn, embedding_size, flow,
-                   "/attn_body", default_activation_, alpha);
+    const auto ffn_activation = static_cast(
+        src_.format().network_format().ffn_activation());
+    flow =
+        MakeFFN(builder, weights.ip_emb_ffn, embedding_size, flow, "/attn_body",
+                ffn_activation == ACTIVATION_DEFAULT ? default_activation_
+                                                     : ffn_activation,
+                alpha);
     flow = MakeLayerNorm(
         builder, flow, "/attn_body/ln2",
         *GetWeghtsConverter(weights.ip_emb_ffn_ln_gammas, {embedding_size}),
@@ -921,7 +912,7 @@ void Converter::MakePolicyHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
                                const std::string& input,
                                const MultiHeadWeights& weights) {
   // Check that selected policy head exists.
-  if (weights.policy_heads.count(options_.policy_head) == 0) {
+  if (!weights.policy_heads.contains(options_.policy_head)) {
     throw Exception("The policy head you specified '" + options_.policy_head +
                     "'" + " does not exist in this net.");
   }
@@ -989,7 +980,7 @@ void Converter::MakeValueHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
                              const std::string& input,
                               const MultiHeadWeights& weights) {
   // Check that selected value head exists.
-  if (weights.value_heads.count(options_.value_head) == 0) {
+  if (!weights.value_heads.contains(options_.value_head)) {
     throw Exception("The value head you specified '" + options_.value_head +
                     "'" + " does not exist in this net.");
   }
@@ -1033,9 +1024,11 @@ void Converter::MakeValueHead(pblczero::OnnxModel* onnx, OnnxBuilder* builder,
         *GetWeghtsConverter(head.ip2_val_w, {128, 3}, {1, 0}));
     flow = builder->Add("/value/dense2/add", flow,
                         *GetWeghtsConverter(head.ip2_val_b, {3}));
-    auto output = builder->Softmax(options_.output_wdl, flow);
-    builder->AddOutput(output, {options_.batch_size, 3}, GetDataType());
-    onnx->set_output_wdl(output);
+    if (!options_.no_wdl_softmax) {
+      flow = builder->Softmax(options_.output_wdl, flow);
+    }
+    builder->AddOutput(flow, {options_.batch_size, 3}, GetDataType());
+    onnx->set_output_wdl(flow);
   } else {
     flow = builder->MatMul("/value/dense2/matmul", flow,
@@ -1092,15 +1085,15 @@ void Converter::MakeMovesLeftHead(pblczero::OnnxModel* onnx,
       *GetWeghtsConverter(weights.ip2_mov_w, {mlh_fc1_outputs, 1}, {1, 0}));
   flow = builder->Add("/mlh/dense2/add", flow,
                       *GetWeghtsConverter(weights.ip2_mov_b, {1}));
-  flow = MakeActivation(builder, flow, "/mlh/dense2", default_activation_);
-  auto output = builder->Identity(options_.output_mlh, flow);
+  // Explicit ReLU activation.
+  auto output = builder->Relu(options_.output_mlh, flow);
   builder->AddOutput(output, {options_.batch_size, 1}, GetDataType());
   onnx->set_output_mlh(output);
 }
 
 void Converter::GenerateOnnx(pblczero::OnnxModel* onnx) {
   MultiHeadWeights weights(src_.weights());
-  OnnxBuilder builder(options_.opset);
+  OnnxBuilder builder(options_.opset, options_.ir);
 
   if (GetDataType() == pblczero::TensorProto::FLOAT16) {
     onnx->set_data_type(pblczero::OnnxModel::FLOAT16);
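The rewritten alt_mish graph (Exp, Add 2, Mul, Div, Add 1, Div) is an exact algebraic form rather than a piecewise approximation: mish(x) = x * tanh(softplus(x)) = x / (1 + 2 / (e^x * e^x + 2 * e^x)). A quick numerical check of that identity, independent of the converter:

#include <cassert>
#include <cmath>

double MishReference(double x) { return x * std::tanh(std::log1p(std::exp(x))); }

double MishRewritten(double x) {
  // Mirrors the ONNX graph built above: e = exp(x); x / (1 + 2 / (e*e + 2*e)).
  double e = std::exp(x);
  return x / (1.0 + 2.0 / (e * e + 2.0 * e));
}

int main() {
  for (double x = -6.0; x <= 6.0; x += 0.25) {
    assert(std::fabs(MishReference(x) - MishRewritten(x)) < 1e-9);
  }
  return 0;
}
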
diff --git a/src/neural/onnx/converter.h b/src/neural/onnx/converter.h
index 632f65c94b..e6c768aad9 100644
--- a/src/neural/onnx/converter.h
+++ b/src/neural/onnx/converter.h
@@ -29,8 +29,8 @@
 
 #include
 
-#include "neural/onnx/onnx.pb.h"
 #include "proto/net.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
@@ -45,9 +45,12 @@ struct WeightsToOnnxConverterOptions {
   std::string output_mlh = "/output/mlh";
   int batch_size = -1;
   int opset = 17;
-  bool alt_mish = false;  // Use "Mish" approximation (fp32 only).
+  int ir = -1;            // ONNX IR, -1 for auto.
+  bool alt_mish = false;  // Use "Mish" approximation.
+  bool real_mish = true;  // Use "Mish" operator (opset 18+ and !alt_mish).
   bool alt_layernorm = false;  // Discrete "LayerNormalization" implementation.
   bool no_shape = false;       // Avoid use of "Shape" operator.
+  bool no_wdl_softmax = false;  // Skip wdl softmax.
   std::string policy_head = "vanilla";
   std::string value_head = "winner";
 
diff --git a/src/neural/register.cc b/src/neural/register.cc
index 3f37d2a9ce..2a61e99352 100644
--- a/src/neural/register.cc
+++ b/src/neural/register.cc
@@ -29,6 +29,7 @@
 
 #include
 
+#include "default_backend.h"
 #include "neural/shared_params.h"
 
 namespace lczero {
@@ -52,6 +53,12 @@ std::vector BackendManager::GetBackendNames() const {
   std::transform(priority_and_names.begin(), priority_and_names.end(),
                  std::back_inserter(result),
                  [](const std::pair& p) { return p.second; });
+#ifdef DEFAULT_BACKEND
+  std::string name = DEFAULT_BACKEND;
+  auto pos = std::find(result.begin(), result.end(), name);
+  if (pos == result.end()) throw Exception("Unknown backend: " + name);
+  std::rotate(result.begin(), pos, pos + 1);
+#endif
   return result;
 }
 
diff --git a/src/neural/register.h b/src/neural/register.h
index fda3cdda8f..db30ef2a52 100644
--- a/src/neural/register.h
+++ b/src/neural/register.h
@@ -71,9 +71,9 @@ class BackendManager {
   std::vector> algorithms_;
 };
 
-#define REGISTER_BACKEND(factory) \
-  namespace { \
-  static SearchFactory::Register reg29c93##factory( \
-      std::make_unique()); \
+#define REGISTER_BACKEND(factory) \
+  namespace { \
+  [[maybe_unused]] static SearchFactory::Register reg29c93##factory( \
+      std::make_unique()); \
   }
 
 }  // namespace lczero
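In GetBackendNames() above, std::rotate moves the compile-time DEFAULT_BACKEND to the front of the priority-sorted list while preserving the relative order of the remaining backends. A behaviour sketch with made-up backend names:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> result = {"cuda-auto", "onnx-trt", "blas", "eigen"};
  const std::string name = "blas";  // stand-in for DEFAULT_BACKEND
  auto pos = std::find(result.begin(), result.end(), name);
  assert(pos != result.end());
  // Rotate the range [begin, pos + 1) so that *pos becomes the first element.
  std::rotate(result.begin(), pos, pos + 1);
  assert((result ==
          std::vector<std::string>{"blas", "cuda-auto", "onnx-trt", "eigen"}));
  return 0;
}
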
diff --git a/src/neural/wrapper.cc b/src/neural/wrapper.cc
index 2d935f231a..11d9dac78f 100644
--- a/src/neural/wrapper.cc
+++ b/src/neural/wrapper.cc
@@ -34,6 +34,7 @@
 #include "neural/shared_params.h"
 #include "utils/atomic_vector.h"
 #include "utils/fastmath.h"
+#include "utils/trace.h"
 
 namespace lczero {
 namespace {
@@ -121,6 +122,7 @@ class NetworkAsBackendComputation : public BackendComputation {
   void ComputeBlocking() override {
     for (auto& entry : entries_) computation_->AddInput(std::move(entry.input));
     computation_->ComputeBlocking();
+    LCTRACE_FUNCTION_SCOPE;
     for (size_t i = 0; i < entries_.size(); ++i) {
       const EvalResultPtr& result = entries_[i].result;
       if (result.q) *result.q = computation_->GetQVal(i);
@@ -132,6 +134,7 @@ class NetworkAsBackendComputation : public BackendComputation {
   void SoftmaxPolicy(std::span dst,
                      const NetworkComputation* computation, int idx) {
+    LCTRACE_FUNCTION_SCOPE;
     const std::vector& moves = entries_[idx].legal_moves;
     const int transform = entries_[idx].transform;
     // Copy the values to the destination array and compute the maximum.
@@ -192,4 +195,4 @@ std::unique_ptr NetworkAsBackendFactory::Create(
   return std::make_unique(std::move(network), options);
 }
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/hlo_builder.cc b/src/neural/xla/hlo_builder.cc
index 873d2d50a8..d018eb5deb 100644
--- a/src/neural/xla/hlo_builder.cc
+++ b/src/neural/xla/hlo_builder.cc
@@ -536,7 +536,7 @@ std::optional HloBuilder::GetComputationId(
 HloComputation HloBuilder::AddComputation(std::string_view name,
                                           const HloBuilder& builder) {
   std::unordered_map id_map;
-  if (computation_names_.count(std::string(name))) {
+  if (computation_names_.contains(std::string(name))) {
     throw Exception("Computation with name " + std::string(name) +
                     " already exists");
   }
diff --git a/src/neural/xla/hlo_builder.h b/src/neural/xla/hlo_builder.h
index 1211446765..652ccd6326 100644
--- a/src/neural/xla/hlo_builder.h
+++ b/src/neural/xla/hlo_builder.h
@@ -32,7 +32,7 @@
 #include
 #include
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 #include "utils/logging.h"
 
 namespace lczero {
@@ -187,4 +187,4 @@ class HloContext {
   pblczero::XlaOpMetadata saved_metadata_;
 };
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/onnx2hlo.cc b/src/neural/xla/onnx2hlo.cc
index 017618e155..c6211ca2ba 100644
--- a/src/neural/xla/onnx2hlo.cc
+++ b/src/neural/xla/onnx2hlo.cc
@@ -32,8 +32,6 @@
 #include
 #include
 
-#include "neural/onnx/onnx.pb.h"
-#include "neural/xla/hlo.pb.h"
 #include "neural/xla/hlo_builder.h"
 #include "neural/xla/print_hlo.h"
 #include "utils/bf16_utils.h"
@@ -660,7 +658,7 @@ class Onnx2HloConverter {
   bool AllInputsConstant(const pblczero::NodeProto& node) {
     for (const auto& input : node.input()) {
       const std::string name(input);
-      if (initializers_.count(name)) continue;
+      if (initializers_.contains(name)) continue;
       if (auto iter = onnx_name_to_hlo_flow_.find(name);
           iter != onnx_name_to_hlo_flow_.end() &&
          iter->second->opcode() == "constant") {
@@ -1746,4 +1744,4 @@ std::unique_ptr OnnxTensorToXlaTensor(
                                            onnx_tensor.raw_data());
 }
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/onnx2hlo.h b/src/neural/xla/onnx2hlo.h
index e06436915d..ddc7bd16df 100644
--- a/src/neural/xla/onnx2hlo.h
+++ b/src/neural/xla/onnx2hlo.h
@@ -31,9 +31,9 @@
 #include
 #include
 
-#include "neural/onnx/onnx.pb.h"
-#include "neural/xla/hlo.pb.h"
 #include "neural/xla/xla_tensor.h"
+#include "proto/hlo.pb.h"
+#include "proto/onnx.pb.h"
 
 namespace lczero {
 
diff --git a/src/neural/xla/print_hlo.h b/src/neural/xla/print_hlo.h
index e906bbe346..c7db16cbfa 100644
--- a/src/neural/xla/print_hlo.h
+++ b/src/neural/xla/print_hlo.h
@@ -29,7 +29,7 @@
 
 #include
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 
 namespace lczero {
 
@@ -43,4 +43,4 @@ struct PrettyPrintHloOptions {
 void PrettyPrintHlo(const pblczero::HloModuleProto& module,
                     PrettyPrintHloOptions options, std::ostream& stream);
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/neural/xla/xla_tensor.h b/src/neural/xla/xla_tensor.h
index b49766b9cf..43f9899522 100644
--- a/src/neural/xla/xla_tensor.h
+++ b/src/neural/xla/xla_tensor.h
@@ -33,7 +33,7 @@
 #include
 #include
 
-#include "neural/xla/hlo.pb.h"
+#include "proto/hlo.pb.h"
 #include "utils/exception.h"
 
 namespace lczero {
@@ -136,4 +136,4 @@ class XlaMutableTensor : public XlaTensor {
   std::unique_ptr data_;
 };
 
-}  // namespace lczero
\ No newline at end of file
+}  // namespace lczero
diff --git a/src/python/weights.h b/src/python/weights.h
index 86dd805f19..53ad0968e4 100644
--- a/src/python/weights.h
+++ b/src/python/weights.h
@@ -235,7 +235,8 @@ class Backend {
 class GameState {
  public:
   GameState(const std::optional startpos,
-            const std::vector& moves) {
+            const std::vector& moves,
+            const bool is_c960) : is_c960_(is_c960) {
     ChessBoard starting_board;
     int no_capture_ply;
     int full_moves;
@@ -246,11 +247,16 @@ class GameState {
                     full_moves * 2 - (starting_board.flipped() ? 1 : 2));
 
     for (const auto& m : moves) {
-      Move move(m, history_.IsBlackToMove());
+      auto board = history_.Last().GetBoard();
+      Move move = board.ParseMove(m);
       history_.Append(move);
     }
   }
 
+  GameState(const std::optional startpos,
+            const std::vector& moves)
+      : GameState(startpos, moves, false) {}
+
   std::unique_ptr as_input(const Backend& backend) const {
     int tmp;
     return std::make_unique(
@@ -264,8 +270,8 @@ class GameState {
     bool is_black = history_.IsBlackToMove();
     std::vector result;
     for (auto m : ms) {
-      if (is_black) m.Mirror();
-      result.push_back(m.as_string());
+      if (is_black) m.Flip();
+      result.push_back(m.ToString(is_c960_));
     }
     return result;
   }
@@ -274,7 +280,7 @@ class GameState {
     auto ms = history_.Last().GetBoard().GenerateLegalMoves();
     std::vector result;
     for (auto m : ms) {
-      result.push_back(m.as_nn_index(/* transform= */ 0));
+      result.push_back(MoveToNNIndex(m, /* transform= */ 0));
     }
     return result;
   }
@@ -287,6 +293,7 @@ class GameState {
 
  private:
   PositionHistory history_;
+  bool is_c960_;
 };
 
 }  // namespace python
diff --git a/src/search/classic/params.cc b/src/search/classic/params.cc
index 0caff4fde3..e61b0f9c88 100644
--- a/src/search/classic/params.cc
+++ b/src/search/classic/params.cc
@@ -525,6 +525,10 @@ const OptionId BaseSearchParams::kUCIRatingAdvId{
 const OptionId BaseSearchParams::kSearchSpinBackoffId{
     "search-spin-backoff", "SearchSpinBackoff",
     "Enable backoff for the spin lock that acquires available searcher."};
+const OptionId BaseSearchParams::kGarbageCollectionDelayId{
+    "garbage-collection-delay", "GarbageCollectionDelay",
+    "The percentage of expected move time until garbage collection starts. "
+    "The delay lets the search find transpositions to freed search tree branches."};
 
 const OptionId SearchParams::kMaxPrefetchBatchId{
     "max-prefetch", "MaxPrefetch",
@@ -626,6 +630,7 @@ void BaseSearchParams::Populate(OptionsParser* options) {
   options->Add(kUCIOpponentId);
   options->Add(kUCIRatingAdvId, -10000.0f, 10000.0f) = 0.0f;
   options->Add(kSearchSpinBackoffId) = false;
+  options->Add(kGarbageCollectionDelayId, 0.0f, 100.0f) = 10.0f;
 }
 
 void SearchParams::Populate(OptionsParser* options) {
@@ -719,7 +724,8 @@ BaseSearchParams::BaseSearchParams(const OptionsDict& options)
           options.Get(kMaxCollisionVisitsScalingEndId)),
       kMaxCollisionVisitsScalingPower(
          options.Get(kMaxCollisionVisitsScalingPowerId)),
-      kSearchSpinBackoff(options_.Get(kSearchSpinBackoffId)) {}
+      kSearchSpinBackoff(options_.Get(kSearchSpinBackoffId)),
+      kGarbageCollectionDelay(options_.Get(kGarbageCollectionDelayId)) {}
 
 SearchParams::SearchParams(const OptionsDict& options)
     : BaseSearchParams(options),
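GarbageCollectionDelay is stored as a percentage of the expected move time (default 10, range 0-100). The consumer of the option is not part of these hunks, so purely as an illustration of the unit, with a hypothetical helper name, converting it to an absolute delay might look like:

#include <chrono>
#include <cstdint>

// Illustrative only: helper and call site are hypothetical, not from this patch.
std::chrono::milliseconds GcDelay(std::chrono::milliseconds expected_move_time,
                                  float gc_delay_percent) {
  return std::chrono::milliseconds(static_cast<int64_t>(
      expected_move_time.count() * gc_delay_percent / 100.0f));
}
// e.g. a 12000 ms expected move time at the default 10% means garbage
// collection starts 1200 ms into the move.
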
diff --git a/src/search/classic/params.h b/src/search/classic/params.h
index 679e3fe4ec..d84dbad5d8 100644
--- a/src/search/classic/params.h
+++ b/src/search/classic/params.h
@@ -159,6 +159,10 @@ class BaseSearchParams {
   }
   bool GetSearchSpinBackoff() const { return kSearchSpinBackoff; }
 
+  float GetGarbageCollectionDelay() const {
+    return kGarbageCollectionDelay;
+  }
+
   // Search parameter IDs.
   static const OptionId kMiniBatchSizeId;
   static const OptionId kCpuctId;
@@ -226,6 +230,7 @@ class BaseSearchParams {
   static const OptionId kUCIOpponentId;
   static const OptionId kUCIRatingAdvId;
   static const OptionId kSearchSpinBackoffId;
+  static const OptionId kGarbageCollectionDelayId;
 
 protected:
   const OptionsDict& options_;
@@ -284,6 +289,7 @@ class BaseSearchParams {
   const int kMaxCollisionVisitsScalingEnd;
   const float kMaxCollisionVisitsScalingPower;
   const bool kSearchSpinBackoff;
+  const float kGarbageCollectionDelay;
 };
 
 class SearchParams : public BaseSearchParams {
diff --git a/src/search/classic/search.cc b/src/search/classic/search.cc
index f38395a38f..7a4d1c7deb 100644
--- a/src/search/classic/search.cc
+++ b/src/search/classic/search.cc
@@ -29,7 +29,6 @@
 
 #include
 #include
-#include
 #include
 #include
 #include
@@ -42,6 +41,7 @@
 #include "utils/fastmath.h"
 #include "utils/random.h"
 #include "utils/spinhelper.h"
+#include "utils/trace.h"
 
 namespace lczero {
 namespace classic {
@@ -282,6 +282,7 @@ void Search::SendUciInfo() REQUIRES(nodes_mutex_) REQUIRES(counters_mutex_) {
             .count();
     if (time_since_first_batch_ms > 0) {
       common_info.nps = total_playouts_ * 1000 / time_since_first_batch_ms;
+      common_info.eps = network_evaluations_ * 1000 / time_since_first_batch_ms;
     }
   }
   common_info.tb_hits = tb_hits_.load(std::memory_order_acquire);
@@ -426,7 +427,7 @@ float Search::GetDrawScore(bool is_odd_depth) const {
 }
 
 namespace {
-inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
                     float draw_score) {
   const auto value = params.GetFpuValue(is_root_node);
   return params.GetFpuAbsolute(is_root_node)
@@ -436,7 +437,7 @@ inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
                     float draw_score) {
 }
 
 // Faster version for if visited_policy is readily available already.
-inline float GetFpu(const SearchParams& params, Node* node, bool is_root_node,
+inline float GetFpu(const SearchParams& params, const Node* node, bool is_root_node,
                     float draw_score, float visited_pol) {
   const auto value = params.GetFpuValue(is_root_node);
   return params.GetFpuAbsolute(is_root_node)
@@ -453,7 +454,10 @@ inline float ComputeCpuct(const SearchParams& params, uint32_t N,
 }
 }  // namespace
 
-std::vector Search::GetVerboseStats(Node* node) const {
+// Ignore the last tuple element when sorting in GetVerboseStats
+static bool operator<(const EdgeAndNode&, const EdgeAndNode&) { return false; }
+
+std::vector Search::GetVerboseStats(const Node* node) const {
   assert(node == root_node_ || node->GetParent() == root_node_);
   const bool is_root = (node == root_node_);
   const bool is_odd_depth = !is_root;
@@ -463,16 +467,14 @@ std::vector Search::GetVerboseStats(const Node* node) const {
   const float cpuct = ComputeCpuct(params_, node->GetN(), is_root);
   const float U_coeff =
       cpuct * std::sqrt(std::max(node->GetChildrenVisits(), 1u));
-  std::vector edges;
-  for (const auto& edge : node->Edges()) edges.push_back(edge);
-
-  std::sort(edges.begin(), edges.end(),
-            [&fpu, &U_coeff, &draw_score](EdgeAndNode a, EdgeAndNode b) {
-              return std::forward_as_tuple(
-                         a.GetN(), a.GetQ(fpu, draw_score) + a.GetU(U_coeff)) <
-                     std::forward_as_tuple(
-                         b.GetN(), b.GetQ(fpu, draw_score) + b.GetU(U_coeff));
-            });
+  std::vector> edges;
+  edges.reserve(node->GetNumEdges());
+  for (const auto& edge : node->Edges()) {
+    edges.emplace_back(edge.GetN(),
+                       edge.GetQ(fpu, draw_score) + edge.GetU(U_coeff),
+                       edge);
+  }
+  std::sort(edges.begin(), edges.end());
 
   auto print = [](auto* oss, auto pre, auto v, auto post, auto w, int p = 0) {
     *oss << pre << std::setw(w) << std::setprecision(p) << v << post;
@@ -544,7 +546,8 @@ std::vector Search::GetVerboseStats(const Node* node) const {
   std::vector infos;
   const auto m_evaluator =
       backend_attributes_.has_mlh ? MEvaluator(params_, node) : MEvaluator();
-  for (const auto& edge : edges) {
+  for (const auto& edge_tuple : edges) {
+    const auto& edge = std::get<2>(edge_tuple);
     float Q = edge.GetQ(fpu, draw_score);
     float M = m_evaluator.GetMUtility(edge, Q);
     std::ostringstream oss;
@@ -622,7 +625,7 @@ void Search::MaybeTriggerStop(const IterationStats& stats,
   // Already responded bestmove, nothing to do here.
   if (bestmove_is_sent_) return;
   // Don't stop when the root node is not yet expanded.
-  if (total_playouts_ + initial_visits_ == 0) return;
+  if (stats.total_nodes == 0) return;
 
   if (!stop_.load(std::memory_order_acquire)) {
     if (stopper_->ShouldStop(stats, hints)) FireStopInternal();
@@ -1104,7 +1107,7 @@ void SearchWorker::RunTasks(int tid) {
           // We got the spin lock, double check we're still in the clear.
           if (nta < tc) {
             id = tasks_taken_.fetch_add(1, std::memory_order_acq_rel);
-            task = &picking_tasks_[id];
+            task = picking_tasks_.data() + id;
            task_taking_started_.store(0, std::memory_order_release);
             break;
           }
@@ -1152,7 +1155,7 @@ void SearchWorker::RunTasks(int tid) {
           break;
         }
       }
-      picking_tasks_[id].complete = true;
+      picking_tasks_.data()[id].complete = true;
       completed_tasks_.fetch_add(1, std::memory_order_acq_rel);
     }
   }
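In the GetVerboseStats hunk above, the sort keys are now std::tuple values of (visits, Q+U, edge). std::tuple's lexicographic operator< only consults the third element on exact ties of the first two, and the file-local operator< on EdgeAndNode that always returns false makes that comparison well-formed while treating tied edges as equivalent. A minimal model of the same trick, with a stand-in edge type:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <tuple>
#include <vector>

struct Edge { const char* move; };  // stand-in for EdgeAndNode
// Required so that tuple comparison compiles; only reached on exact ties,
// where it reports the edges as equivalent.
bool operator<(const Edge&, const Edge&) { return false; }

int main() {
  std::vector<std::tuple<uint32_t, float, Edge>> edges = {
      {10, 0.3f, {"e2e4"}}, {25, 0.1f, {"d2d4"}}, {10, 0.2f, {"g1f3"}}};
  // Lexicographic tuple comparison: visits first, then Q+U, then the edge.
  std::sort(edges.begin(), edges.end());
  assert(std::get<2>(edges.front()).move[0] == 'g');  // (10, 0.2) sorts first
  assert(std::get<0>(edges.back()) == 25);
  return 0;
}
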
@@ -1160,7 +1163,7 @@ void SearchWorker::ExecuteOneIteration() {
   // 1. Initialize internal structures.
-  InitializeIteration(search_->backend_->CreateComputation());
+  InitializeIteration();
 
   if (params_.GetMaxConcurrentSearchers() != 0) {
     std::unique_ptr spin_helper;
@@ -1249,9 +1252,12 @@ void SearchWorker::ExecuteOneIteration() {
 
 // 1. Initialize internal structures.
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-void SearchWorker::InitializeIteration(
-    std::unique_ptr computation) {
-  computation_ = std::move(computation);
+void SearchWorker::InitializeIteration() {
+  LCTRACE_FUNCTION_SCOPE;
+  // Free the old computation before allocating a new one. This works better
+  // when backend caches buffer allocations between computations.
+  computation_.reset();
+  computation_ = search_->backend_->CreateComputation();
   minibatch_.clear();
   minibatch_.reserve(2 * target_minibatch_size_);
 }
@@ -1282,6 +1288,7 @@ int CalculateCollisionsLeft(int64_t nodes, const SearchParams& params) {
 }  // namespace
 
 void SearchWorker::GatherMinibatch() {
+  LCTRACE_FUNCTION_SCOPE;
   // Total number of nodes to process.
   int minibatch_size = 0;
   int cur_n = 0;
@@ -1383,6 +1390,7 @@ void SearchWorker::GatherMinibatch() {
       }
     }
     if (some_ooo) {
+      LCTRACE_FUNCTION_SCOPE;
      SharedMutex::Lock lock(search_->nodes_mutex_);
       for (int i = static_cast(minibatch_.size()) - 1; i >= new_start;
            i--) {
@@ -1407,6 +1415,7 @@ void SearchWorker::GatherMinibatch() {
       }
     }
 
+    LCTRACE_FUNCTION_SCOPE;
     // Check for stop at the end so we have at least one node.
     for (size_t i = new_start; i < minibatch_.size(); i++) {
       auto& picked_node = minibatch_[i];
@@ -1435,6 +1444,7 @@ void SearchWorker::GatherMinibatch() {
 
 void SearchWorker::ProcessPickedTask(int start_idx, int end_idx,
                                      TaskWorkspace* workspace) {
+  LCTRACE_FUNCTION_SCOPE;
   auto& history = workspace->history;
   history = search_->played_history_;
 
@@ -1565,6 +1575,7 @@ void SearchWorker::PickNodesToExtendTask(
     const std::vector& moves_to_base,
     std::vector* receiver,
     TaskWorkspace* workspace) NO_THREAD_SAFETY_ANALYSIS {
+  LCTRACE_FUNCTION_SCOPE;
   // TODO: Bring back pre-cached nodes created outside locks in a way that works
   // with tasks.
   // TODO: pre-reserve visits_to_perform for expected depth and likely maximum
@@ -2004,22 +2015,9 @@ void SearchWorker::ExtendNode(Node* node, int depth,
   node->CreateEdges(legal_moves);
 }
 
-// Returns whether node was already in cache.
-bool SearchWorker::AddNodeToComputation(Node* node) {
-  std::vector moves;
-  if (node && node->HasChildren()) {
-    moves.reserve(node->GetNumEdges());
-    for (const auto& edge : node->Edges()) moves.emplace_back(edge.GetMove());
-  } else {
-    moves = history_.Last().GetBoard().GenerateLegalMoves();
-  }
-  return computation_->AddInput(EvalPosition{history_.GetPositions(), moves},
-                                EvalResultPtr{}) ==
-         BackendComputation::FETCHED_IMMEDIATELY;
-}
-
 // 2b. Copy collisions into shared collisions.
 void SearchWorker::CollectCollisions() {
+  LCTRACE_FUNCTION_SCOPE;
   SharedMutex::Lock lock(search_->nodes_mutex_);
 
   for (const NodeToProcess& node_to_process : minibatch_) {
@@ -2033,6 +2031,7 @@ ...
 // 3. Prefetch into cache.
 // ~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::MaybePrefetchIntoCache() {
+  LCTRACE_FUNCTION_SCOPE;
   // TODO(mooskagh) Remove prefetch into cache if node collisions work well.
   // If there are requests to NN, but the batch is not full, try to prefetch
   // nodes which are likely useful in future.
@@ -2056,13 +2055,17 @@ int SearchWorker::PrefetchIntoCache(Node* node, int budget, bool is_odd_depth) {
   // We are in a leaf, which is not yet being processed.
   if (!node || node->GetNStarted() == 0) {
-    if (AddNodeToComputation(node)) {
+    if (search_->backend_->GetCachedEvaluation(
+            EvalPosition{history_.GetPositions(), {}})) {
       // Make it return 0 to make it not use the slot, so that the function
       // tries hard to find something to cache even among unpopular moves.
       // In practice that slows things down a lot though, as it's not always
       // easy to find what to cache.
       return 1;
     }
+    auto moves = history_.Last().GetBoard().GenerateLegalMoves();
+    computation_->AddInput(EvalPosition{history_.GetPositions(), moves},
+                           EvalResultPtr{});
     return 1;
   }
 
@@ -2146,6 +2149,7 @@ void SearchWorker::RunNNComputation() {
 // 5. Retrieve NN computations (and terminal values) into nodes.
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::FetchMinibatchResults() {
+  LCTRACE_FUNCTION_SCOPE;
   // Populate NN/cached results, or terminal results, into nodes.
   for (auto& node_to_process : minibatch_) {
     FetchSingleNodeResult(&node_to_process);
@@ -2194,6 +2198,7 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process) {
 // 6. Propagate the new nodes' information to all their parents in the tree.
 // ~~~~~~~~~~~~~~
 void SearchWorker::DoBackupUpdate() {
+  LCTRACE_FUNCTION_SCOPE;
   // Nodes mutex for doing node updates.
   SharedMutex::Lock lock(search_->nodes_mutex_);
 
@@ -2287,6 +2292,9 @@ void SearchWorker::DoBackupUpdateSingleNode(
     }
   }
   search_->total_playouts_ += node_to_process.multivisit;
+  if (node_to_process.nn_queried && !node_to_process.is_cache_hit) {
+    search_->network_evaluations_++;
+  }
   search_->cum_depth_ += node_to_process.depth * node_to_process.multivisit;
   search_->max_depth_ = std::max(search_->max_depth_, node_to_process.depth);
 }
@@ -2363,6 +2371,7 @@ bool SearchWorker::MaybeSetBounds(Node* p, float m, int* n_to_fix,
 // 7. Update the Search's status and progress information.
 //~~~~~~~~~~~~~~~~~~~~
 void SearchWorker::UpdateCounters() {
+  LCTRACE_FUNCTION_SCOPE;
   search_->PopulateCommonIterationStats(&iteration_stats_);
   search_->MaybeTriggerStop(iteration_stats_, &latest_time_manager_hints_);
   search_->MaybeOutputInfo();
diff --git a/src/search/classic/search.h b/src/search/classic/search.h
index c267522e04..34293f3173 100644
--- a/src/search/classic/search.h
+++ b/src/search/classic/search.h
@@ -126,7 +126,7 @@ class Search {
 
   // Returns verbose information about given node, as vector of strings.
   // Node can only be root or ponder (depth 1).
-  std::vector GetVerboseStats(Node* node) const;
+  std::vector GetVerboseStats(const Node* node) const;
 
   // Returns the draw score at the root of the search. At odd depth pass true to
   // the value of @is_odd_depth to change the sign of the draw score.
@@ -181,6 +181,7 @@ class Search {
   Edge* last_outputted_info_edge_ GUARDED_BY(nodes_mutex_) = nullptr;
   ThinkingInfo last_outputted_uci_info_ GUARDED_BY(nodes_mutex_);
   int64_t total_playouts_ GUARDED_BY(nodes_mutex_) = 0;
+  int64_t network_evaluations_ GUARDED_BY(nodes_mutex_) = 0;
   int64_t total_batches_ GUARDED_BY(nodes_mutex_) = 0;
   // Maximum search depth = length of longest path taken in PickNodetoExtend.
   uint16_t max_depth_ GUARDED_BY(nodes_mutex_) = 0;
@@ -278,7 +279,7 @@ class SearchWorker {
   // The same operations one by one:
   // 1. Initialize internal structures.
   // @computation is the computation to use on this iteration.
-  void InitializeIteration(std::unique_ptr computation);
+  void InitializeIteration();
 
   // 2. Gather minibatch.
   void GatherMinibatch();
 
@@ -398,15 +399,14 @@ class SearchWorker {
   };
 
   NodeToProcess PickNodeToExtend(int collision_limit);
-  bool AddNodeToComputation(Node* node);
   int PrefetchIntoCache(Node* node, int budget, bool is_odd_depth);
   void DoBackupUpdateSingleNode(const NodeToProcess& node_to_process);
   // Returns whether a node's bounds were set based on its children.
   bool MaybeSetBounds(Node* p, float m, int* n_to_fix, float* v_delta,
                       float* d_delta, float* m_delta) const;
   void PickNodesToExtend(int collision_limit);
-  void PickNodesToExtendTask(Node* starting_point, int collision_limit,
-                             int base_depth,
+  void PickNodesToExtendTask(Node* starting_point, int base_depth,
+                             int collision_limit,
                              const std::vector& moves_to_base,
                              std::vector* receiver,
                              TaskWorkspace* workspace);
@@ -432,7 +432,6 @@ class SearchWorker {
   PositionHistory history_;
   int number_out_of_order_ = 0;
   const SearchParams& params_;
-  std::unique_ptr precached_node_;
   const bool moves_left_support_;
   IterationStats iteration_stats_;
   StoppersHints latest_time_manager_hints_;
diff --git a/src/search/classic/stoppers/common.cc b/src/search/classic/stoppers/common.cc
index 8ca61c4c00..313176bd9c 100644
--- a/src/search/classic/stoppers/common.cc
+++ b/src/search/classic/stoppers/common.cc
@@ -131,8 +131,8 @@ void PopulateCommonUciStoppers(ChainedSearchStopper* stopper,
   }
 
   // "go nodes" stopper.
-  int64_t node_limit = 0;
-  if (params.nodes) {
+  int64_t node_limit = 4000000000;
+  if (params.nodes.has_value()) {
     if (options.Get(kNodesAsPlayoutsId)) {
       stopper->AddStopper(std::make_unique(
          *params.nodes, options.Get(kSmartPruningFactorId) > 0.0f));
@@ -140,8 +140,7 @@ void PopulateCommonUciStoppers(ChainedSearchStopper* stopper,
       node_limit = *params.nodes;
     }
   }
-  // always limit nodes to avoid exceeding the limit 4000000000. That number is
-  // default when node_limit = 0.
+  // Always limit nodes to avoid exceeding the limit 4000000000.
   stopper->AddStopper(std::make_unique(
       node_limit, options.Get(kSmartPruningFactorId) > 0.0f));
 
diff --git a/src/search/classic/stoppers/stoppers.cc b/src/search/classic/stoppers/stoppers.cc
index 896d39a38a..5cf4ecd092 100644
--- a/src/search/classic/stoppers/stoppers.cc
+++ b/src/search/classic/stoppers/stoppers.cc
@@ -97,8 +97,12 @@ MemoryWatchingStopper::MemoryWatchingStopper(int ram_limit_mb, uint32_t nodes,
                                              bool populate_remaining_playouts)
     : VisitsStopper(
-          (ram_limit_mb * 1000000LL - total_memory + avg_node_size * nodes) /
-              avg_node_size,
+          [&]() -> size_t {
+            const auto ram_limit = ram_limit_mb * 1000000LL;
+            const auto nodes_memory = avg_node_size * nodes;
+            if (ram_limit + nodes_memory < total_memory) return 0;
+            return (ram_limit + nodes_memory - total_memory) / avg_node_size;
+          }(),
           populate_remaining_playouts) {
   LOGFILE << "RAM limit " << ram_limit_mb << "MB. Memory allocated is "
           << (total_memory - avg_node_size * nodes) / 1000000
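The lambda above clamps the MemoryWatchingStopper budget at zero when the configured RAM limit plus the memory already attributed to existing nodes is below the total memory in use, instead of computing a negative (and, as size_t, wrapped-around) limit. A worked example with made-up numbers:

#include <cassert>
#include <cstdint>

int64_t NodeBudget(int ram_limit_mb, int64_t total_memory, int64_t avg_node_size,
                   int64_t nodes) {
  const int64_t ram_limit = ram_limit_mb * 1000000LL;
  const int64_t nodes_memory = avg_node_size * nodes;
  if (ram_limit + nodes_memory < total_memory) return 0;  // clamp instead of underflow
  return (ram_limit + nodes_memory - total_memory) / avg_node_size;
}

int main() {
  // 2000 MB limit, 1.5 GB in use, 300 bytes/node, 1M nodes already counted:
  // (2e9 + 3e8 - 1.5e9) / 300 = 2,666,666 further visits allowed.
  assert(NodeBudget(2000, 1500000000, 300, 1000000) == 2666666);
  // If usage already exceeds limit + node memory, the budget is simply 0.
  assert(NodeBudget(1000, 2000000000, 300, 1000000) == 0);
  return 0;
}
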
diff --git a/src/search/classic/stoppers/stoppers.h b/src/search/classic/stoppers/stoppers.h
index 3cc220129c..7232d8000e 100644
--- a/src/search/classic/stoppers/stoppers.h
+++ b/src/search/classic/stoppers/stoppers.h
@@ -54,7 +54,7 @@ class ChainedSearchStopper : public SearchStopper {
 class VisitsStopper : public SearchStopper {
  public:
   VisitsStopper(int64_t limit, bool populate_remaining_playouts)
-      : nodes_limit_(limit ? limit : 4000000000ll),
+      : nodes_limit_(limit),
         populate_remaining_playouts_(populate_remaining_playouts) {}
   int64_t GetVisitsLimit() const { return nodes_limit_; }
   bool ShouldStop(const IterationStats&, StoppersHints*) override;
diff --git a/src/search/classic/wrapper.cc b/src/search/classic/wrapper.cc
index d0935448a6..8ba6f0fd49 100644
--- a/src/search/classic/wrapper.cc
+++ b/src/search/classic/wrapper.cc
@@ -30,7 +30,8 @@
 #include "search/classic/stoppers/factory.h"
 #include "search/register.h"
 #include "search/search.h"
-#include "src/neural/shared_params.h"
+#include "neural/shared_params.h"
+#include "utils/trace.h"
 
 namespace lczero {
 namespace classic {
@@ -97,18 +98,21 @@ MoveList StringsToMovelist(const std::vector& moves,
 }
 
 void ClassicSearch::NewGame() {
+  LCTRACE_FUNCTION_SCOPE;
   search_.reset();
   tree_.reset();
   time_manager_ = MakeTimeManager(*options_);
 }
 
 void ClassicSearch::SetPosition(const GameState& pos) {
+  LCTRACE_FUNCTION_SCOPE;
   if (!tree_) tree_ = std::make_unique();
   const bool is_same_game = tree_->ResetToPosition(pos);
   if (!is_same_game) time_manager_ = MakeTimeManager(*options_);
 }
 
 void ClassicSearch::StartSearch(const GoParams& params) {
+  LCTRACE_FUNCTION_SCOPE;
   auto forwarder = std::make_unique(uci_responder_);
   if (options_->Get